def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
    use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
    if use_nullable_dtypes:
        raise ValueError(
            "The 'use_nullable_dtypes' argument is not supported for the "
            "fastparquet engine"
        )
    path = stringify_path(path)
    parquet_kwargs = {}
    handles = None
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")
        parquet_kwargs["open_with"] = lambda path, _: fsspec.open(
            path, "rb", **(storage_options or {})
        ).open()
    elif isinstance(path, str) and not os.path.isdir(path):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(path, "rb", is_text=False)
        path = handles.handle
    parquet_file = self.api.ParquetFile(path, **parquet_kwargs)

    result = parquet_file.to_pandas(columns=columns, **kwargs)

    if handles is not None:
        handles.close()
    return result

def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
    if is_fsspec_url(path) and "filesystem" not in kwargs:
        import_optional_dependency("fsspec")
        import fsspec.core

        fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
        should_close = False
    else:
        if storage_options:
            raise ValueError(
                "storage_options passed with buffer or non-fsspec filepath"
            )
        fs = kwargs.pop("filesystem", None)
        should_close = False
        path = _expand_user(path)

    if not fs:
        path, _, _, should_close = get_filepath_or_buffer(path)

    kwargs["use_pandas_metadata"] = True
    result = self.api.parquet.read_table(
        path, columns=columns, filesystem=fs, **kwargs
    ).to_pandas()
    if should_close:
        path.close()

    return result

def _get_path_or_handle(
    path: FilePathOrBuffer,
    fs: Any,
    storage_options: StorageOptions = None,
    mode: str = "rb",
    is_dir: bool = False,
) -> tuple[FilePathOrBuffer, IOHandles | None, Any]:
    """File handling for PyArrow."""
    path_or_handle = stringify_path(path)
    if is_fsspec_url(path_or_handle) and fs is None:
        fsspec = import_optional_dependency("fsspec")
        fs, path_or_handle = fsspec.core.url_to_fs(
            path_or_handle, **(storage_options or {})
        )
    elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
        # can't write to a remote url
        # without making use of fsspec at the moment
        raise ValueError("storage_options passed with buffer, or non-supported URL")

    handles = None
    if (
        not fs
        and not is_dir
        and isinstance(path_or_handle, str)
        and not os.path.isdir(path_or_handle)
    ):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(
            path_or_handle, mode, is_text=False, storage_options=storage_options
        )
        fs = None
        path_or_handle = handles.handle
    return path_or_handle, handles, fs

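# A minimal caller sketch (hypothetical, not the pandas implementation): whatever
# _get_path_or_handle returns in `handles` must be closed once pyarrow has
# consumed the data.
import pyarrow.parquet as pq


def _read_table_sketch(path, storage_options=None):
    path_or_handle, handles, filesystem = _get_path_or_handle(
        path, fs=None, storage_options=storage_options, mode="rb"
    )
    try:
        return pq.read_table(path_or_handle, filesystem=filesystem).to_pandas()
    finally:
        if handles is not None:
            handles.close()
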
def _get_data_from_filepath(self, filepath_or_buffer):
    """
    The function read_json accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. JSON string

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    # if it is a string but the file does not exist, it might be a JSON string
    filepath_or_buffer = stringify_path(filepath_or_buffer)
    if (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        self.handles = get_handle(
            filepath_or_buffer,
            "r",
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
            errors=self.encoding_errors,
        )
        filepath_or_buffer = self.handles.handle

    return filepath_or_buffer

def test_is_fsspec_url():
    assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
    assert icom.is_fsspec_url("gs://pandas/somethingelse.com")
    # the following is the only remote URL that is handled without fsspec
    assert not icom.is_fsspec_url("http://pandas/somethingelse.com")
    assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
    assert not icom.is_fsspec_url("/local/path")
    assert not icom.is_fsspec_url("relative/local/path")
    # fsspec URL in string should not be recognized
    assert not icom.is_fsspec_url("this is not fsspec://url")
    assert not icom.is_fsspec_url("{'url': 'gs://pandas/somethingelse.com'}")
    # accept everything that conforms to RFC 3986 schema
    assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")

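# A rough sketch of the check the test above exercises.  The real function lives
# in pandas.io.common; the regex here is an assumption that mirrors the behaviour
# the test describes (any RFC 3986 scheme is accepted, except http/https).
import re

_RFC_3986_PATTERN_SKETCH = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://")


def is_fsspec_url_sketch(url) -> bool:
    # URL-like string with a scheme, but not a plain http(s) URL
    return (
        isinstance(url, str)
        and bool(_RFC_3986_PATTERN_SKETCH.match(url))
        and not url.startswith(("http://", "https://"))
    )
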
def read(self, path, columns=None, **kwargs):
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")
        open_with = lambda path, _: fsspec.open(path, "rb").open()
        parquet_file = self.api.ParquetFile(path, open_with=open_with)
    else:
        path, _, _, _ = get_filepath_or_buffer(path)
        parquet_file = self.api.ParquetFile(path)

    return parquet_file.to_pandas(columns=columns, **kwargs)

def _check_source_format(self, src):
    src = self.source
    if is_url(src):
        fmt = 'url'
    elif is_file_like(src):
        fmt = 'filelike'
    elif is_fsspec_url(src):
        fmt = 's3'
    else:
        fmt = 'invalid'
    return fmt

def test_is_fsspec_url():
    assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
    assert icom.is_fsspec_url("gs://pandas/somethingelse.com")
    # the following is the only remote URL that is handled without fsspec
    assert not icom.is_fsspec_url("http://pandas/somethingelse.com")
    assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
    assert not icom.is_fsspec_url("/local/path")
    assert not icom.is_fsspec_url("relative/local/path")

def write(
    self,
    df: DataFrame,
    path: FilePathOrBuffer[AnyStr],
    compression: Optional[str] = "snappy",
    index: Optional[bool] = None,
    storage_options: StorageOptions = None,
    partition_cols: Optional[List[str]] = None,
    **kwargs,
):
    self.validate_dataframe(df)

    from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)}
    if index is not None:
        from_pandas_kwargs["preserve_index"] = index

    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)

    path = stringify_path(path)
    # get_handle could be used here (for write_table, not for write_to_dataset)
    # but it would complicate the code.
    if is_fsspec_url(path) and "filesystem" not in kwargs:
        # make fsspec instance, which pyarrow will use to open paths
        fsspec = import_optional_dependency("fsspec")
        fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
        kwargs["filesystem"] = fs
    elif storage_options:
        raise ValueError(
            "storage_options passed with file object or non-fsspec file path"
        )

    if partition_cols is not None:
        # writes to multiple files under the given path
        self.api.parquet.write_to_dataset(
            table,
            path,
            compression=compression,
            partition_cols=partition_cols,
            **kwargs,
        )
    else:
        # write to single output file
        self.api.parquet.write_table(table, path, compression=compression, **kwargs)

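# Usage sketch: writing through the fsspec branch above.  The destination URL and
# credentials are hypothetical; storage_options are expanded into
# fsspec.core.url_to_fs() to build the filesystem pyarrow writes through.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_parquet(
    "gs://my-bucket/out.parquet",        # hypothetical fsspec URL
    engine="pyarrow",
    storage_options={"token": "anon"},   # forwarded to fsspec / gcsfs
)
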
def write(
    self,
    df: DataFrame,
    path,
    compression="snappy",
    index=None,
    partition_cols=None,
    storage_options: StorageOptions = None,
    **kwargs,
):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.

    if "partition_on" in kwargs and partition_cols is not None:
        raise ValueError(
            "Cannot use both partition_on and "
            "partition_cols. Use partition_cols for partitioning data"
        )
    elif "partition_on" in kwargs:
        partition_cols = kwargs.pop("partition_on")

    if partition_cols is not None:
        kwargs["file_scheme"] = "hive"

    # cannot use get_handle as write() does not accept file buffers
    path = stringify_path(path)
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")
        # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
        kwargs["open_with"] = lambda path, _: fsspec.open(
            path, "wb", **(storage_options or {})
        ).open()
    elif storage_options:
        raise ValueError(
            "storage_options passed with file object or non-fsspec file path"
        )

    with catch_warnings(record=True):
        self.api.write(
            path,
            df,
            compression=compression,
            write_index=index,
            partition_on=partition_cols,
            **kwargs,
        )

def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
    parquet_kwargs = {}
    use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
    # Technically works with 0.7.0, but was incorrect
    # so lets just require 0.7.1
    if Version(self.api.__version__) >= Version("0.7.1"):
        # Need to set even for use_nullable_dtypes = False,
        # since our defaults differ
        parquet_kwargs["pandas_nulls"] = use_nullable_dtypes
    else:
        if use_nullable_dtypes:
            raise ValueError(
                "The 'use_nullable_dtypes' argument is not supported for the "
                "fastparquet engine for fastparquet versions less than 0.7.1"
            )
    path = stringify_path(path)
    handles = None
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")

        if Version(self.api.__version__) > Version("0.6.1"):
            parquet_kwargs["fs"] = fsspec.open(
                path, "rb", **(storage_options or {})
            ).fs
        else:
            parquet_kwargs["open_with"] = lambda path, _: fsspec.open(
                path, "rb", **(storage_options or {})
            ).open()
    elif isinstance(path, str) and not os.path.isdir(path):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(
            path, "rb", is_text=False, storage_options=storage_options
        )
        path = handles.handle

    parquet_file = self.api.ParquetFile(path, **parquet_kwargs)

    result = parquet_file.to_pandas(columns=columns, **kwargs)

    if handles is not None:
        handles.close()
    return result

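# Usage sketch: a remote read that exercises the fsspec branch above.  The
# bucket/path and the anonymous-access option are hypothetical; storage_options
# are passed straight through to fsspec.open().
import pandas as pd

df = pd.read_parquet(
    "s3://my-bucket/data.parquet",    # hypothetical fsspec URL
    engine="fastparquet",
    storage_options={"anon": True},   # forwarded to fsspec / s3fs
)
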
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
    ) and (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                # error: Incompatible types in assignment (expression has type
                # "Union[str, IO[str]]", variable has type "Union[Union[str,
                # PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")
                handle_obj.handle.read()  # type: ignore[assignment]
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer

def write(
    self,
    df: DataFrame,
    path: FilePathOrBuffer[AnyStr],
    compression: Optional[str] = "snappy",
    index: Optional[bool] = None,
    partition_cols: Optional[List[str]] = None,
    **kwargs,
):
    self.validate_dataframe(df)

    from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)}
    if index is not None:
        from_pandas_kwargs["preserve_index"] = index

    table = self.api.Table.from_pandas(df, **from_pandas_kwargs)

    if is_fsspec_url(path) and "filesystem" not in kwargs:
        # make fsspec instance, which pyarrow will use to open paths
        import_optional_dependency("fsspec")
        import fsspec.core

        fs, path = fsspec.core.url_to_fs(path)
        kwargs["filesystem"] = fs
    else:
        path = _expand_user(path)

    if partition_cols is not None:
        # writes to multiple files under the given path
        self.api.parquet.write_to_dataset(
            table,
            path,
            compression=compression,
            partition_cols=partition_cols,
            **kwargs,
        )
    else:
        # write to single output file
        self.api.parquet.write_table(table, path, compression=compression, **kwargs)

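# Usage sketch: partition_cols routes the call through write_to_dataset above,
# producing a hive-style directory tree.  The output directory and column names
# are hypothetical.
import pandas as pd

df = pd.DataFrame({"year": [2020, 2021], "value": [1.0, 2.0]})
df.to_parquet("dataset_dir", engine="pyarrow", partition_cols=["year"])
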
def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
) -> DataFrame:
    parquet_kwargs: dict[str, Any] = {}
    use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
    if Version(self.api.__version__) >= Version("0.7.1"):
        # We are disabling nullable dtypes for fastparquet pending discussion
        parquet_kwargs["pandas_nulls"] = False
    if use_nullable_dtypes:
        raise ValueError(
            "The 'use_nullable_dtypes' argument is not supported for the "
            "fastparquet engine"
        )
    path = stringify_path(path)
    handles = None
    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")

        if Version(self.api.__version__) > Version("0.6.1"):
            parquet_kwargs["fs"] = fsspec.open(
                path, "rb", **(storage_options or {})
            ).fs
        else:
            parquet_kwargs["open_with"] = lambda path, _: fsspec.open(
                path, "rb", **(storage_options or {})
            ).open()
    elif isinstance(path, str) and not os.path.isdir(path):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(
            path, "rb", is_text=False, storage_options=storage_options
        )
        path = handles.handle

    try:
        parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
        return parquet_file.to_pandas(columns=columns, **kwargs)
    finally:
        if handles is not None:
            handles.close()

def write(
    self,
    df: DataFrame,
    path,
    compression="snappy",
    index=None,
    partition_cols=None,
    **kwargs,
):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.

    if "partition_on" in kwargs and partition_cols is not None:
        raise ValueError(
            "Cannot use both partition_on and "
            "partition_cols. Use partition_cols for partitioning data"
        )
    elif "partition_on" in kwargs:
        partition_cols = kwargs.pop("partition_on")

    if partition_cols is not None:
        kwargs["file_scheme"] = "hive"

    if is_fsspec_url(path):
        fsspec = import_optional_dependency("fsspec")
        # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
        kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open()
    else:
        path, _, _, _ = get_filepath_or_buffer(path)

    with catch_warnings(record=True):
        self.api.write(
            path,
            df,
            compression=compression,
            write_index=index,
            partition_on=partition_cols,
            **kwargs,
        )

def _get_data_from_filepath(self, filepath_or_buffer):
    """
    The function read_json accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. JSON string

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.

    It raises FileNotFoundError if the input is a string ending in one of
    .json, .json.gz, .json.bz2, etc. but no such file exists.
    """
    # if it is a string but the file does not exist, it might be a JSON string
    filepath_or_buffer = stringify_path(filepath_or_buffer)
    if (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        self.handles = get_handle(
            filepath_or_buffer,
            "r",
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
            errors=self.encoding_errors,
        )
        filepath_or_buffer = self.handles.handle
    elif (
        isinstance(filepath_or_buffer, str)
        and filepath_or_buffer.lower().endswith(
            (".json",) + tuple(f".json{c}" for c in _extension_to_compression)
        )
        and not file_exists(filepath_or_buffer)
    ):
        raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")

    return filepath_or_buffer

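# Usage sketch of the input types handled above; "data.json" is a hypothetical
# file and would raise FileNotFoundError if it does not exist, exactly as the
# docstring describes.
from io import StringIO

import pandas as pd

df_from_buffer = pd.read_json(StringIO('[{"a": 1}, {"a": 2}]'))  # file-like object
# df_from_path = pd.read_json("data.json")                       # filepath
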
def get_data_from_filepath(
    filepath_or_buffer,
    encoding,
    compression,
    storage_options,
) -> Union[str, bytes, Buffer]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        and not filepath_or_buffer.startswith(("<?xml", "<"))
    ) and (
        not isinstance(filepath_or_buffer, str)
        or is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                handle_obj.handle.read()
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer

def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
    """
    Iterparse xml nodes.

    This method will read in local disk, decompressed XML files for elements
    and underlying descendants using iterparse, a method to iterate through
    an XML tree without holding entire XML tree in memory.

    Raises
    ------
    TypeError
        * If `iterparse` is not a dict or its dict value is not list-like.

    ParserError
        * If `path_or_buffer` is not a physical, decompressed file on disk.
        * If no data is returned from selected items in `iterparse`.

    Notes
    -----
    Namespace URIs will be removed from return node values. Also, elements
    with missing children or attributes in submitted list will have optional
    keys filled with None values.
    """
    dicts: list[dict[str, str | None]] = []
    row: dict[str, str | None] | None = None

    if not isinstance(self.iterparse, dict):
        raise TypeError(
            f"{type(self.iterparse).__name__} is not a valid type for iterparse"
        )

    row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
    if not is_list_like(self.iterparse[row_node]):
        raise TypeError(
            f"{type(self.iterparse[row_node])} is not a valid type "
            "for value in iterparse"
        )

    if (
        not isinstance(self.path_or_buffer, str)
        or is_url(self.path_or_buffer)
        or is_fsspec_url(self.path_or_buffer)
        or self.path_or_buffer.startswith(("<?xml", "<"))
        or infer_compression(self.path_or_buffer, "infer") is not None
    ):
        raise ParserError(
            "iterparse is designed for large XML files that are fully extracted on "
            "local disk and not as compressed files or online sources."
        )

    for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
        curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

        if event == "start":
            if curr_elem == row_node:
                row = {}

        if row is not None:
            if self.names:
                for col, nm in zip(self.iterparse[row_node], self.names):
                    if curr_elem == col:
                        elem_val = elem.text.strip() if elem.text else None
                        if row.get(nm) != elem_val and nm not in row:
                            row[nm] = elem_val
                    if col in elem.attrib:
                        if elem.attrib[col] not in row.values() and nm not in row:
                            row[nm] = elem.attrib[col]
            else:
                for col in self.iterparse[row_node]:
                    if curr_elem == col:
                        row[col] = elem.text.strip() if elem.text else None
                    if col in elem.attrib:
                        row[col] = elem.attrib[col]

        if event == "end":
            if curr_elem == row_node and row is not None:
                dicts.append(row)
                row = None

            elem.clear()
            if hasattr(elem, "getprevious"):
                while (
                    elem.getprevious() is not None and elem.getparent() is not None
                ):
                    del elem.getparent()[0]

    if dicts == []:
        raise ParserError("No result from selected items in iterparse.")

    keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
    dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

    if self.names:
        dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

    return dicts

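# Usage sketch: iterparse as exposed through pandas.read_xml, which calls the
# method above.  The file name, row element, and field names are hypothetical;
# the file must be an uncompressed XML document on local disk, as enforced above.
import pandas as pd

df = pd.read_xml(
    "books.xml",                                      # hypothetical local file
    iterparse={"book": ["title", "year", "author"]},  # row element -> fields
)
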
def _read(cls, path, engine, columns, **kwargs):
    """
    Load a parquet object from the file path, returning a query compiler.

    Parameters
    ----------
    path : str, path object or file-like object
        The filepath of the parquet file in local filesystem or hdfs.
    engine : str
        Parquet library to use (only 'PyArrow' is supported for now).
    columns : list
        If not None, only these columns will be read from the file.
    **kwargs : dict
        Keyword arguments.

    Returns
    -------
    BaseQueryCompiler
        A new Query Compiler.

    Notes
    -----
    ParquetFile API is used. Please refer to the documentation here
    https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetDataset
    from modin.pandas.io import PQ_INDEX_REGEX

    if isinstance(path, str) and os.path.isdir(path):
        partitioned_columns = set()
        # We do a tree walk of the path directory because partitioned
        # parquet directories have a unique column at each directory level.
        # Thus, we can use os.walk(), which does a dfs search, to walk
        # through the different columns that the data is partitioned on
        for (root, dir_names, files) in os.walk(path):
            if dir_names:
                partitioned_columns.add(dir_names[0].split("=")[0])
            if files:
                # Metadata files, git files, .DSStore
                if files[0][0] == ".":
                    continue
                break
        partitioned_columns = list(partitioned_columns)
        if len(partitioned_columns):
            ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
            return cls.single_worker_read(
                path, engine=engine, columns=columns, **kwargs
            )

    if not columns:
        import fsspec.core
        from pandas.io.common import is_fsspec_url

        fs, path_ = (
            fsspec.core.url_to_fs(path, **(kwargs.get("storage_options") or {}))
            if is_fsspec_url(path)
            else (None, path)
        )
        dataset = ParquetDataset(path_, filesystem=fs, use_legacy_dataset=False)
        column_names = dataset.schema.names

        if dataset.schema.pandas_metadata is not None:
            index_columns = dataset.schema.pandas_metadata.get("index_columns", [])
            column_names = [c for c in column_names if c not in index_columns]
        columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
    return cls.build_query_compiler(path, columns, **kwargs)

def read(
    self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
    path = stringify_path(path)
    handles = None
    fs = kwargs.pop("filesystem", None)
    if is_fsspec_url(path) and fs is None:
        fsspec = import_optional_dependency("fsspec")
        fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
    elif storage_options:
        raise ValueError("storage_options passed with buffer or non-fsspec filepath")
    if not fs and isinstance(path, str) and not os.path.isdir(path):
        # use get_handle only when we are very certain that it is not a directory
        # fsspec resources can also point to directories
        # this branch is used for example when reading from non-fsspec URLs
        handles = get_handle(path, "rb", is_text=False)
        path = handles.handle

    kwargs["use_pandas_metadata"] = True
    result = self.api.parquet.read_table(
        path, columns=columns, filesystem=fs, **kwargs
    ).to_pandas()

    if handles is not None:
        handles.close()

    return result