Example #1
    def write(self, df, path, compression='snappy', index=None,
              partition_cols=None, **kwargs):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if 'partition_on' in kwargs and partition_cols is not None:
            raise ValueError("Cannot use both partition_on and "
                             "partition_cols. Use partition_cols for "
                             "partitioning data")
        elif 'partition_on' in kwargs:
            partition_cols = kwargs.pop('partition_on')

        if partition_cols is not None:
            kwargs['file_scheme'] = 'hive'

        if is_s3_url(path):
            # path is s3:// so we need to open the s3file in 'wb' mode.
            # TODO: Support 'ab'

            path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
            # And pass the opened s3file to the fastparquet internal impl.
            kwargs['open_with'] = lambda path, _: path
        else:
            path, _, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(path, df, compression=compression,
                           write_index=index, partition_on=partition_cols,
                           **kwargs)
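
This writer backs DataFrame.to_parquet when the fastparquet engine is selected; partition_cols (or the legacy partition_on alias popped from kwargs) switches fastparquet to its hive file scheme. A minimal usage sketch, assuming pandas and fastparquet are installed and "out_parquet" is a hypothetical output directory:

    import pandas as pd

    df = pd.DataFrame({"year": [2019, 2019, 2020], "value": [1.0, 2.0, 3.0]})
    # Hive-style layout: one subdirectory per distinct "year" value.
    df.to_parquet("out_parquet", engine="fastparquet", partition_cols=["year"])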
Example #2
    def read(self, path, columns=None, **kwargs):
        if is_s3_url(path):
            # When path is s3://, an S3File is returned.
            # We need to retain the original path (str) while also
            # passing the S3File().open function to the fastparquet impl.
            s3, _, _ = get_filepath_or_buffer(path)
            parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
        else:
            path, _, _ = get_filepath_or_buffer(path)
            parquet_file = self.api.ParquetFile(path)

        return parquet_file.to_pandas(columns=columns, **kwargs)
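
The read side is reached through pandas.read_parquet; the columns argument is forwarded to ParquetFile.to_pandas, so only the requested columns are deserialized. A minimal sketch, assuming the hypothetical "out_parquet" dataset written above:

    import pandas as pd

    # Column pruning: fastparquet materializes only the listed columns.
    df = pd.read_parquet("out_parquet", engine="fastparquet", columns=["value"])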
Example #3
    def read(self, path, columns=None, **kwargs):
        if is_s3_url(path):
            from pandas.io.s3 import get_file_and_filesystem

            # When path is s3://, an S3File is returned.
            # We need to retain the original path (str) while also
            # passing the S3File().open function to the fastparquet impl.
            s3, filesystem = get_file_and_filesystem(path)
            try:
                parquet_file = self.api.ParquetFile(path, open_with=filesystem.open)
            finally:
                s3.close()
        else:
            path, _, _, _ = get_filepath_or_buffer(path)
            parquet_file = self.api.ParquetFile(path)

        return parquet_file.to_pandas(columns=columns, **kwargs)
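
Unlike the previous variant, this one closes the temporary S3File in a finally block and hands fastparquet the filesystem's own open callable, so the handle obtained merely to resolve the path is not leaked. A sketch of how the S3 branch is exercised, assuming s3fs is installed, credentials are configured, and the bucket and key are hypothetical:

    import pandas as pd

    # is_s3_url(path) is True here, so the S3 branch above is taken.
    df = pd.read_parquet("s3://my-bucket/data.parquet", engine="fastparquet")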
Example #4
    def write(
        self,
        df: DataFrame,
        path,
        compression="snappy",
        index=None,
        partition_cols=None,
        **kwargs,
    ):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if "partition_on" in kwargs and partition_cols is not None:
            raise ValueError(
                "Cannot use both partition_on and "
                "partition_cols. Use partition_cols for "
                "partitioning data"
            )
        elif "partition_on" in kwargs:
            partition_cols = kwargs.pop("partition_on")

        if partition_cols is not None:
            kwargs["file_scheme"] = "hive"

        if is_s3_url(path) or is_gcs_url(path):
            # if path is s3:// or gs:// we need to open the file in 'wb' mode.
            # TODO: Support 'ab'

            path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
            # And pass the opened file to the fastparquet internal impl.
            kwargs["open_with"] = lambda path, _: path
        else:
            path, _, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(
                path,
                df,
                compression=compression,
                write_index=index,
                partition_on=partition_cols,
                **kwargs,
            )
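
This variant extends the writer to Google Cloud Storage: is_gcs_url routes gs:// paths through the same open-in-'wb'-and-pass-through trick used for S3. A sketch, assuming gcsfs is installed and the bucket is hypothetical:

    import pandas as pd

    df = pd.DataFrame({"value": [1.0, 2.0]})
    # gs:// triggers the is_gcs_url branch; the opened file object is
    # handed straight to fastparquet via the open_with hook.
    df.to_parquet("gs://my-bucket/out.parquet", engine="fastparquet")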
Example #5
    def write(self, df, path, compression='snappy', **kwargs):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if is_s3_url(path):
            # path is s3:// so we need to open the s3file in 'wb' mode.
            # TODO: Support 'ab'

            path, _, _ = get_filepath_or_buffer(path, mode='wb')
            # And pass the opened s3file to the fastparquet internal impl.
            kwargs['open_with'] = lambda path, _: path
        else:
            path, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(path, df, compression=compression, **kwargs)
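
This is the older form of the writer, before partition support was added: compression defaults to 'snappy' and is passed straight through to fastparquet. A sketch of overriding the codec, with a hypothetical output path (gzip is handled by the standard library, whereas snappy needs the python-snappy package):

    import pandas as pd

    df = pd.DataFrame({"value": [1.0, 2.0, 3.0]})
    df.to_parquet("out.parquet", engine="fastparquet", compression="gzip")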
Example #6
    def write(self, df, path, compression='snappy', **kwargs):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if is_s3_url(path):
            # path is s3:// so we need to open the s3file in 'wb' mode.
            # TODO: Support 'ab'

            path, _, _ = get_filepath_or_buffer(path, mode='wb')
            # And pass the opened s3file to the fastparquet internal impl.
            kwargs['open_with'] = lambda path, _: path
        else:
            path, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(path, df,
                           compression=compression, **kwargs)
Example #7
 def test_is_s3_url(self):
     assert is_s3_url("s3://pandas/somethingelse.com")
     assert not is_s3_url("s4://pandas/somethingelse.com")
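
These tests pin down the contract of is_s3_url: accept s3:// URLs, reject other schemes. A minimal implementation sketch consistent with the tests; the body below is an assumption, not the library source (pandas' helper also accepted the s3n and s3a scheme variants):

    from urllib.parse import urlparse

    def is_s3_url(url) -> bool:
        # Assumed behavior: parse the URL and check for an S3-family scheme.
        try:
            return urlparse(url).scheme in ("s3", "s3n", "s3a")
        except (TypeError, AttributeError):
            return False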
Example #8
 def test_is_s3_url(self):
     assert is_s3_url("s3://qq_pandas/somethingelse.com")
     assert not is_s3_url("s4://qq_pandas/somethingelse.com")