def write(self, df, path, compression='snappy', index=None,
          partition_cols=None, **kwargs):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.
    if 'partition_on' in kwargs and partition_cols is not None:
        raise ValueError("Cannot use both partition_on and "
                         "partition_cols. Use partition_cols for "
                         "partitioning data")
    elif 'partition_on' in kwargs:
        partition_cols = kwargs.pop('partition_on')

    if partition_cols is not None:
        kwargs['file_scheme'] = 'hive'

    if is_s3_url(path):
        # path is s3:// so we need to open the s3file in 'wb' mode.
        # TODO: Support 'ab'
        path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
        # And pass the opened s3file to the fastparquet internal impl.
        kwargs['open_with'] = lambda path, _: path
    else:
        path, _, _, _ = get_filepath_or_buffer(path)

    with catch_warnings(record=True):
        self.api.write(path, df, compression=compression,
                       write_index=index, partition_on=partition_cols,
                       **kwargs)
def read(self, path, columns=None, **kwargs):
    if is_s3_url(path):
        # When path is s3:// an S3File is returned.
        # We need to retain the original path (str) while also
        # passing the S3File().open function to the fastparquet impl.
        s3, _, _ = get_filepath_or_buffer(path)
        parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open)
    else:
        path, _, _ = get_filepath_or_buffer(path)
        parquet_file = self.api.ParquetFile(path)

    return parquet_file.to_pandas(columns=columns, **kwargs)
def read(self, path, columns=None, **kwargs):
    if is_s3_url(path):
        from pandas.io.s3 import get_file_and_filesystem

        # When path is s3:// an S3File is returned.
        # We need to retain the original path (str) while also
        # passing the S3File().open function to the fastparquet impl.
        s3, filesystem = get_file_and_filesystem(path)
        try:
            parquet_file = self.api.ParquetFile(path,
                                                open_with=filesystem.open)
        finally:
            s3.close()
    else:
        path, _, _, _ = get_filepath_or_buffer(path)
        parquet_file = self.api.ParquetFile(path)

    return parquet_file.to_pandas(columns=columns, **kwargs)
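# Usage sketch (assumption, not part of the pandas source): the public
# pd.read_parquet entry point with engine="fastparquet" dispatches to the
# read() method above. The bucket/key and column names below are
# hypothetical placeholders.
import pandas as pd

subset = pd.read_parquet(
    "s3://my-bucket/data.parquet",   # hypothetical S3 path
    engine="fastparquet",            # routes to the fastparquet impl
    columns=["a", "b"],              # forwarded as `columns` to read()
)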
def write(
    self,
    df: DataFrame,
    path,
    compression="snappy",
    index=None,
    partition_cols=None,
    **kwargs,
):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.
    if "partition_on" in kwargs and partition_cols is not None:
        raise ValueError(
            "Cannot use both partition_on and "
            "partition_cols. Use partition_cols for "
            "partitioning data"
        )
    elif "partition_on" in kwargs:
        partition_cols = kwargs.pop("partition_on")

    if partition_cols is not None:
        kwargs["file_scheme"] = "hive"

    if is_s3_url(path) or is_gcs_url(path):
        # if path is s3:// or gs:// we need to open the file in 'wb' mode.
        # TODO: Support 'ab'
        path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
        # And pass the opened file to the fastparquet internal impl.
        kwargs["open_with"] = lambda path, _: path
    else:
        path, _, _, _ = get_filepath_or_buffer(path)

    with catch_warnings(record=True):
        self.api.write(
            path,
            df,
            compression=compression,
            write_index=index,
            partition_on=partition_cols,
            **kwargs,
        )
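# Usage sketch (assumption, not part of the pandas source): DataFrame.to_parquet
# with engine="fastparquet" ends up in the write() method above; partition_cols
# is mapped to fastparquet's partition_on with file_scheme="hive". The path and
# column names are hypothetical placeholders.
import pandas as pd

df = pd.DataFrame({"year": [2019, 2020], "value": [1.0, 2.0]})
df.to_parquet(
    "dataset_dir",                   # hypothetical local path; hive layout when partitioned
    engine="fastparquet",
    compression="snappy",
    partition_cols=["year"],         # maps to partition_on + file_scheme="hive" above
)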
def write(self, df, path, compression='snappy', **kwargs):
    self.validate_dataframe(df)
    # thriftpy/protocol/compact.py:339:
    # DeprecationWarning: tostring() is deprecated.
    # Use tobytes() instead.
    if is_s3_url(path):
        # path is s3:// so we need to open the s3file in 'wb' mode.
        # TODO: Support 'ab'
        path, _, _ = get_filepath_or_buffer(path, mode='wb')
        # And pass the opened s3file to the fastparquet internal impl.
        kwargs['open_with'] = lambda path, _: path
    else:
        path, _, _ = get_filepath_or_buffer(path)

    with catch_warnings(record=True):
        self.api.write(path, df, compression=compression, **kwargs)
def test_is_s3_url(self): assert is_s3_url("s3://pandas/somethingelse.com") assert not is_s3_url("s4://pandas/somethingelse.com")
def test_is_s3_url(self): assert is_s3_url("s3://qq_pandas/somethingelse.com") assert not is_s3_url("s4://qq_pandas/somethingelse.com")