def open_input_file(self, path):
    """Open *path* for random-access binary reading as a pyarrow file.

    Raises FileNotFoundError when the wrapped filesystem does not report
    *path* as an existing file.
    """
    from pyarrow import PythonFile

    if not self.fs.isfile(path):
        raise FileNotFoundError(path)
    raw = self.fs.open(path, mode="rb")
    return PythonFile(raw, mode="r")
def write_parquet(self, table: Table, object_uri: str, metadata: dict[str, str]):
    """Serialize *table* to parquet in memory and upload it to S3.

    The upload target is resolved from *object_uri*; *metadata* is attached
    to the S3 object as user metadata. Progress is logged before and after
    the upload.
    """
    logger.info(
        f"Attempting to upload: {object_uri}",
        extra={
            "event": "ATTEMPTING_UPLOAD_PARQUET_TO_S3",
            "object_uri": object_uri
        },
    )
    target = self._object_from_uri(object_uri)
    # Serialize the table into an in-memory buffer, then rewind so the
    # upload reads from the start.
    payload = BytesIO()
    sink = PythonFile(payload)
    parquet.write_table(table, sink)
    payload.seek(0)
    target.put(Body=payload, Metadata=metadata)
    logger.info(
        f"Successfully uploaded to: {object_uri}",
        extra={
            "event": "SUCCESSFULLY_UPLOADED_PARQUET_TO_S3",
            "object_uri": object_uri
        },
    )
def open_input_file(self, path):
    """Open *path* through the local file cache, optionally wrapped for pyarrow.

    On a cache miss a fresh CachedFile is created and stored; on a hit a new
    CachedFile is built that shares the previous entry's data/mask files
    (the new instance is deliberately not written back into the cache).
    When ``self.for_arrow`` is set, the result is wrapped in a FileProxy and
    returned as a pyarrow PythonFile.
    """
    from pyarrow import PythonFile

    def real_open():
        return self.fs.open_input_file(path)

    full_path = f'{self.scheme}://{path}'
    # TODO: we may wait to cache the mmapped file
    previous = self._file_cache.get(full_path)
    if previous is None:
        f = CachedFile(real_open, full_path, read_as_buffer=not self.for_arrow)
        self._file_cache[full_path] = f
    else:
        # Reuse the already-downloaded data/mask files from the cached entry.
        f = CachedFile(real_open, full_path,
                       data_file=previous.data_file,
                       mask_file=previous.mask_file,
                       read_as_buffer=not self.for_arrow)
    if not self.for_arrow:
        return f
    proxy = vaex.file.FileProxy(f, full_path, None)
    return PythonFile(proxy, mode="r")
def open_append_stream(self, path):
    """Open *path* for binary appending, wrapped as a writable pyarrow file."""
    from pyarrow import PythonFile

    appendable = self.fs.open(path, mode="ab")
    return PythonFile(appendable, mode="w")
def open_output_stream(self, path, metadata):
    """Open *path* for binary writing (truncating), wrapped for pyarrow.

    NOTE(review): *metadata* is accepted but unused here — presumably the
    signature is dictated by the pyarrow filesystem handler interface;
    confirm before changing it.
    """
    from pyarrow import PythonFile

    writable = self.fs.open(path, mode="wb")
    return PythonFile(writable, mode="w")