def write_woodwork_table(dataframe, path, profile_name=None, **kwargs):
    """Serialize a Woodwork table and write it to disk or to an S3 path.

    Args:
        dataframe (pd.DataFrame, dd.DataFrame, ks.DataFrame): DataFrame with
            Woodwork typing information initialized.
        path (str): Location on disk (or S3 path) to write the Woodwork table.
        profile_name (str, bool): AWS profile used to write to S3. Defaults to
            None (search for AWS credentials); set to False for an anonymous
            profile.
        kwargs (keywords): Additional keyword arguments passed to the
            underlying serialization method or used to specify an AWS profile.
    """
    if _is_s3(path):
        # Stage the table in a scratch directory, archive it, then upload.
        with tempfile.TemporaryDirectory() as tmpdir:
            os.makedirs(os.path.join(tmpdir, 'data'))
            _dump_table(dataframe, tmpdir, **kwargs)
            archive_path = _create_archive(tmpdir)
            use_smartopen(
                archive_path,
                path,
                read=False,
                transport_params=get_transport_params(profile_name),
            )
    elif _is_url(path):
        raise ValueError("Writing to URLs is not supported")
    else:
        # Local write: ensure the target data directory exists first.
        target_dir = os.path.abspath(path)
        os.makedirs(os.path.join(target_dir, 'data'), exist_ok=True)
        _dump_table(dataframe, target_dir, **kwargs)
def read_woodwork_table(path, profile_name=None, validate=False, **kwargs):
    """Read a Woodwork table from disk, an S3 path, or a URL.

    Args:
        path (str): Directory on disk, S3 path, or URL containing
            `woodwork_typing_info.json`.
        profile_name (str, bool): AWS profile used to access S3. Defaults to
            None (search for AWS credentials); set to False for an anonymous
            profile.
        validate (bool, optional): Whether parameter and data validation
            should occur when initializing the Woodwork dataframe during
            deserialization. Defaults to False. Note: if the serialized data
            was modified outside of Woodwork and you are unsure of the
            validity of the data or typing information, set this to True.
        kwargs (keywords): Additional keyword arguments passed to the
            underlying deserialization method.

    Returns:
        DataFrame: DataFrame with Woodwork typing information initialized.
    """
    if not (_is_url(path) or _is_s3(path)):
        # Local directory: typing info can be read in place.
        typing_info = read_table_typing_information(path)
        return _typing_information_to_woodwork_table(typing_info, validate, **kwargs)

    with tempfile.TemporaryDirectory() as tmpdir:
        archive_path = os.path.join(tmpdir, Path(path).name)
        transport_params = get_transport_params(profile_name) if _is_s3(path) else None
        use_smartopen(archive_path, path, transport_params)
        # NOTE(review): extractall trusts member paths inside the downloaded
        # archive — only read archives from trusted sources (tar traversal).
        with tarfile.open(str(archive_path)) as tar:
            tar.extractall(path=tmpdir)
        typing_info = read_table_typing_information(tmpdir)
        return _typing_information_to_woodwork_table(typing_info, validate, **kwargs)
def read_datatable(path, profile_name=None, **kwargs):
    '''Read a DataTable from disk, an S3 path, or a URL.

    Args:
        path (str): Directory on disk, S3 path, or URL containing
            `table_description.json`.
        profile_name (str, bool): AWS profile used to access S3. Defaults to
            None (search for AWS credentials); set to False for an anonymous
            profile.
        kwargs (keywords): Additional keyword arguments passed to the
            underlying deserialization method.
    '''
    if not (_is_url(path) or _is_s3(path)):
        # Local directory: the description can be read in place.
        description = read_table_description(path)
        return description_to_datatable(description, **kwargs)

    with tempfile.TemporaryDirectory() as tmpdir:
        archive_path = os.path.join(tmpdir, Path(path).name)
        transport_params = get_transport_params(profile_name) if _is_s3(path) else None
        use_smartopen(archive_path, path, transport_params)
        # NOTE(review): extractall trusts member paths inside the downloaded
        # archive — only read archives from trusted sources (tar traversal).
        with tarfile.open(str(archive_path)) as tar:
            tar.extractall(path=tmpdir)
        description = read_table_description(tmpdir)
        return description_to_datatable(description, **kwargs)
def write_datatable(datatable, path, profile_name=None, **kwargs):
    '''Serialize a datatable and write it to disk or to an S3 path.

    Args:
        datatable (DataTable): Instance of :class:`.DataTable`.
        path (str): Location on disk to write datatable data and description.
        profile_name (str, bool): AWS profile used to write to S3. Defaults to
            None (search for AWS credentials); set to False for an anonymous
            profile.
        kwargs (keywords): Additional keyword arguments passed to the
            underlying serialization method or used to specify an AWS profile.
    '''
    if _is_s3(path):
        # Stage the table in a scratch directory, archive it, then upload.
        with tempfile.TemporaryDirectory() as tmpdir:
            os.makedirs(os.path.join(tmpdir, 'data'))
            dump_table(datatable, tmpdir, **kwargs)
            archive_path = create_archive(tmpdir)
            use_smartopen(
                archive_path,
                path,
                read=False,
                transport_params=get_transport_params(profile_name),
            )
    elif _is_url(path):
        raise ValueError("Writing to URLs is not supported")
    else:
        # Local write: ensure the target data directory exists first.
        target_dir = os.path.abspath(path)
        os.makedirs(os.path.join(target_dir, 'data'), exist_ok=True)
        dump_table(datatable, target_dir, **kwargs)
def save_to_s3(self, profile_name):
    """Serialize data and typing information to S3."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Write everything into a scratch directory, then upload the archive.
        self.write_path = tmpdir
        self.save_to_local_path()
        archive_path = self._create_archive()
        use_smartopen(
            archive_path,
            self.path,
            read=False,
            transport_params=get_transport_params(profile_name),
        )
def read_from_s3(self, profile_name):
    """Download the archived table, extract it locally, and read the data."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tar_filepath = os.path.join(tmpdir, Path(self.path).name)
        transport_params = get_transport_params(profile_name) if _is_s3(self.path) else None
        use_smartopen(tar_filepath, self.path, transport_params)
        # NOTE(review): extractall trusts member paths inside the downloaded
        # archive — only read archives from trusted sources (tar traversal).
        with tarfile.open(str(tar_filepath)) as tar:
            tar.extractall(path=tmpdir)
        self.read_path = os.path.join(tmpdir, self.data_subdirectory, self.filename)
        return self.read_from_local_path()
def read_from_s3(self, profile_name):
    """Read data from S3 into a dataframe"""
    with tempfile.TemporaryDirectory() as tmpdir:
        tar_filepath = os.path.join(tmpdir, Path(self.path).name)
        transport_params = get_transport_params(profile_name) if _is_s3(self.path) else None
        use_smartopen(tar_filepath, self.path, transport_params)
        # NOTE(review): extractall trusts member paths inside the downloaded
        # archive — only read archives from trusted sources (tar traversal).
        with tarfile.open(str(tar_filepath)) as tar:
            tar.extractall(path=tmpdir)
        # The typing info records where the data file lives inside the archive.
        data_location = self.typing_info["loading_info"]["location"]
        self.read_path = os.path.join(tmpdir, data_location)
        return self.read_from_local_path()
def read_table_typing_information(path, typing_info_filename, profile_name):
    """Read Woodwork typing information from disk, S3 path, or URL.

    Args:
        path (str): Location on disk, S3 path, or URL to read typing info file.
        typing_info_filename (str): Name of JSON file in which typing info is stored.
        profile_name (str, bool): The AWS profile specified to access to S3.

    Returns:
        dict: Woodwork typing information dictionary, with ``"path"`` set to
        the (absolute, for local reads) source path.

    Raises:
        FileNotFoundError: If a local ``path`` does not exist.
    """
    if _is_url(path) or _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            file_path = os.path.join(tmpdir, Path(path).name)
            transport_params = None
            if _is_s3(path):
                transport_params = get_transport_params(profile_name)
            use_smartopen(file_path, path, transport_params)
            # NOTE(review): extractall trusts member paths inside the
            # downloaded archive — only read archives from trusted sources
            # (tar path traversal risk).
            with tarfile.open(str(file_path)) as tar:
                tar.extractall(path=tmpdir)
            typing_info = _load_typing_info(os.path.join(tmpdir, typing_info_filename))
    else:
        path = os.path.abspath(path)
        # Raise instead of assert: asserts are stripped under `python -O`,
        # which would silently skip this existence check.
        if not os.path.exists(path):
            raise FileNotFoundError('"{}" does not exist'.format(path))
        typing_info = _load_typing_info(os.path.join(path, typing_info_filename))
    typing_info["path"] = path
    return typing_info


def _load_typing_info(file_path):
    """Load and return the typing-info JSON dictionary stored at ``file_path``."""
    with open(file_path, "r") as f:
        return json.load(f)