import os
import tarfile
import zipfile
from typing import List

import pandas as pd
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from pyspark.sql import SparkSession


def open_archive(
    context: MLClientCtx,
    archive_url: DataItem,
    subdir: str = "content",
    key: str = "content",
    target_path: str = None,
):
    """Open a file/object archive into a target directory.

    Currently supports zip and tar.gz.

    :param context:     function execution context
    :param archive_url: url of the archive file
    :param subdir:      path within the artifact store where the extracted
                        files are stored
    :param key:         key of the archive contents in the artifact store
    :param target_path: file system path to store the extracted files
                        (use either this or subdir)
    """
    os.makedirs(target_path or subdir, exist_ok=True)

    # Download the archive (if remote) and get a local file path
    archive_url = archive_url.local()

    if archive_url.endswith("gz"):
        with tarfile.open(archive_url, mode="r|gz") as ref:
            ref.extractall(target_path or subdir)
    elif archive_url.endswith("zip"):
        with zipfile.ZipFile(archive_url, "r") as ref:
            ref.extractall(target_path or subdir)
    else:
        raise ValueError(f"unsupported archive type in {archive_url}")

    # Log the extracted directory as an artifact
    if target_path:
        kwargs = {"target_path": target_path}
    else:
        kwargs = {"local_path": subdir}
    context.log_artifact(key, **kwargs)
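
# A minimal usage sketch (not part of the original module): run `open_archive`
# as an MLRun job. The module filename `archive_utils.py` is a hypothetical
# placeholder, and the exact `code_to_function`/`run` keyword arguments may
# differ across MLRun versions.
def _example_run_open_archive():
    import mlrun

    fn = mlrun.code_to_function(
        name="open-archive",
        filename="archive_utils.py",  # hypothetical module name
        kind="job",
        image="mlrun/mlrun",
        handler="open_archive",
    )
    # The archive URL is passed as a data input, extraction settings as params
    return fn.run(
        inputs={"archive_url": "https://example.com/data/images.zip"},
        params={"key": "images", "target_path": "./content"},
        local=True,
    )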
def describe_spark(
    context: MLClientCtx,
    dataset: DataItem,
    artifact_path,
    bins: int = 30,
    describe_extended: bool = True,
):
    """Produce summary statistics for a CSV dataset using Spark.

    :param context:           function execution context
    :param dataset:           CSV dataset to describe
    :param artifact_path:     base path for logged artifacts
    :param bins:              number of histogram bins for numeric columns
    :param describe_extended: when True, use the extended `describe` helper
                              (defined elsewhere in this module); otherwise
                              fall back to Spark's built-in describe()
    """
    location = dataset.local()
    spark = SparkSession.builder.appName("Spark job").getOrCreate()
    df = spark.read.csv(location, header=True, inferSchema=True)

    kwargs = []
    float_cols = [
        item[0]
        for item in df.dtypes
        if item[1].startswith("float") or item[1].startswith("double")
    ]

    if describe_extended:
        table, variables, freq = describe(df, bins, float_cols, kwargs)
        tbl_1 = variables.reset_index()
        if len(freq) != 0:
            # Flatten the frequency dict into one row per column, merging the
            # value counts into a tuple of {value: count} dicts per column
            tbl_2 = (
                pd.DataFrame.from_dict(freq, orient="index")
                .sort_index()
                .stack()
                .reset_index()
            )
            tbl_2.columns = ["col", "key", "val"]
            tbl_2["Merged"] = [{key: val} for key, val in zip(tbl_2.key, tbl_2.val)]
            tbl_2 = tbl_2.groupby("col", as_index=False).agg(lambda x: tuple(x))[
                ["col", "Merged"]
            ]
            summary = pd.merge(
                tbl_1, tbl_2, how="left", left_on="index", right_on="col"
            )
        else:
            summary = tbl_1
        context.log_dataset(
            "summary_stats",
            df=summary,
            format="csv",
            index=False,
            artifact_path=context.artifact_subpath("data"),
        )
        context.log_results(table)
    else:
        summary = df.describe().toPandas().T
        context.log_dataset(
            "summary_stats",
            df=summary,
            format="csv",
            index=False,
            artifact_path=context.artifact_subpath("data"),
        )

    spark.stop()
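
# A minimal local smoke-test sketch (not part of the original module) for
# `describe_spark`: it builds an execution context and a DataItem by hand
# instead of running through an MLRun job. It assumes MLRun and PySpark are
# installed and that `./data/iris.csv` (a hypothetical path) exists; the
# extended path additionally needs the `describe` helper importable here.
def _example_run_describe_spark():
    import mlrun

    context = mlrun.get_or_create_ctx("describe-spark-test")
    dataset = mlrun.get_dataitem("./data/iris.csv")  # any CSV Spark can read

    # Use the plain Spark describe() path to avoid the extended helper
    describe_spark(
        context,
        dataset,
        artifact_path=context.artifact_path,
        bins=20,
        describe_extended=False,
    )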
def arc_to_parquet(
    context: MLClientCtx,
    archive_url: DataItem,
    header: List[str] = [None],
    chunksize: int = 0,
    dtype=None,
    encoding: str = "latin-1",
    key: str = "data",
    dataset: str = None,
    part_cols=[],
    file_ext: str = "parquet",
    index: bool = False,
    refresh_data: bool = False,
    stats: bool = False,
) -> None:
    """Open a file/object archive and save as a parquet file or dataset.

    Notes
    -----
    * this function is typically used for large files, please be sure to
      check all settings
    * partitioning requires precise specification of column types
    * the archive_url can be any file readable by pandas read_csv, which
      includes tar files
    * if the `dataset` parameter is not empty, then a partitioned dataset
      will be created instead of a single file in the folder `dataset`
    * if a key already exists it will not be re-acquired unless the
      `refresh_data` param is set to `True`, in case the original file is
      corrupt or a refresh is required

    :param context:      the function context
    :param archive_url:  MLRun data input (DataItem object)
    :param header:       ([None]) column names for the destination file
    :param chunksize:    (0) when > 0, row size (chunk) to retrieve per iteration
    :param dtype:        destination data type of specified columns
    :param encoding:     ("latin-1") file encoding
    :param key:          key in the artifact store
    :param dataset:      (None) if not None, then "target_path/dataset" is the
                         folder for partitioned files
    :param part_cols:    ([]) list of partitioning columns
    :param file_ext:     (parquet) csv/parquet file extension
    :param index:        (False) pandas save index option
    :param refresh_data: (False) overwrite existing data at that location
    :param stats:        (False) calculate table stats when logging the artifact
    """
    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)

    archive_url = archive_url.local()

    if dataset is not None:
        dest_path = os.path.join(base_path, dataset)
        exists = os.path.isdir(dest_path)
    else:
        dest_path = os.path.join(base_path, key + f".{file_ext}")
        exists = os.path.isfile(dest_path)

    # Acquire the data when it is missing or a refresh was requested
    if not exists or refresh_data:
        context.logger.info("destination does not exist or refresh requested, downloading")
        if chunksize > 0:
            # Stream the source in chunks; `_chunk_readwrite` is a helper
            # defined elsewhere in this module
            header = _chunk_readwrite(
                archive_url, dest_path, chunksize, encoding, dtype, dataset
            )
            context.log_dataset(
                key=key, stats=stats, format="parquet", target_path=dest_path
            )
        else:
            df = pd.read_csv(archive_url, encoding=encoding, dtype=dtype)
            context.log_dataset(key, df=df, format=file_ext, index=index)
    else:
        context.logger.info("destination file already exists, nothing done")
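
# A minimal usage sketch (not part of the original module) for `arc_to_parquet`:
# a direct call with a hand-built context and DataItem. The source URL is a
# hypothetical placeholder; the chunked path (chunksize > 0) also requires the
# `_chunk_readwrite` helper defined elsewhere in this module.
def _example_run_arc_to_parquet():
    import mlrun

    context = mlrun.get_or_create_ctx("arc-to-parquet-test")
    archive = mlrun.get_dataitem("https://example.com/data/large_table.csv.gz")

    arc_to_parquet(
        context,
        archive,
        chunksize=10_000,   # stream the CSV in 10k-row chunks
        key="table",
        refresh_data=True,  # re-acquire even if a previous file exists
    )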