Example No. 1
import os
import tarfile
import zipfile
from typing import Optional

from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx


def open_archive(
    context: MLClientCtx,
    archive_url: DataItem,
    subdir: str = "content",
    key: str = "content",
    target_path: Optional[str] = None,
):
    """Open a file/object archive into a target directory

    Currently supports zip and tar.gz

    :param context:      function execution context
    :param archive_url:  url of archive file
    :param subdir:       path within artifact store where extracted files
                         are stored
    :param key:          key of archive contents in artifact store
    :param target_path:  file system path to store extracted files (use either this or subdir)
    """
    os.makedirs(target_path or subdir, exist_ok=True)

    # fetch the archive to the local file system, then extract it
    archive_url = archive_url.local()
    if archive_url.endswith("gz"):
        with tarfile.open(archive_url, mode="r|gz") as ref:
            ref.extractall(target_path or subdir)
    elif archive_url.endswith("zip"):
        with zipfile.ZipFile(archive_url, "r") as ref:
            ref.extractall(target_path or subdir)
    else:
        raise ValueError(f"unsupported archive type in {archive_url}")

    # log the extracted content as an artifact
    if target_path:
        kwargs = {"target_path": target_path}
    else:
        kwargs = {"local_path": subdir}
    context.log_artifact(key, **kwargs)
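A minimal invocation sketch follows, assuming an MLRun environment; the module file name and archive URL below are placeholders:

import mlrun

# A sketch, assuming the handler above is saved as open_archive.py and an
# MLRun environment is configured; the archive URL is illustrative.
fn = mlrun.code_to_function("open-archive", filename="open_archive.py",
                            kind="job", image="mlrun/mlrun")
run = fn.run(handler="open_archive",
             inputs={"archive_url": "https://example.com/data.zip"},
             params={"subdir": "content"},
             local=True)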
Example No. 2
import pandas as pd
from pyspark.sql import SparkSession

from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx


def describe_spark(context: MLClientCtx,
                   dataset: DataItem,
                   artifact_path,
                   bins: int = 30,
                   describe_extended: bool = True):
    """Produce summary statistics for a CSV dataset using Spark.

    :param context:           function execution context
    :param dataset:           input dataset (CSV)
    :param artifact_path:     target path for artifacts (outputs below are
                              logged under the context's artifact path)
    :param bins:              number of histogram bins
    :param describe_extended: when True, compute extended statistics via the
                              external `describe` helper
    """

    location = dataset.local()

    spark = SparkSession.builder.appName("Spark job").getOrCreate()

    df = spark.read.csv(location, header=True, inferSchema=True)

    # extra options forwarded to the external `describe` helper
    # (assumed to be available in this module's scope)
    kwargs = []

    float_cols = [
        item[0] for item in df.dtypes
        if item[1].startswith('float') or item[1].startswith('double')
    ]

    if describe_extended:

        table, variables, freq = describe(df, bins, float_cols, kwargs)

        tbl_1 = variables.reset_index()

        if len(freq) != 0:
            tbl_2 = pd.DataFrame.from_dict(
                freq, orient="index").sort_index().stack().reset_index()
            tbl_2.columns = ['col', 'key', 'val']
            tbl_2['Merged'] = [{
                key: val
            } for key, val in zip(tbl_2.key, tbl_2.val)]
            tbl_2 = tbl_2.groupby(
                'col',
                as_index=False).agg(lambda x: tuple(x))[['col', 'Merged']]

            summary = pd.merge(tbl_1,
                               tbl_2,
                               how='left',
                               left_on='index',
                               right_on='col')

        else:
            summary = tbl_1

        context.log_dataset("summary_stats",
                            df=summary,
                            format="csv",
                            index=False,
                            artifact_path=context.artifact_subpath('data'))

        context.log_results(table)

    else:
        tbl_1 = df.describe().toPandas()

        summary = tbl_1.T

        context.log_dataset("summary_stats",
                            df=summary,
                            format="csv",
                            index=False,
                            artifact_path=context.artifact_subpath('data'))

    spark.stop()
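A minimal local invocation sketch, assuming a configured MLRun environment and that the external `describe` helper is importable; the CSV path is a placeholder:

import mlrun

# A sketch: get_or_create_ctx gives a local execution context, and
# get_dataitem wraps the (placeholder) CSV path as a DataItem.
context = mlrun.get_or_create_ctx("describe-spark")
dataset = mlrun.get_dataitem("/path/to/data.csv")
describe_spark(context, dataset,
               artifact_path=context.artifact_path,
               bins=20)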
Example No. 3
import os
from typing import List

import pandas as pd

from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx


def arc_to_parquet(context: MLClientCtx,
                   archive_url: DataItem,
                   header: List[str] = None,
                   chunksize: int = 0,
                   dtype=None,
                   encoding: str = "latin-1",
                   key: str = "data",
                   dataset: str = None,
                   part_cols=None,
                   file_ext: str = "parquet",
                   index: bool = False,
                   refresh_data: bool = False,
                   stats: bool = False) -> None:
    """Open a file/object archive and save as a parquet file or dataset

    Notes
    -----
    * this function is typically for large files, please be sure to check all settings
    * partitioning requires precise specification of column types.
    * the archive_url can be any file readable by pandas read_csv, which includes tar files
    * if the `dataset` parameter is not empty, then a partitioned dataset will be
      created in the folder `dataset` instead of a single file
    * if the destination file or dataset already exists it will not be re-acquired
      unless the `refresh_data` param is set to `True`.  This is in case the
      original file is corrupt, or a refresh is required.

    :param context:        the function context
    :param archive_url:    MLRun data input (DataItem object)
    :param header:         (None) column names for the destination file
    :param chunksize:      (0) when > 0, row size (chunk) to retrieve
                           per iteration
    :param dtype:          destination data type of specified columns
    :param encoding:       ("latin-1") file encoding
    :param key:            ("data") key in artifact store
    :param dataset:        (None) if not None then "target_path/dataset"
                           is folder for partitioned files
    :param part_cols:      (None) list of partitioning columns
    :param file_ext:       (parquet) csv/parquet file extension
    :param index:          (False) pandas save index option
    :param refresh_data:   (False) overwrite existing data at that location
    :param stats:          (False) calculate table stats when logging artifact
    """
    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)

    archive_url = archive_url.local()

    if dataset is not None:
        dest_path = os.path.join(base_path, dataset)
        exists = os.path.isdir(dest_path)
    else:
        dest_path = os.path.join(base_path, key + f".{file_ext}")
        exists = os.path.isfile(dest_path)

    if not exists or refresh_data:
        context.logger.info("downloading data (destination missing or refresh requested)")
        if chunksize > 0:
            # _chunk_readwrite is an external helper (assumed to be defined
            # elsewhere) that streams the csv into parquet chunks
            header = _chunk_readwrite(archive_url, dest_path, chunksize,
                                      encoding, dtype, dataset)
            context.log_dataset(key=key,
                                stats=stats,
                                format='parquet',
                                target_path=dest_path)
        else:
            df = pd.read_csv(archive_url, encoding=encoding, dtype=dtype)
            context.log_dataset(key, df=df, format=file_ext, index=index)
    else:
        context.logger.info("destination file already exists, nothing done")
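A minimal local invocation sketch, assuming a configured MLRun environment; the archive URL is a placeholder (any pandas-readable csv source works):

import mlrun

# A sketch: run the handler directly against a local MLRun context;
# with chunksize left at 0, the plain pd.read_csv path is used.
context = mlrun.get_or_create_ctx("arc-to-parquet")
archive = mlrun.get_dataitem("https://example.com/data.csv.gz")
arc_to_parquet(context, archive, key="data", refresh_data=True)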