import os
import tarfile
import zipfile
from typing import List

import pandas as pd
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx
from pyspark.sql import SparkSession


def open_archive(
    context: MLClientCtx,
    archive_url: DataItem,
    subdir: str = "content",
    key: str = "content",
    target_path: str = None,
):
    """Open a file/object archive into a target directory.

    Currently supports zip and tar.gz.

    :param context:     function execution context
    :param archive_url: url of the archive file
    :param subdir:      path within the artifact store where the extracted
                        files are stored
    :param key:         key of the archive contents in the artifact store
    :param target_path: file system path to store the extracted files
                        (use either this or subdir)
    """
    os.makedirs(target_path or subdir, exist_ok=True)

    # Download the archive (if remote) and get a local file path
    archive_url = archive_url.local()

    if archive_url.endswith("gz"):
        with tarfile.open(archive_url, mode="r|gz") as ref:
            ref.extractall(target_path or subdir)
    elif archive_url.endswith("zip"):
        with zipfile.ZipFile(archive_url, "r") as ref:
            ref.extractall(target_path or subdir)
    else:
        raise ValueError(f"unsupported archive type in {archive_url}")

    # Log the extracted directory as an artifact
    if target_path:
        kwargs = {"target_path": target_path}
    else:
        kwargs = {"local_path": subdir}
    context.log_artifact(key, **kwargs)
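
# A minimal usage sketch (not part of the original module): run `open_archive`
# as an MLRun job. The module filename `archive_utils.py` is a hypothetical
# placeholder, and the exact `code_to_function`/`run` keyword arguments may
# differ across MLRun versions.
def _example_run_open_archive():
    import mlrun

    fn = mlrun.code_to_function(
        name="open-archive",
        filename="archive_utils.py",  # hypothetical module name
        kind="job",
        image="mlrun/mlrun",
        handler="open_archive",
    )
    # The archive URL is passed as a data input, extraction settings as params
    return fn.run(
        inputs={"archive_url": "https://example.com/data/images.zip"},
        params={"key": "images", "target_path": "./content"},
        local=True,
    )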
def describe_spark(
    context: MLClientCtx,
    dataset: DataItem,
    artifact_path,
    bins: int = 30,
    describe_extended: bool = True,
):
    """Produce summary statistics for a CSV dataset using Spark.

    :param context:           function execution context
    :param dataset:           CSV dataset to describe
    :param artifact_path:     base path for logged artifacts
    :param bins:              number of histogram bins for numeric columns
    :param describe_extended: when True, use the extended `describe` helper
                              (defined elsewhere in this module); otherwise
                              fall back to Spark's built-in describe()
    """
    location = dataset.local()
    spark = SparkSession.builder.appName("Spark job").getOrCreate()
    df = spark.read.csv(location, header=True, inferSchema=True)

    kwargs = []
    float_cols = [
        item[0]
        for item in df.dtypes
        if item[1].startswith("float") or item[1].startswith("double")
    ]

    if describe_extended:
        table, variables, freq = describe(df, bins, float_cols, kwargs)
        tbl_1 = variables.reset_index()
        if len(freq) != 0:
            # Flatten the frequency dict into one row per column, merging the
            # value counts into a tuple of {value: count} dicts per column
            tbl_2 = (
                pd.DataFrame.from_dict(freq, orient="index")
                .sort_index()
                .stack()
                .reset_index()
            )
            tbl_2.columns = ["col", "key", "val"]
            tbl_2["Merged"] = [{key: val} for key, val in zip(tbl_2.key, tbl_2.val)]
            tbl_2 = tbl_2.groupby("col", as_index=False).agg(lambda x: tuple(x))[
                ["col", "Merged"]
            ]
            summary = pd.merge(
                tbl_1, tbl_2, how="left", left_on="index", right_on="col"
            )
        else:
            summary = tbl_1
        context.log_dataset(
            "summary_stats",
            df=summary,
            format="csv",
            index=False,
            artifact_path=context.artifact_subpath("data"),
        )
        context.log_results(table)
    else:
        summary = df.describe().toPandas().T
        context.log_dataset(
            "summary_stats",
            df=summary,
            format="csv",
            index=False,
            artifact_path=context.artifact_subpath("data"),
        )

    spark.stop()
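
# A minimal local smoke-test sketch (not part of the original module) for
# `describe_spark`: it builds an execution context and a DataItem by hand
# instead of running through an MLRun job. It assumes MLRun and PySpark are
# installed and that `./data/iris.csv` (a hypothetical path) exists; the
# extended path additionally needs the `describe` helper importable here.
def _example_run_describe_spark():
    import mlrun

    context = mlrun.get_or_create_ctx("describe-spark-test")
    dataset = mlrun.get_dataitem("./data/iris.csv")  # any CSV Spark can read

    # Use the plain Spark describe() path to avoid the extended helper
    describe_spark(
        context,
        dataset,
        artifact_path=context.artifact_path,
        bins=20,
        describe_extended=False,
    )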
def arc_to_parquet(
    context: MLClientCtx,
    archive_url: DataItem,
    header: List[str] = [None],
    chunksize: int = 0,
    dtype=None,
    encoding: str = "latin-1",
    key: str = "data",
    dataset: str = None,
    part_cols=[],
    file_ext: str = "parquet",
    index: bool = False,
    refresh_data: bool = False,
    stats: bool = False,
) -> None:
    """Open a file/object archive and save as a parquet file or dataset.

    Notes
    -----
    * this function is typically used for large files, please be sure to
      check all settings
    * partitioning requires precise specification of column types
    * the archive_url can be any file readable by pandas read_csv, which
      includes tar files
    * if the `dataset` parameter is not empty, then a partitioned dataset
      will be created instead of a single file in the folder `dataset`
    * if a key already exists it will not be re-acquired unless the
      `refresh_data` param is set to `True`, in case the original file is
      corrupt or a refresh is required

    :param context:      the function context
    :param archive_url:  MLRun data input (DataItem object)
    :param header:       ([None]) column names for the destination file
    :param chunksize:    (0) when > 0, row size (chunk) to retrieve per iteration
    :param dtype:        destination data type of specified columns
    :param encoding:     ("latin-1") file encoding
    :param key:          key in the artifact store
    :param dataset:      (None) if not None, then "target_path/dataset" is the
                         folder for partitioned files
    :param part_cols:    ([]) list of partitioning columns
    :param file_ext:     (parquet) csv/parquet file extension
    :param index:        (False) pandas save index option
    :param refresh_data: (False) overwrite existing data at that location
    :param stats:        (False) calculate table stats when logging the artifact
    """
    base_path = context.artifact_path
    os.makedirs(base_path, exist_ok=True)

    archive_url = archive_url.local()

    if dataset is not None:
        dest_path = os.path.join(base_path, dataset)
        exists = os.path.isdir(dest_path)
    else:
        dest_path = os.path.join(base_path, key + f".{file_ext}")
        exists = os.path.isfile(dest_path)

    # Acquire the data when it is missing or a refresh was requested
    if not exists or refresh_data:
        context.logger.info("destination does not exist or refresh requested, downloading")
        if chunksize > 0:
            # Stream the source in chunks; `_chunk_readwrite` is a helper
            # defined elsewhere in this module
            header = _chunk_readwrite(
                archive_url, dest_path, chunksize, encoding, dtype, dataset
            )
            context.log_dataset(
                key=key, stats=stats, format="parquet", target_path=dest_path
            )
        else:
            df = pd.read_csv(archive_url, encoding=encoding, dtype=dtype)
            context.log_dataset(key, df=df, format=file_ext, index=index)
    else:
        context.logger.info("destination file already exists, nothing done")
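
# A minimal usage sketch (not part of the original module) for `arc_to_parquet`:
# a direct call with a hand-built context and DataItem. The source URL is a
# hypothetical placeholder; the chunked path (chunksize > 0) also requires the
# `_chunk_readwrite` helper defined elsewhere in this module.
def _example_run_arc_to_parquet():
    import mlrun

    context = mlrun.get_or_create_ctx("arc-to-parquet-test")
    archive = mlrun.get_dataitem("https://example.com/data/large_table.csv.gz")

    arc_to_parquet(
        context,
        archive,
        chunksize=10_000,   # stream the CSV in 10k-row chunks
        key="table",
        refresh_data=True,  # re-acquire even if a previous file exists
    )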