Example #1
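This run method builds a TabularDataset from parquet files held in an Azure ML
Datastore, optionally assigns fine and coarse grain timestamp columns, and
registers the result in the datastore's workspace.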
    def run(
        self,
        dataset_name: str = None,
        datastore: Datastore = None,
        path: Union[str, List[str]] = None,
        dataset_description: str = "",
        dataset_tags: Optional[Dict[str, str]] = None,
        include_path: bool = False,
        set_column_types: Optional[Dict[str, DataType]] = None,
        fine_grain_timestamp: str = None,
        coarse_grain_timestamp: str = None,
        partition_format: str = None,
        create_new_version: bool = None,
    ) -> TabularDataset:
        """
        Task run method.

        Args:
            - dataset_name (str, optional): The name of the Dataset in the Workspace.
            - datastore (azureml.core.datastore.Datastore, optional): The Datastore which holds
                the files.
            - path (Union[str, List[str]], optional): The path to the delimited files in the
                Datastore.
            - dataset_description (str, optional): Description of the Dataset.
            - dataset_tags (Dict[str, str], optional): Tags to associate with the Dataset.
            - include_path (bool, optional): Boolean to keep path information as column in the
                dataset.
            - set_column_types (Dict[str, azureml.data.DataType], optional): A dictionary to
                set column data type, where key is column name and value is a
                `azureml.data.DataType`.
            - fine_grain_timestamp (str, optional): The name of the column to use as the
                fine grain timestamp.
            - coarse_grain_timestamp (str, optional): The name of the column to use as the
                coarse grain timestamp.
            - partition_format (str, optional): Specify the partition format of path.
            - create_new_version (bool, optional): Boolean to register the dataset as a new
                version under the specified name.

        Returns:
            - azureml.data.TabularDataset: the created TabularDataset.
        """
        if dataset_name is None:
            raise ValueError("A dataset_name must be provided.")

        if path is None:
            raise ValueError("A path must be provided.")

        if datastore is None:
            raise ValueError("A datastore must be provided.")

        if not isinstance(path, list):
            path = [path]

        dataset_tags = dataset_tags or dict()

        dataset = azureml.core.dataset.Dataset.Tabular.from_parquet_files(
            path=[(datastore, path_item) for path_item in path],
            include_path=include_path,
            set_column_types=set_column_types,
            partition_format=partition_format,
        )

        dataset = dataset.with_timestamp_columns(
            fine_grain_timestamp=fine_grain_timestamp,
            coarse_grain_timestamp=coarse_grain_timestamp,
            validate=True,
        )

        dataset = dataset.register(
            workspace=datastore.workspace,
            name=dataset_name,
            description=dataset_description,
            tags=dataset_tags,
            create_new_version=create_new_version,
        )

        return dataset
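For reference, a minimal standalone sketch of the same call sequence (build, add
timestamp columns, register), assuming an azureml-core v1 environment with a
workspace config file on disk; the "my_datastore" name, the parquet path, and the
column and dataset names below are illustrative placeholders, not values from the
task above.

from azureml.core import Workspace
from azureml.core.datastore import Datastore
from azureml.core.dataset import Dataset

workspace = Workspace.from_config()  # assumes a config.json is available locally
datastore = Datastore.get(workspace, "my_datastore")  # placeholder datastore name

# Build the TabularDataset from parquet files in the datastore.
dataset = Dataset.Tabular.from_parquet_files(
    path=[(datastore, "events/2021/*.parquet")],  # placeholder path
)

# Mark a timestamp column, then register the dataset under a name in the workspace.
dataset = dataset.with_timestamp_columns(fine_grain_timestamp="event_time")
dataset = dataset.register(
    workspace=workspace,
    name="events",
    description="Event data loaded from parquet files.",
    create_new_version=True,
)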
Example #2
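This variant does the same for delimited (e.g. CSV) files, adding the
infer_column_types, separator, and header options to control how the files are
parsed.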
    def run(
        self,
        dataset_name: str = None,
        datastore: Datastore = None,
        path: Union[str, List[str]] = None,
        dataset_description: str = "",
        dataset_tags: Optional[Dict[str, str]] = None,
        include_path: bool = False,
        infer_column_types: bool = True,
        set_column_types: Optional[Dict[str, DataType]] = None,
        fine_grain_timestamp: str = None,
        coarse_grain_timestamp: str = None,
        separator: str = ",",
        header: PromoteHeadersBehavior = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS,
        partition_format: str = None,
        create_new_version: bool = False,
    ) -> TabularDataset:
        """
        Task run method.

        Args:
            - dataset_name (str, optional): The name of the Dataset in the Workspace.
            - datastore (azureml.core.datastore.Datastore, optional): The Datastore which holds
                the files.
            - path (Union[str, List[str]], optional): The path to the delimited files in the
                Datastore.
            - dataset_description (str, optional): Description of the Dataset.
            - dataset_tags (Dict[str, str], optional): Tags to associate with the Dataset.
            - include_path (bool, optional): Boolean to keep path information as column in the
                dataset.
            - infer_column_types (bool, optional): Boolean to infer column data types.
            - set_column_types (Dict[str, azureml.data.DataType], optional): A dictionary to
                set column data type, where key is column name and value is a
                `azureml.data.DataType`.
            - fine_grain_timestamp (str, optional): The name of the column to use as the
                fine grain timestamp.
            - coarse_grain_timestamp (str, optional): The name of the column to use as the
                coarse grain timestamp.
            - separator (str, optional): The separator used to split columns.
            - header (azureml.data.dataset_type_definitions.PromoteHeadersBehavior, optional):
                Controls how column headers are promoted when reading from files. Defaults to
                assume that all files have the same header.
            - partition_format (str, optional): Specify the partition format of path.
            - create_new_version (bool, optional): Boolean to register the dataset as a new
                version under the specified name.

        Returns:
            - azureml.data.TabularDataset: the created TabularDataset.
        """
        if dataset_name is None:
            raise ValueError("A dataset_name must be provided.")

        if path is None:
            raise ValueError("A path must be provided.")

        if datastore is None:
            raise ValueError("A datastore must be provided.")

        if not isinstance(path, list):
            path = [path]

        dataset_tags = dataset_tags or dict()

        dataset = azureml.core.dataset.Dataset.Tabular.from_delimited_files(
            path=[(datastore, path_item) for path_item in path],
            include_path=include_path,
            infer_column_types=infer_column_types,
            set_column_types=set_column_types,
            separator=separator,
            header=header,
            partition_format=partition_format,
        )

        dataset = dataset.with_timestamp_columns(
            fine_grain_timestamp=fine_grain_timestamp,
            coarse_grain_timestamp=coarse_grain_timestamp,
            validate=True,
        )

        dataset = dataset.register(
            workspace=datastore.workspace,
            name=dataset_name,
            description=dataset_description,
            tags=dataset_tags,
            create_new_version=create_new_version,
        )

        return dataset
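Again for reference, a minimal standalone sketch exercising the delimited-file
variant, assuming the same azureml-core v1 environment; the datastore name, CSV
path, and dataset name are illustrative placeholders.

from azureml.core import Workspace
from azureml.core.datastore import Datastore
from azureml.core.dataset import Dataset
from azureml.data.dataset_type_definitions import PromoteHeadersBehavior

workspace = Workspace.from_config()
datastore = Datastore.get(workspace, "my_datastore")  # placeholder datastore name

# Build the TabularDataset from CSV files, with explicit parsing options.
dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, "exports/*.csv")],  # placeholder path
    separator=",",
    header=PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS,
    infer_column_types=True,
)
dataset = dataset.register(workspace=workspace, name="exports", create_new_version=True)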