from azureml.core import Dataset, Datastore, Workspace
from azureml.data.datapath import DataPath
from azureml.data.dataset_factory import TabularDatasetFactory

# Connect to the Azure Machine Learning workspace
# (`sp_auth` is the authentication object created beforehand, e.g. a ServicePrincipalAuthentication)
azureml_workspace = Workspace.from_config(auth=sp_auth)

# Like the DBFS Mount, the Azure ML Datastore references the same `processed` container on Azure Storage
processed_ds = Datastore.get(azureml_workspace, 'datastoreprocessed')
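
# A minimal sketch of how such a datastore could have been registered; the
# account name and key below are placeholders, not values from this project:
#
# processed_ds = Datastore.register_azure_blob_container(
#     workspace=azureml_workspace,
#     datastore_name='datastoreprocessed',
#     container_name='processed',
#     account_name='<storage-account-name>',
#     account_key='<storage-account-key>')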

# Dataset A: a subset of comments in the gaming category.
#
# We will use it to run a quick feasibility-analysis experiment, and as a
# cost-effective way to experiment with changes while we iterate on model versions.

# `match_pattern_on_storage` is a helper (defined elsewhere) that expands a
# glob pattern into the matching blob paths on the datastore's container.
comments_subset_gaming_dataset = TabularDatasetFactory.from_parquet_files([
    DataPath(processed_ds, path) for path in match_pattern_on_storage(
        "redditcomments/subreddit=gaming/*.parquet")
])
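
# For the quick feasibility pass, a small slice of the subset can be pulled
# into pandas; `take` limits how many records are materialized locally.
sample_df = comments_subset_gaming_dataset.take(1000).to_pandas_dataframe()
print(sample_df.shape)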

# Dataset B: the full set of comments, for model training at scale

comments_full_dataset = TabularDatasetFactory.from_parquet_files([
    DataPath(processed_ds, path)
    for path in match_pattern_on_storage("redditcomments/*/*.parquet")
])

# Register the dataset version in Azure ML for reference during training
comments_full_dataset.register(azureml_workspace,
                               name="redditcomments",
                               create_new_version=True,
                               description="The full dataset of comments")
Example #2
    def _create_tabular(self, parameters, validate):
        """Create a TabularDataset described by a JSON parameters payload.

        Dispatches on the source-type property; when azureml-dataprep is not
        installed, falls back to assembling the dataset from low-level read
        blocks instead of the factory shortcuts.
        """
        source_type = self._json_utility.try_get_value(
            parameters, self._prop_source_type, None,
            lambda v: v in self._valid_source_types,
            'Property "{}" must be one of {}.'.format(
                self._prop_source_type, self._valid_source_types))

        if source_type == 'sql_query':
            query = self._get_query(parameters)
            if is_dataprep_installed():
                return TabularDatasetFactory.from_sql_query(query, validate)
            # Without azureml-dataprep, build the dataset from low-level
            # read blocks instead.
            return self._create_dataset_from_blocks(
                [_Block.craft_read_sql_block(query)], TabularDataset)

        # All file-based source types share a path, an include_path flag,
        # and an optional partition format.
        path = self._get_path(parameters)
        include_path = self._json_utility.try_get_value(
            parameters, self._prop_include_path, self._default_include_path)
        partition_format = self._json_utility.try_get_value(
            parameters, self._prop_partition_format, None)

        if source_type == 'parquet_files':
            if is_dataprep_installed():
                return TabularDatasetFactory.from_parquet_files(
                    path,
                    validate,
                    include_path,
                    partition_format=partition_format)
            return self._create_dataset_from_blocks([
                _Block.craft_get_file_block(path),
                _Block.craft_read_parquet_block(),
                _Block.craft_partition_format_block(partition_format)
                if partition_format else None,
                _Block.craft_drop_path_column_block()
                if not include_path else None
            ], TabularDataset)

        if source_type == 'json_lines_files':
            if is_dataprep_installed():
                return TabularDatasetFactory.from_json_lines_files(
                    path,
                    validate,
                    include_path,
                    partition_format=partition_format)
            return self._create_dataset_from_blocks([
                _Block.craft_get_file_block(path),
                _Block.craft_read_json_lines_block(),
                _Block.craft_partition_format_block(partition_format)
                if partition_format else None,
                _Block.craft_drop_path_column_block()
                if not include_path else None
            ], TabularDataset)

        if source_type == 'delimited_files':
            infer_column_types = self._json_utility.try_get_value(
                parameters, self._prop_infer_column_types,
                self._default_infer_column_types)
            separator = self._json_utility.try_get_value(
                parameters, self._prop_separator, self._default_separator)
            header = self._json_utility.try_get_value(parameters,
                                                      self._prop_header,
                                                      self._default_header)
            if is_dataprep_installed():
                return TabularDatasetFactory.from_delimited_files(
                    path,
                    validate,
                    include_path=include_path,
                    partition_format=partition_format,
                    infer_column_types=infer_column_types,
                    separator=separator,
                    header=header)
            if infer_column_types:
                # Column-type inference is only available through azureml-dataprep,
                # so without it the caller must explicitly disable inference.
                _raise_dataprep_missing_error(
                    'Cannot infer column types',
                    self._error_utility.get_error_message(
                        'setting {} to false'.format(
                            self._prop_infer_column_types)))
            return self._create_dataset_from_blocks([
                _Block.craft_get_file_block(path),
                _Block.craft_read_delimited_block(separator, header),
                _Block.craft_partition_format_block(partition_format)
                if partition_format else None,
                _Block.craft_drop_path_column_block()
                if not include_path else None
            ], TabularDataset)

        raise RuntimeError('Unexpected code path for source_type: ' +
                           source_type)
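
# A sketch of a parameters payload this method might consume. The property
# names below ("source_type", "path", ...) are assumptions inferred from the
# `_prop_*` attributes above, not confirmed keys, so this stays illustrative:
#
# parameters = {
#     "source_type": "delimited_files",
#     "path": "https://<account>.blob.core.windows.net/processed/comments/*.csv",
#     "include_path": False,
#     "separator": ",",
#     "header": True,
#     "infer_column_types": True,
# }
# dataset = factory._create_tabular(parameters, validate=True)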