import os

from azureml.core import Dataset, Run


def get_data():
    time_column_name = 'dtime'
    target_column_name = os.environ['FORECAST_FILE_PREFIX']
    granularity = os.environ['FORECAST_GRANULARITY']
    horizon = int(os.environ['FORECAST_HORIZON'])
    print('target:{}, granularity:{}, horizon:{}'
          .format(target_column_name, granularity, horizon))

    # Get the data from the registered dataset; this is equivalent to the
    # following local call:
    # df = pd.read_csv(csvfile, header=0, index_col=0, parse_dates=True)
    run = Run.get_context()
    workspace = run.experiment.workspace
    dataset = Dataset.get(workspace, target_column_name)
    df = dataset.to_pandas_dataframe()
    df.index = df[time_column_name]
    df.drop(time_column_name, inplace=True, axis=1)

    min_time, max_time = df.index.min(), df.index.max()
    try_split = split_train_test_by_granularity(granularity, horizon,
                                                min_time, max_time)
    if try_split is None:
        raise Exception('can not train data', min_time, max_time)
    (delta, frequency,
     training_slice_begin, training_slice_end,
     test_slice_begin, test_slice_end) = try_split
    print('train between %s and %s, forecast between %s and %s' %
          (training_slice_begin, training_slice_end,
           test_slice_begin, test_slice_end))

    df = df.loc[training_slice_begin:test_slice_end, ]
    X_train, X_test, y_train, y_test = compute_train_test_for_automl(
        df, training_slice_begin, training_slice_end,
        test_slice_begin, test_slice_end,
        target_column_name, time_column_name)
    return X_train, y_train
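# split_train_test_by_granularity and compute_train_test_for_automl are helper
# functions not shown in this snippet. Below is a minimal, hypothetical sketch
# of the splitter, assuming granularity is 'hourly' or 'daily' and that the
# last `horizon` periods are held out as the test slice; the real helper may
# differ.
import pandas as pd


def split_train_test_by_granularity(granularity, horizon, min_time, max_time):
    freq_by_granularity = {'hourly': 'H', 'daily': 'D'}  # assumed mapping
    frequency = freq_by_granularity.get(granularity)
    if frequency is None:
        return None
    one_period = pd.tseries.frequencies.to_offset(frequency)
    delta = one_period * horizon
    if min_time + delta >= max_time:
        return None  # not enough history to hold out `horizon` periods
    test_slice_begin = max_time - delta
    return (delta, frequency,
            min_time, test_slice_begin - one_period,  # training slice
            test_slice_begin, max_time)               # test slice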
def __init__(self, workspace, snapshot_name, dataset_id, definition_version=None,
             time_stamp=None, profile_action_id=None, datastore_name=None,
             relative_path=None, dataset_name=None):
    """Dataset snapshot is a combination of Profile and an optional materialized copy of the data.

    To learn more about Dataset snapshots, go to https://aka.ms/azureml/howto/createsnapshots

    :param workspace: The workspace the Dataset is registered in.
    :type workspace: azureml.core.Workspace
    :param snapshot_name: The name of the Dataset snapshot.
    :type snapshot_name: str
    :param dataset_id: The identifier of the Dataset.
    :type dataset_id: str
    :param definition_version: The definition version of the Dataset.
    :type definition_version: str
    :param time_stamp: The snapshot creation time.
    :type time_stamp: datetime
    :param profile_action_id: The snapshot profile action ID.
    :type profile_action_id: str
    :param datastore_name: The snapshot datastore name.
    :type datastore_name: str
    :param relative_path: The relative path to the snapshot data.
    :type relative_path: str
    :param dataset_name: The name of the Dataset.
    :type dataset_name: str
    """
    from azureml.core import Dataset

    self._workspace = workspace
    self._name = snapshot_name
    self._dataset_id = dataset_id
    self._definition_version = definition_version
    self._time_stamp = time_stamp
    self._profile_action_id = profile_action_id
    self._datastore_name = datastore_name
    self._relative_path = relative_path
    # This is a hack; we should either return the dataset name in the DTO
    # or remove the _dataset_name field.
    dataset = Dataset.get(workspace, id=dataset_id)
    self._dataset_name = dataset_name or dataset.name
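# A DatasetSnapshot is normally obtained through the Dataset API rather than
# constructed directly. A usage sketch, assuming the legacy create_snapshot/
# get_snapshot methods on azureml.core.Dataset and a dataset already
# registered under the (illustrative) name 'sales':
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
dataset = Dataset.get(ws, name='sales')
snapshot = dataset.create_snapshot(snapshot_name='sales_2020_01',
                                   create_data_snapshot=True)
same_snapshot = dataset.get_snapshot(snapshot_name='sales_2020_01')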
import os
import datetime
import shutil

from azureml.core import Workspace, Datastore, Dataset, Experiment, Run
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

run = Run.get_context()
workspace = run.experiment.workspace

dataset_name = 'clean_Titanic_tutorial'
dataset = Dataset.get(workspace=workspace, name=dataset_name)
df = dataset.to_pandas_dataframe()

x_col = ['Pclass', 'Sex', 'SibSp', 'Parch']
y_col = ['Survived']
x_df = df.loc[:, x_col]
y_df = df.loc[:, y_col]

x_train, x_test, y_train, y_test = train_test_split(x_df, y_df,
                                                    test_size=0.2,
                                                    random_state=223)
data = {"train": {"X": x_train, "y": y_train},
        "test": {"X": x_test, "y": y_test}}

clf = DecisionTreeClassifier().fit(data["train"]["X"], data["train"]["y"])
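# A short follow-up sketch: score the held-out split and record the metric on
# the run so it appears with the experiment (the metric name is illustrative).
accuracy = clf.score(data["test"]["X"], data["test"]["y"])
run.log('test_accuracy', accuracy)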
import azureml.core
from azureml.core import Workspace, Datastore, Dataset
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# ws, datastore_name, compute_target, file_prefix, granularity, horizon and
# datastore_folder are assumed to be defined earlier.
ds = Datastore.get(ws, datastore_name=datastore_name)
compute_target = ws.compute_targets[compute_target]
experiment_name = 'forecast_automl_' + file_prefix + '_' + granularity

# environment for get_data.py
time_column_name = 'dtime'
script_folder = './'  # where get_data.py lives relative to the current folder
script_env = {
    'FORECAST_FILE_PREFIX': file_prefix,
    'FORECAST_GRANULARITY': granularity,
    'FORECAST_HORIZON': str(horizon)  # environment variable values must be strings
}

# register the dataset so get_data.py can access it
try:
    dataset = Dataset.get(ws, file_prefix)
    print('using existing dataset:{0}'.format(file_prefix))
except Exception:
    data_file = datastore_folder + file_prefix + '_' + granularity + '.csv'
    dataset = Dataset.from_delimited_files(ds.path(data_file))
    dataset = dataset.register(ws, file_prefix)
    print('registered dataset:{0}'.format(file_prefix))

# set up the run configuration
run_config = RunConfiguration(framework="python")
run_config.target = compute_target
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
run_config.environment.environment_variables = script_env

dependencies = CondaDependencies.create(
    pip_packages=["scikit-learn", "scipy", "numpy"])
run_config.environment.python.conda_dependencies = dependencies
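# A sketch of one way to launch get_data.py remotely with this configuration,
# using ScriptRunConfig; the actual AutoML submission is outside this snippet.
from azureml.core import Experiment, ScriptRunConfig

src = ScriptRunConfig(source_directory=script_folder,
                      script='get_data.py',
                      run_config=run_config)
remote_run = Experiment(ws, experiment_name).submit(src)
remote_run.wait_for_completion(show_output=True)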
from azureml.core import Workspace, Datastore, Dataset
from azureml.core.experiment import Experiment
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep

ws = Workspace.from_config(path="./file-path/ws_config.json")
experiment = Experiment(workspace=ws, name='BrainStar')
def_blob_store = Datastore(ws, "workspaceblobstore")
compute_target = ws.compute_targets["BrainStar1"]

input_data = Dataset.get(ws, name="Absence data")
output_data1 = PipelineData("output_data1",
                            datastore=def_blob_store,
                            output_name="output_data1")

source_directory = './process'
step1 = PythonScriptStep(name="process_step",
                         script_name="process.py",
                         inputs=[input_data],
                         outputs=[output_data1],
                         compute_target=compute_target,
                         source_directory=source_directory,
                         allow_reuse=True)

steps = [step1]
pipeline1 = Pipeline(workspace=ws, steps=steps)
pipeline1.validate()
pipeline_run1 = Experiment(ws, 'Hello_World1').submit(pipeline1,
                                                      regenerate_outputs=False)
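# Follow-up: block until the pipeline run completes and stream its status.
pipeline_run1.wait_for_completion(show_output=True)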