Example #1
def fetch_df(step, output_name):
    # Fetch the named intermediate output of a pipeline step as a PortDataReference.
    output_data = step.get_output_data(output_name)

    # Download it locally, then read the 'data' file it contains into a Dataflow.
    download_path = './outputs/' + output_name
    output_data.download(download_path)
    df_path = get_download_path(download_path, output_name) + '/data'
    return dprep.auto_read_file(path=df_path)
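
# Hedged usage sketch for the helper above: the run id, step name, and output name
# are hypothetical placeholders, and `experiment` is assumed to be an existing
# azureml.core Experiment in scope.
from azureml.pipeline.core import PipelineRun

pipeline_run = PipelineRun(experiment, run_id='<run-id>')
prep_step = pipeline_run.find_step_run('prep_step')[0]
df = fetch_df(prep_step, 'processed_data')
print(df.head(5))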
Example #2
def load_azureml_df(local_cache_path=None, file_split=Split.TRAIN, file_type="txt"):
    """
    Loads the SNLI dataset as an AzureML Dataflow object.
    Downloads the dataset from "https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
    unzips it, and loads it.

    Args:
        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
            If None, all the intermediate files will be stored in a temporary directory and removed
            after use.
        file_split (str): File split to load. One of (dev, test, train)
        file_type (str): File type to load. One of (txt, jsonl)

    Returns:
        AzureML dataflow: SNLI dataset

    """
    with download_path(local_cache_path) as path:
        filepath = os.path.join(path, "snli_1.0.zip")
        snlipath = _maybe_download_and_extract(filepath, file_split, file_type)

        # NOTE: this works for the txt format but not the jsonl format
        df = dprep.auto_read_file(snlipath)

    return df
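
# A hedged usage sketch for the function above: load the SNLI training split with
# defaults and preview it (assumes this module's Split enum and dprep import;
# Dataflow.head() returns a pandas DataFrame).
snli_dflow = load_azureml_df(file_split=Split.TRAIN, file_type="txt")
print(snli_dflow.head(5))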
Example #3
import azureml.dataprep as dprep
from sklearn.model_selection import train_test_split


def prepareDataForMLTraining(testSize, dataset_root, shouldShuffle):
    # dataset_root path must be set to where YOU are storing the .csv file.

    # Creates a Dataflow object from the .csv file.
    dataFlow = dprep.auto_read_file(dataset_root, False)

    # Defines the input (X) columns and the output-to-predict (Y) column from the .csv file.
    dataflow_X = dataFlow.keep_columns([
        'Soil_Name', 'MEAN_Eleva', 'V.A.T(F)', 'R.A.T(F)', 'M.A.T(F)',
        'V.PET(inch)', 'R.PET(inch)', 'M.PET(inch)', 'V.T.R(inch)',
        'R.T.R(inch)'
    ])
    dataflow_Y = dataFlow.keep_columns('NormalizedYield')

    # Converts the data into pandas DataFrames to make splitting into train/test sets easier.
    x_dataFlow_Pandas = dataflow_X.to_pandas_dataframe()
    y_dataFlow_Pandas = dataflow_Y.to_pandas_dataframe()

    # Splits the data into a train portion and a test portion.
    # testSize should be a float between 0 and 1; the train size is 1 - testSize.
    # shouldShuffle determines whether the data is shuffled before the split.

    X_toTrain, X_toTest, Y_toTrain, Y_toTest = train_test_split(
        x_dataFlow_Pandas,
        y_dataFlow_Pandas,
        test_size=testSize,
        random_state=173,
        shuffle=shouldShuffle)

    return (X_toTrain, X_toTest, Y_toTrain, Y_toTest)
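
# Hedged usage sketch for the function above; the .csv path is a hypothetical
# placeholder for wherever the yield data is stored.
X_train, X_test, Y_train, Y_test = prepareDataForMLTraining(
    testSize=0.2, dataset_root='./data/yield.csv', shouldShuffle=True)
print(X_train.shape, X_test.shape)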
Example #4
    def test_fit_transform(self):
        import azureml.dataprep as dprep

        path = get_dataset('infert').as_filepath()
        dflow = dprep.auto_read_file(path=path)
        dprep_data = DprepDataStream(dflow)
        file_data = FileDataStream.read_csv(path)

        xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})
        pipe = Pipeline([xf])
        transformed_data = pipe.fit_transform(file_data)
        transformed_data1 = pipe.fit_transform(dprep_data)

        assert_array_equal(transformed_data.columns, transformed_data1.columns)
        assert_2d_array_equal(transformed_data.values,
                              transformed_data1.values)
Example #5
import os

import azureml.dataprep as dprep
from azureml.core import Experiment, Webservice, Workspace
from azureml.core.runconfig import RunConfiguration
from azureml.core.webservice import AciWebservice

# set up the local environment to handle missing packages
run_user_managed = RunConfiguration()
run_user_managed.environment.python.user_managed_dependencies = False

# Create a workspace object from an existing workspace and create an experiment
ws = Workspace.from_config('subscription.json')
print(ws.name, ws.location, ws.resource_group, sep='\t')
experiment = Experiment(workspace=ws, name='experiment1')

# full paths to the training and testing data
file_path1 = os.path.join(os.getcwd(), "cumodelwo2014.csv")
dflowtr = dprep.auto_read_file(path=file_path1)
file_path2 = os.path.join(os.getcwd(), "test2014.csv")
dflowte = dprep.auto_read_file(path=file_path2)

# Specifying x(causal) and y(response) attributes in training data
dflowtr_x = dflowtr.keep_columns([
    'cell-ID', 'Soil_Name', 'MEAN_Yld_V', 'COUNT_Yld', 'MEAN_Eleva',
    'RANGE_Elev', 'Crop-Type', 'V.A.T(F)', 'R.A.T(F)', 'M.A.T(F)',
    'V.PET(inch)', 'R.PET(inch)', 'M.PET(inch)', 'V.T.R(inch)', 'R.T.R(inch)',
    'M.T.R(inch)'
])
dflowtr_y = dflowtr.keep_columns('NormalizedYield')
# causal factors in training dataframe
trainingx_df = dflowtr_x.to_pandas_dataframe()
# response variable in training dataframe
trainingy_df = dflowtr_y.to_pandas_dataframe()
Example #6
# %% [markdown]
# ***
# ## Part 2 - Data Ingestion

# %%
import pandas as pd

import azureml.dataprep as dprep

# %%
# Load the train and test files as pandas DataFrames
df_train = pd.read_csv('./kaggle/input/train.csv')
df_train.shape
# %%
df_train.info()

# %%
df_test = pd.read_csv('./kaggle/input/test.csv')
df_test.shape

# %%
dataflow = dprep.auto_read_file('./kaggle/input/train.csv')
dataflow.head(10)

# %%
dataflow_test = dprep.auto_read_file('./kaggle/input/test.csv')
dataflow_test.head(10)

# %% [markdown]
# ***
# ## Part 3 - Feature Engineering I
#
# Feature engineering necessary to support the exploratory data analysis (EDA) below.
#
# In this section a range of feature-engineering actions are performed using Microsoft's Azure ML Data Prep SDK for Python (a minimal sketch of the first two steps follows the list):
# 1. Append the test data set to the training data set so that feature engineering can be applied to both collectively;
# 2. Fill in empty values in "Cabin" and "Fare" where simple rules can be applied;
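
# %% [markdown]
# A minimal sketch of steps 1 and 2 above, assuming the Dataflow `append_rows` and
# `fill_nulls` operations are available in the installed azureml-dataprep version;
# the fill values are illustrative placeholders, not the rules applied later in
# this notebook.

# %%
combined = dataflow.append_rows([dataflow_test])    # 1. append test rows to the training Dataflow
combined = combined.fill_nulls('Cabin', 'Unknown')  # 2. simple placeholder for missing Cabin
combined = combined.fill_nulls('Fare', 0.0)         #    and for missing Fare
combined.head(10)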
Example #7
import logging
import os

import azureml.core
import pandas as pd

import azureml.dataprep as dprep
from azureml.core import Experiment, Workspace
from azureml.core.runconfig import RunConfiguration
from azureml.train.automl import AutoMLConfig
from sklearn.model_selection import train_test_split


def main():
    
    # local compute
    run_user_managed = RunConfiguration()
    run_user_managed.environment.python.user_managed_dependencies = False

    # print to check azure sdk installation
    print(azureml.core.VERSION)

    # create workspace object to connect to omtest workspace in MLSERVICE
    ws = Workspace.from_config('./config.json')
    # default data store
    # ds = ws.get_default_datastore()
    # print(ds)

    # choose a name for the run history container in the workspace
    experiment_name = 'automated-ml-regression'
    # project folder
    project_folder = './automated-ml-regression'

    output = {}
    output['SDK version'] = azureml.core.VERSION
    output['Subscription ID'] = ws.subscription_id
    output['Workspace'] = ws.name
    output['Resource Group'] = ws.resource_group
    output['Location'] = ws.location
    output['Project Directory'] = project_folder
    pd.set_option('display.max_colwidth', -1)
    pd.DataFrame(data=output, index=['']).T

    # stats for all the columns
    dflow = dprep.auto_read_file(path='/Users/omprakashnekkanti/Desktop/Spring 2019/CS445-Capstone/automatedML/cuformodel.csv')
    print(type(dflow))
    dflow.get_profile()

    # filepath as a string
    file_path = os.path.join(os.getcwd(), 'cuformodel.csv')
    print(file_path)
    print(type(file_path))

    # dflow_prepared = dprep.Dataflow.open(file_path)
    # dflow_prepared.get_profile()

    dflow_X = dflow.keep_columns([
        'cell-ID', 'Soil_Name', 'MEAN_Yld_V', 'COUNT_Yld', 'MEAN_Eleva',
        'RANGE_Elev', 'Crop-Type', 'V.A.T(F)', 'R.A.T(F)', 'M.A.T(F)',
        'V.PET(inch)', 'R.PET(inch)', 'M.PET(inch)', 'V.T.R(inch)', 'R.T.R(inch)',
        'M.T.R(inch)'
    ])
    dflow_y = dflow.keep_columns('NormalizedYield')

    x_df = dflow_X.to_pandas_dataframe()
    y_df = dflow_y.to_pandas_dataframe()

    x_train, x_test, y_train, y_test = train_test_split(
        x_df, y_df, test_size=0.2, random_state=223)
    # y_train is flattened to a 1d array when it is passed to AutoMLConfig below

    automl_settings = {
        "iteration_timeout_minutes": 20,
        "iterations": 40,
        "primary_metric": 'mean_absolute_error',
        "preprocess": False,
        "verbosity": logging.INFO,
        "n_cross_validations": 10
    }

    # local compute
    automated_ml_config = AutoMLConfig(
        task='regression',
        debug_log='automated_ml_errors.log',
        path=project_folder,
        X=x_train.values,
        y=y_train.values.flatten(),
        **automl_settings)
    experiment = Experiment(ws, experiment_name)
    local_run = experiment.submit(automated_ml_config, show_output=True)
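
    # After the submitted run completes, the best run and fitted model can be
    # retrieved; a minimal sketch using AutoMLRun.get_output(). The predict call
    # assumes x_test has the same columns used for training.
    best_run, fitted_model = local_run.get_output()
    y_pred = fitted_model.predict(x_test.values)
    print(best_run.id, y_pred[:5])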
Example #8
    def to_dataflow(self) -> 'dprep.Dataflow':
        import azureml.dataprep as dprep
        return dprep.auto_read_file(self.get_url())
Example #9
# COMMAND ----------

import azureml.dataprep as dprep

# COMMAND ----------

dataset_root = "https://dprepdata.blob.core.windows.net/demo"

green_path = "/".join([dataset_root, "green-small/*"])
yellow_path = "/".join([dataset_root, "yellow-small/*"])

green_df = dprep.read_csv(path=green_path,
                          header=dprep.PromoteHeadersMode.GROUPED)
# auto_read_file will automatically identify and parse the file type, and is useful if you don't know the file type
yellow_df = dprep.auto_read_file(path=yellow_path)

green_df.head(5)
yellow_df.head(5)

# COMMAND ----------

all_columns = dprep.ColumnSelector(term=".*", use_regex=True)
drop_if_all_null = [
    all_columns,
    dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)
]
useful_columns = [
    "cost", "distance", "dropoff_datetime", "dropoff_latitude",
    "dropoff_longitude", "passengers", "pickup_datetime", "pickup_latitude",
    "pickup_longitude", "store_forward", "vendor"
parser.add_argument("--d_ts_2", type=str, help="d_ts_2")
parser.add_argument("--output", type=str, help="output")
args = parser.parse_args()

# parse inputs
h_ts_1_dr = args.h_ts_1
h_ts_2_dr = args.h_ts_2
h_ts_3_dr = args.h_ts_3
d_ts_1_dr = args.d_ts_1
d_ts_2_dr = args.d_ts_2

# =======================
# azureml-dataprep-sdk.py
# =======================

h_ts_1_dflow = dprep.auto_read_file(h_ts_1_dr)
h_ts_2_dflow = dprep.auto_read_file(h_ts_2_dr)
h_ts_3_dflow = dprep.auto_read_file(h_ts_3_dr)
d_ts_1_dflow = dprep.auto_read_file(d_ts_1_dr)
d_ts_2_dflow = dprep.auto_read_file(d_ts_2_dr)

# Pivot data
h_ts_1_pivot_dflow = h_ts_1_dflow.pivot(
    ['NODE_ID'], 'MW',
    azureml.dataprep.api.engineapi.typedefinitions.SummaryFunction.MAX,
    ['MYDATE', 'HOUR'])
h_ts_2_pivot_dflow = h_ts_2_dflow.pivot(
    ['NODE_ID'], 'MW',
    azureml.dataprep.api.engineapi.typedefinitions.SummaryFunction.MAX,
    ['MYDATE', 'HOUR'])