Example #1
def prepare_dataflows(csv_file_path, label_column='duration_minutes'):
    import azureml.dataprep as dprep

    dataflow_schema = {
        'taxi_type': dprep.FieldType.STRING,
        'store_and_fwd_flag': dprep.FieldType.BOOLEAN,
        'passenger_count': dprep.FieldType.INTEGER,
        'trip_distance': dprep.FieldType.DECIMAL,
        'vendor_abbreviation': dprep.FieldType.STRING,
        'rate_code_description': dprep.FieldType.STRING,
        'pickup_borough': dprep.FieldType.STRING,
        'pickup_zone': dprep.FieldType.STRING,
        'pickup_service_zone': dprep.FieldType.STRING,
        'dropoff_borough': dprep.FieldType.STRING,
        'dropoff_zone': dprep.FieldType.STRING,
        'dropoff_service_zone': dprep.FieldType.STRING,
        'pickup_year': dprep.FieldType.INTEGER,
        'pickup_month': dprep.FieldType.INTEGER,
        'pickup_day': dprep.FieldType.INTEGER,
        'pickup_hour': dprep.FieldType.INTEGER,
        'is_rush_hour_flag': dprep.FieldType.BOOLEAN,
        'is_weekend_flag': dprep.FieldType.BOOLEAN,
        'duration_minutes': dprep.FieldType.DECIMAL,
    }

    dataflow = dprep.read_csv(csv_file_path)
    dataflow = dataflow.set_column_types(dataflow_schema)

    return (dataflow.keep_columns([label_column]),
            dataflow.drop_columns([label_column]))
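A minimal usage sketch for the helper above; the CSV path and variable names are illustrative and not part of the original sample:

# Hypothetical call: the first return value keeps only the label column,
# the second drops it, matching the keep_columns/drop_columns calls above.
y_dataflow, X_dataflow = prepare_dataflows('nyc_taxi_prepared.csv')
y_df = y_dataflow.to_pandas_dataframe()
X_df = X_dataflow.to_pandas_dataframe()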
Example #2
    def test_dprep_datastream(self):
        import azureml.dataprep as dprep

        dates = ["2018-01-02 00:00:00", "2018-02-01 10:00:00"]
        col2 = ['0', '1']
        label_array = np.repeat([0], 2)
        train_df = pd.DataFrame({
            'col1': dates,
            'col2': col2,
            'label': label_array
        })

        pipeline = Pipeline(steps=[
            Handler(columns={'2': 'col2'},
                    concat=False,
                    impute_by_slot=True,
                    replace_with='Mean')
        ])

        file_name = get_temp_file('.csv')
        train_df.to_csv(file_name)

        dataflow = dprep.read_csv(file_name, infer_column_types=True)
        dprepDataStream = DprepDataStream(dataflow)

        result = pipeline.fit_transform(dprepDataStream)

        self.assertEqual(result.loc[:, 'col1'].dtype,
                         np.dtype('datetime64[ns]'))

        self.assertEqual(result.loc[0, 'col1'].year, 2018)
        self.assertEqual(result.loc[0, 'col1'].month, 1)
        self.assertEqual(result.loc[0, 'col1'].day, 2)
        self.assertEqual(result.loc[0, 'col1'].hour, 0)
        self.assertEqual(result.loc[0, 'col1'].minute, 0)
        self.assertEqual(result.loc[0, 'col1'].second, 0)

        self.assertEqual(result.loc[1, 'col1'].year, 2018)
        self.assertEqual(result.loc[1, 'col1'].month, 2)
        self.assertEqual(result.loc[1, 'col1'].day, 1)
        self.assertEqual(result.loc[1, 'col1'].hour, 10)
        self.assertEqual(result.loc[1, 'col1'].minute, 0)
        self.assertEqual(result.loc[1, 'col1'].second, 0)

        os.remove(file_name)
Example #3
def dataFlowProcessingLoop(previousStageNumber, thisStageNumber, qualityFlag,
                           operatorToUse, functionToCall, **kwargs):

    # Load the dataFlow controller file
    dataFlows = dprep.read_csv('dataFlowController.csv').to_pandas_dataframe()

    # Set up empty dataframes that we will use to build up inventories at both dataFlow and column level
    dataFlowInventoryAll = pd.DataFrame()
    columnInventoryAll = pd.DataFrame()

    for index, row in dataFlows.iterrows():

        dataName = row["DataName"]
        operationFlag = row[operatorToUse]

        newDataFlow, columnInventory, dataFlowInventory = functionToCall(
            dataName, previousStageNumber, thisStageNumber, qualityFlag,
            operatorToUse, operationFlag, **kwargs)

        if newDataFlow:

            # Capture the column inventory for the new dataflow
            columnInventoryAll = columnInventoryAll.append(columnInventory)
            print('{0}: appended {1} rows to column inventory'.format(
                dataName, len(columnInventory)))

            # Capture the data flow inventory for the new data flow
            dataFlowInventoryAll = dataFlowInventoryAll.append(
                dataFlowInventory)
            print('{0}: appended {1} rows to data flow inventory'.format(
                dataName, len(dataFlowInventory)))

    # Once we have processed all dataflows, we save the inventories away
    saveColumnInventory(columnInventoryAll, thisStageNumber)
    saveDataFlowInventory(dataFlowInventoryAll, thisStageNumber)

    return dataFlowInventoryAll
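A hedged example of how this loop might be driven; the stage numbers, quality flag, flag column name, and the removeDuplicates callback are assumptions for illustration only:

# Hypothetical invocation: process every dataflow listed in dataFlowController.csv,
# letting the per-row 'RemoveDuplicatesFlag' column steer the removeDuplicates step.
dataFlowInventory = dataFlowProcessingLoop('00', '10', 'A',
                                           'RemoveDuplicatesFlag',
                                           removeDuplicates)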
Example #4
    packageToSave = packageToSave.save(fullPackagePath)
    return fullPackagePath

# An open package helper function
def openPackage(packageName, stage, qualityFlag):
    fullPackagePath = createFullPackagePath(packageName, stage, qualityFlag)
    packageToOpen = Package.open(fullPackagePath)
    dataFlow = packageToOpen[packageName]
    return dataFlow

#%% [markdown]
# ## Prepare for ingestion...

#%%
# Load in file names to be processed from the config.csv file
dataFiles = dprep.read_csv('dataFiles.csv').to_pandas_dataframe()

# Create a fully qualified path to the data files and append this to the dataFiles data frame
fullFilePaths = dataPath + '/' + dataFiles.FileName
fullFilePaths.name = "FullFilePath"
dataFiles = pd.concat([dataFiles, fullFilePaths], axis=1)

# Now grab the number of header fields in the first row of each file
headerCount = []
for index, row in dataFiles.iterrows():
    with open(row["FullFilePath"]) as headerFile:
        firstRow = headerFile.readline().strip()
    regexPattern = re.compile(r',\w')
    patternCount = len(re.findall(regexPattern, firstRow))
    headerCount.append(patternCount + 1)
columnCount = pd.DataFrame({'ColumnCount':headerCount})
dataFiles = pd.concat([dataFiles, columnCount], axis=1)
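As a quick sanity check of the column-count heuristic above (the header string is illustrative):

# re.findall(r',\w', 'id,name,value') returns [',n', ',v'],
# so patternCount + 1 == 3 columns for that header row.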
Example #5
parser = argparse.ArgumentParser("merge")
parser.add_argument("--input_green_merge",
                    type=str,
                    help="cleaned green taxi data directory")
parser.add_argument("--input_yellow_merge",
                    type=str,
                    help="cleaned yellow taxi data directory")
parser.add_argument("--output_merge",
                    type=str,
                    help="green and yellow taxi data merged")

args = parser.parse_args()

print("Argument 1(input green taxi data path): %s" % args.input_green_merge)
print("Argument 2(input yellow taxi data path): %s" % args.input_yellow_merge)
print("Argument 3(output merge taxi data path): %s" % args.output_merge)

green_df = dprep.read_csv(args.input_green_merge + '/part-*')
yellow_df = dprep.read_csv(args.input_yellow_merge + '/part-*')

# Appending yellow data to green data
combined_df = green_df.append_rows([yellow_df])

if not (args.output_merge is None):
    os.makedirs(args.output_merge, exist_ok=True)
    print("%s created" % args.output_merge)
    write_df = combined_df.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_merge))
    write_df.run_local()
                    help="cleaned taxi data directory")
parser.add_argument("--useful_columns",
                    type=str,
                    help="useful columns to keep")
parser.add_argument("--columns", type=str, help="rename column pattern")

args = parser.parse_args()

print("Argument 1(input taxi data path): %s" % args.input_cleanse)
print("Argument 2(columns to keep): %s" %
      str(args.useful_columns.strip("[]").split(r"\;")))
print("Argument 3(columns renaming mapping): %s" %
      str(args.columns.strip("{}").split(r"\;")))
print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse)

raw_df = dprep.read_csv(path=args.input_cleanse,
                        header=dprep.PromoteHeadersMode.GROUPED)

# These functions ensure that null data is removed from the data set,
# which will help increase machine learning model accuracy.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep
# for more details

useful_columns = [
    s.strip().strip("'") for s in args.useful_columns.strip("[]").split(r"\;")
]
columns = get_dict(args.columns)

all_columns = dprep.ColumnSelector(term=".*", use_regex=True)
drop_if_all_null = [
    all_columns,
    dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)
Example #7
import os
import re
import collections

import pandas as pd

import azureml.dataprep as dprep
from azureml.dataprep import value
from azureml.dataprep import col
from azureml.dataprep import Dataflow
from commonInventoryCreation import getColumnStats, getDataFlowStats, saveColumnInventory, saveDataFlowInventory
from commonPackageHandling import saveDataFlowPackage

# Let's also set up global variables...
previousStageNumber = '00'
thisStageNumber = '10'

#%%
# Load in file names to be processed from the data file inventory
dataFileStats = dprep.read_csv('dataFileInventory_' + previousStageNumber +
                               '.csv').to_pandas_dataframe()

#%%
# First, a quick pass through each file to grab the header row and count its columns
# NOTE - this loop could be improved so that it duplicates less of the logic already captured in the dataFileStats dataframe above
headerCount = []
for index, row in dataFileStats.iterrows():
    with open(row["FullFilePath"]) as headerFile:
        firstRow = headerFile.readline().strip()
    regexPattern = re.compile(r',\w')
    patternCount = len(re.findall(regexPattern, firstRow))
    headerCount.append(patternCount + 1)
    print(firstRow)
    print(patternCount)
headerCountCol = pd.DataFrame({'HeaderCount': headerCount})
dataFileStats = pd.concat([dataFileStats, headerCountCol], axis=1)
Example #8
args = parser.parse_args()

print("Argument 1(input taxi data features path): %s" %
      args.input_split_features)
print("Argument 2(input taxi data labels path): %s" % args.input_split_labels)
print("Argument 3(output training features split path): %s" %
      args.output_split_train_x)
print("Argument 4(output training labels split path): %s" %
      args.output_split_train_y)
print("Argument 5(output test features split path): %s" %
      args.output_split_test_x)
print("Argument 6(output test labels split path): %s" %
      args.output_split_test_y)

x_df = dprep.read_csv(
    path=args.input_split_features,
    header=dprep.PromoteHeadersMode.GROUPED).to_pandas_dataframe()
y_df = dprep.read_csv(
    path=args.input_split_labels,
    header=dprep.PromoteHeadersMode.GROUPED).to_pandas_dataframe()

# These functions split the input features and labels into test and train data
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail

x_train, x_test, y_train, y_test = train_test_split(x_df,
                                                    y_df,
                                                    test_size=0.2,
                                                    random_state=223)

if not (args.output_split_train_x is None and args.output_split_test_x is None
        and args.output_split_train_y is None
Example #9
    compute_target.wait_for_completion(show_output=True,
                                       min_node_count=1,
                                       timeout_in_minutes=20)

    print(compute_target.status.serialize())
#%%
os.makedirs('./data', exist_ok=True)

# INSERT DATA SOURCE HERE
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)
# ds.upload(src_dir='./data', target_path='AssetData', overwrite=True, show_progress=True)

#%%
asset_data_path = 'data/AssetData_Historical.csv'
asset_data_df = dprep.read_csv(path=asset_data_path,
                               header=dprep.PromoteHeadersMode.GROUPED)
display(asset_data_df.head(5))
#%%
dprep_path = os.path.join(os.getcwd(), 'dflows.dprep')
dflow_prepared = asset_data_df
package = dprep.Package([dflow_prepared])
package.save(dprep_path)
#%%
package_saved = dprep.Package.open(dprep_path)
dflow_prepared = package_saved.dataflows[0]
dflow_prepared.get_profile()

#%%
int_type = dprep.TypeConverter(dprep.FieldType.INTEGER)
dflow_prepared = dflow_prepared.set_column_types(
    type_conversions={'Failure_NextHour': int_type})
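A hedged sketch of a possible next step: materialise the prepared dataflow and separate the features from the label column converted above (the pandas calls are assumptions, not part of this excerpt):

df = dflow_prepared.to_pandas_dataframe()
y_df = df[['Failure_NextHour']]
x_df = df.drop(columns=['Failure_NextHour'])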
Example #10
training_sdf = trainingSDF
training_sdf = training_sdf.drop("Idx", "initialDebt")

training_sdf \
.drop("SeriousDlqin2yrs") \
.toPandas() \
.to_csv("/dbfs/FileStore/tables/constant-scoring-training-vars.csv")

training_sdf \
.select("SeriousDlqin2yrs") \
.toPandas() \
.to_csv("/dbfs/FileStore/tables/constant-scoring-training-res.csv")

X_train = dataprep.read_csv(
    path="/dbfs/FileStore/tables/constant-scoring-training-vars.csv",
    separator=',')
X_train = X_train.drop_columns("Column1")

Y_train = dataprep.read_csv(
    path="/dbfs/FileStore/tables/constant-scoring-training-res.csv",
    separator=',')
Y_train = Y_train.drop_columns("Column1")

# COMMAND ----------

# MAGIC %md
# MAGIC Checking to make sure we have data inside.

# COMMAND ----------
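The check cell itself is not shown in this excerpt; a minimal sketch of what it might contain, reusing the head() call seen elsewhere in these examples:

# Peek at the first rows of each dataflow to confirm the CSVs loaded.
X_train.head(5)
Y_train.head(5)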
Example #11
def createUPMDataflow(dataName, previousStageNumber, thisStageNumber,
                      qualityFlag, operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':

            mappingConfig = dprep.read_csv(
                './Config/' + operationFlag).to_pandas_dataframe()

            targetDataFlow = dataFlow
            columnsToKeep = ''

            for sourceTable in mappingConfig[mappingConfig.SourceTable ==
                                             dataName]['SourceTable'].unique():
                for sourceColumn, targetColumn in mappingConfig[
                        mappingConfig.SourceTable == sourceTable][[
                            'SourceColumn', 'TargetColumn'
                        ]].values:
                    if columnsToKeep == '':
                        columnsToKeep = targetColumn
                    else:
                        columnsToKeep = columnsToKeep + '|' + targetColumn

                    targetDataFlow = targetDataFlow.rename_columns(
                        {sourceColumn: targetColumn})

            targetDataFlow = targetDataFlow.drop_columns(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=True))
            newPackageName = next(
                iter(mappingConfig[mappingConfig.SourceTable == dataName]
                     ['TargetTable'].unique()))

            createNewPackageDirectory(newPackageName)
            saveDataFlowPackage(targetDataFlow, newPackageName,
                                thisStageNumber, 'A')

        else:
            print('{0}: no duplicate processing required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
Example #12
    "and define the minimum and maximum bounds for each field.")

parser = argparse.ArgumentParser("filter")
parser.add_argument("--input_filter",
                    type=str,
                    help="merged taxi data directory")
parser.add_argument("--output_filter",
                    type=str,
                    help="filter out out of city locations")

args = parser.parse_args()

print("Argument 1(input taxi data path): %s" % args.input_filter)
print("Argument 2(output filtered taxi data path): %s" % args.output_filter)

combined_df = dprep.read_csv(args.input_filter + '/part-*')

# These functions filter out coordinates for locations that are outside the city border.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details

# Create a condensed view of the dataflow to just show the lat/long fields,
# which makes it easier to evaluate missing or out-of-scope coordinates
decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL)
combined_df = combined_df.set_column_types(
    type_conversions={
        "pickup_longitude": decimal_type,
        "pickup_latitude": decimal_type,
        "dropoff_longitude": decimal_type,
        "dropoff_latitude": decimal_type
    })
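The filtering step itself is cut off in this excerpt; a hedged sketch of the pattern the comments above describe, using azureml.dataprep's drop_nulls/filter expressions, with latitude/longitude bounds that are only illustrative approximations of the city border:

latlong_filtered_df = (combined_df
    .drop_nulls(
        columns=["pickup_longitude", "pickup_latitude",
                 "dropoff_longitude", "dropoff_latitude"],
        column_relationship=dprep.ColumnRelationship(dprep.ColumnRelationship.ANY))
    .filter(dprep.f_and(
        dprep.col("pickup_longitude") <= -73.72,
        dprep.col("pickup_longitude") >= -74.09,
        dprep.col("pickup_latitude") <= 40.88,
        dprep.col("pickup_latitude") >= 40.53)))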
Example #13
    # Wait for the cluster to complete, show the output log
    cpu_cluster.wait_for_completion(show_output=True)

# COMMAND ----------

import azureml.dataprep as dprep

# COMMAND ----------

dataset_root = "https://dprepdata.blob.core.windows.net/demo"

green_path = "/".join([dataset_root, "green-small/*"])
yellow_path = "/".join([dataset_root, "yellow-small/*"])

green_df = dprep.read_csv(path=green_path,
                          header=dprep.PromoteHeadersMode.GROUPED)
# auto_read_file will automatically identify and parse the file type, and is useful if you don't know the file type
yellow_df = dprep.auto_read_file(path=yellow_path)

green_df.head(5)
yellow_df.head(5)

# COMMAND ----------

all_columns = dprep.ColumnSelector(term=".*", use_regex=True)
drop_if_all_null = [
    all_columns,
    dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)
]
useful_columns = [
    "cost", "distance", "dropoff_datetime", "dropoff_latitude",
print("Transforms the renamed taxi data to the required format")

parser = argparse.ArgumentParser("transform")
parser.add_argument("--input_transform", type=str, help="renamed taxi data")
parser.add_argument("--output_transform",
                    type=str,
                    help="transformed taxi data")

args = parser.parse_args()

print("Argument 1(input taxi data path): %s" % args.input_transform)
print("Argument 2(output final transformed taxi data): %s" %
      args.output_transform)

renamed_df = dprep.read_csv(args.input_transform + '/part-*')

# These functions transform the renamed data to be used finally for training.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details

# Split the pickup and dropoff date further into the day of the week, day of the month, and month values.
# To get the day of the week value, use the derive_column_by_example() function.
# The function takes an array parameter of example objects that define the input data,
# and the preferred output. The function automatically determines your preferred transformation.
# For the pickup and dropoff time columns, split the time into the hour, minute, and second by using
# the split_column_by_example() function with no example parameter. After you generate the new features,
# use the drop_columns() function to delete the original fields as the newly generated features are preferred.
# Rename the rest of the fields to use meaningful descriptions.

transformed_features_df = (renamed_df.derive_column_by_example(
    source_columns="pickup_date",
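The chained call above is truncated in this excerpt; a hedged sketch of the full pattern the comment describes, with example dates, weekday labels, and column names that are purely illustrative:

transformed_df = (renamed_df
    .derive_column_by_example(
        source_columns="pickup_date",
        new_column_name="pickup_weekday",
        example_data=[("2009-01-04", "Sunday"), ("2013-08-22", "Thursday")])
    .split_column_by_example(source_column="pickup_time")
    .drop_columns(columns=["pickup_date", "pickup_time"]))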
Example #15
                    type=str,
                    help="input featurization")
parser.add_argument("--useful_columns", type=str, help="columns to use")
parser.add_argument("--output_featurization",
                    type=str,
                    help="output featurization")

args = parser.parse_args()

print("Argument 1(input training data path): %s" % args.input_featurization)
print("Argument 2(column features to use): %s" %
      str(args.useful_columns.strip("[]").split(r"\;")))
print("Argument 3:(output featurized training data path) %s" %
      args.output_featurization)

dflow_prepared = dprep.read_csv(args.input_featurization + '/part-*')

# These functions extract useful features for training
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail

useful_columns = [
    s.strip().strip("'") for s in args.useful_columns.strip("[]").split(r"\;")
]
dflow = dflow_prepared.keep_columns(useful_columns)

if not (args.output_featurization is None):
    os.makedirs(args.output_featurization, exist_ok=True)
    print("%s created" % args.output_featurization)
    write_df = dflow.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_featurization))
    write_df.run_local()
Example #16
print("Prepare data for training")

parser = argparse.ArgumentParser("prep_data")
parser.add_argument("--input_file", type=str, help="input raw data file")
parser.add_argument("--output_path", type=str, help="output prepped data path")

args, unknown = parser.parse_known_args()
if (unknown):
    print("Unknown args:")
    print(unknown)

print("Argument 1 (input training data file): %s" % args.input_file)
print("Argument 2 (output prepped training data path) %s" % args.output_path)

input_file = dprep.read_csv(args.input_file)

prepped_data = (
    input_file.drop_columns(
        columns='skin'
    )  # skin is same as thickness with another unit (inches/cm)
    .replace(columns='diabetes', find="TRUE", replace_with="1").replace(
        columns='diabetes', find="FALSE", replace_with="0").set_column_types(
            type_conversions={
                'diabetes': dprep.TypeConverter(
                    data_type=dprep.FieldType.INTEGER)
            }))

if not (args.output_path is None):
    os.makedirs(args.output_path, exist_ok=True)
    print("%s created" % args.output_path)
Example #17
args, unknown = parser.parse_known_args()
if (unknown):
    print("Unknown args:")
    print(unknown)

print("Argument 1 (input prepared data): %s" % args.input_prepared_data)
print("Argument 2 (output training features split path): %s" %
      args.output_split_train_x)
print("Argument 3 (output training labels split path): %s" %
      args.output_split_train_y)
print("Argument 4 (output test features split path): %s" %
      args.output_split_test_x)
print("Argument 5 (output test labels split path): %s" %
      args.output_split_test_y)

input_data = dprep.read_csv(args.input_prepared_data)

feature_names = [
    'num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi',
    'diab_pred', 'age'
]
label_names = ['diabetes']

print("Features:")
print(feature_names)
print("Labels:")
print(label_names)

x_df = input_data.keep_columns(feature_names).to_pandas_dataframe()
y_df = input_data.keep_columns(label_names).to_pandas_dataframe()
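A hedged continuation: the train/test split that the argument names above imply, mirroring the train_test_split call shown in the earlier split step (the test size and random seed are illustrative):

x_train, x_test, y_train, y_test = train_test_split(x_df,
                                                    y_df,
                                                    test_size=0.2,
                                                    random_state=223)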
Example #18
parser = argparse.ArgumentParser("split")
parser.add_argument("--input_data_frame", type=str, help="input data frame")
parser.add_argument("--output_train_frame",
                    type=str,
                    help="output train frame")
parser.add_argument("--output_val_frame",
                    type=str,
                    help="output validation frame")

args = parser.parse_args()

print("Argument 1(input data frame path): {}".format(args.input_data_frame))
print("Argument 2(output training frame path): {}".format(
    args.output_train_frame))
print("Argument 3(output validation frame path): {}".format(
    args.output_val_frame))

input_df = dprep.read_csv(
    path=args.input_data_frame,
    header=dprep.PromoteHeadersMode.SAMEALLFILES).to_pandas_dataframe()

idx = int(input_df.shape[0] * 0.8)  # Index where the first 80% of rows ends

input_df = input_df[['date', 'value']]

train_df = input_df[:idx]
val_df = input_df[idx:]

if not (args.output_train_frame is None and args.output_val_frame is None):
    write_output(train_df, args.output_train_frame)
    write_output(val_df, args.output_val_frame)
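write_output is used above but not defined in this excerpt; a hedged sketch of what such a helper might look like, consistent with the makedirs/to_csv pattern used elsewhere in this document (the output file name is an assumption):

def write_output(df, path):
    # Hypothetical helper: create the output directory and write the frame as CSV.
    os.makedirs(path, exist_ok=True)
    print("%s created" % path)
    df.to_csv(os.path.join(path, "part-00000.csv"), index=False)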
Example #19
parser = argparse.ArgumentParser("normalize")
parser.add_argument("--input_normalize",
                    type=str,
                    help="combined and converted taxi data")
parser.add_argument("--output_normalize",
                    type=str,
                    help="replaced undefined values and renamed columns")

args = parser.parse_args()

print("Argument 1(input taxi data path): %s" % args.input_normalize)
print("Argument 2(output normalized taxi data path): %s" %
      args.output_normalize)

combined_converted_df = dprep.read_csv(args.input_normalize + '/part-*')

# These functions replace undefined values and rename columns to use meaningful names.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details

replaced_stfor_vals_df = combined_converted_df.replace(
    columns="store_forward", find="0",
    replace_with="N").fill_nulls("store_forward", "N")

replaced_distance_vals_df = replaced_stfor_vals_df.replace(
    columns="distance", find=".00", replace_with=0).fill_nulls("distance", 0)

replaced_distance_vals_df = replaced_distance_vals_df.to_number(["distance"])

time_split_df = (replaced_distance_vals_df.split_column_by_example(
    source_column="pickup_datetime").split_column_by_example(
Example #20
    def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
        current_run = Run.get_submitted_run()
        parent_run_id = _get_parent_run_id(current_run._run_id)
        print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        try:
            import azureml.train.automl._dataprep_utilities as dataprep_utilities
        except Exception as e:
            e.error_type = ErrorTypes.Unclassified
            log_traceback(e, logger)
            logger.error(e)
            raise e

        fit_iteration_parameters_dict = dict()

        class RetrieveNumpyArrayError(Exception):
            def __init__(self):
                super().__init__()

        try:
            print("Resolving Dataflows...")
            logger.info("Resolving Dataflows...")
            dataprep_json_obj = json.loads(dataprep_json)
            if 'activities' in dataprep_json_obj: # json is serialized dataflows
                dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                    dataprep_json)
                for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']:
                    fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k))
                for k in ['y', 'y_valid']:
                    try:
                        fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k))
                    except IndexError:
                        raise RetrieveNumpyArrayError()

                cv_splits_dataflows = []
                i = 0
                while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                    cv_splits_dataflows.append(
                        dataflow_dict['cv_splits_indices_{0}'.format(i)])
                    i = i + 1
                fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                    else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
            else: # json is dataprep options
                print('Creating Dataflow from options...\r\nOptions:')
                logger.info('Creating Dataflow from options...')
                print(dataprep_json_obj)
                datastore_name = dataprep_json_obj['datastoreName'] # mandatory
                data_path = dataprep_json_obj['dataPath'] # mandatory
                label_column = dataprep_json_obj['label'] # mandatory
                separator = dataprep_json_obj.get('columnSeparator', ',')
                header = dataprep_json_obj.get('promoteHeader', True)
                encoding = dataprep_json_obj.get('encoding', None)
                quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
                skip_rows = dataprep_json_obj.get('skipRows', 0)
                feature_columns = dataprep_json_obj.get('features', [])

                from azureml.core import Datastore
                import azureml.dataprep as dprep
                if header:
                    header = dprep.PromoteHeadersMode.CONSTANTGROUPED
                else:
                    header = dprep.PromoteHeadersMode.NONE
                try:
                    encoding = dprep.FileEncoding[encoding]
                except Exception:
                    encoding = dprep.FileEncoding.UTF8

                ws = Run.get_context().experiment.workspace
                datastore = Datastore(ws, datastore_name)
                dflow = dprep.read_csv(path=datastore.path(data_path),
                                       separator=separator,
                                       header=header,
                                       encoding=encoding,
                                       quoting=quoting,
                                       skip_rows=skip_rows)

                if len(feature_columns) == 0:
                    X = dflow.drop_columns(label_column)
                else:
                    X = dflow.keep_columns(feature_columns)

                print('Inferring types for feature columns...')
                logger.info('Inferring types for feature columns...')
                sct = X.builders.set_column_types()
                sct.learn()
                sct.ambiguous_date_conversions_drop()
                X = sct.to_dataflow()

                y = dflow.keep_columns(label_column)
                if automl_settings_obj.task_type.lower() == 'regression':
                    y = y.to_number(label_column)

                print('X:')
                print(X)
                logger.info('X:')
                logger.info(X)

                print('y:')
                print(y)
                logger.info('y:')
                logger.info(y)

                try:
                    from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                    _X = try_retrieve_pandas_dataframe_adb(X)
                    fit_iteration_parameters_dict['X'] = _X.values
                    fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values
                except ImportError:
                    logger.info("SDK version does not support column names extraction, fallback to old path")
                    fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X)

                try:
                    fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y)
                except IndexError:
                    raise RetrieveNumpyArrayError()

            logger.info("Finish getting data using dataprep.")
            return fit_iteration_parameters_dict
        except Exception as e:
            print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            if isinstance(e, RetrieveNumpyArrayError):
                logger.debug("Label column (y) does not exist in user's data.")
                e.error_type = ErrorTypes.User
            elif "The provided path is not valid." in str(e):
                logger.debug("User's data is not accessible from remote run.")
                e.error_type = ErrorTypes.User
            elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e):
                logger.debug("User should use Datastore to data that requires secrets.")
                e.error_type = ErrorTypes.User
            else:
                e.error_type = ErrorTypes.Client
            log_traceback(e, logger)
            raise RuntimeError("Error while extracting Dataflows")
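For reference, a hedged illustration of the "dataprep options" JSON shape the else branch above expects; the key names come from the dataprep_json_obj lookups in the code, while every value below is invented for the example:

dataprep_json = json.dumps({
    "datastoreName": "workspaceblobstore",  # mandatory per the code above
    "dataPath": "training-data/data.csv",   # mandatory
    "label": "target",                      # mandatory
    "columnSeparator": ",",
    "promoteHeader": True,
    "encoding": "UTF8",
    "ignoreNewlineInQuotes": False,
    "skipRows": 0,
    "features": []
})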
Example #21
def joinTables(dataName, previousStageNumber, thisStageNumber, qualityFlag,
               operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # Set up empty intermediate dataframes that we will use to build up inventories at both dataFlow and column level
        dataFlowInventoryIntermediate = pd.DataFrame()
        columnInventoryIntermediate = pd.DataFrame()

        if operationFlag != '':

            # Load config file
            joinConfig = dprep.read_csv('./Config/' +
                                        operationFlag).to_pandas_dataframe()

            # For each config in the file...
            for index, row in joinConfig.iterrows():

                leftDataName = row['LeftDataName']
                leftDataFlowJoinColumn = row['LeftDataFlowJoinColumn']
                rightDataName = row['RightDataName']
                rightDataFlowJoinColumn = row['RightDataFlowJoinColumn']
                joinType = row['JoinType']
                print(
                    '{0}: ready to join {1} {2} -> {3} {4} using jointype {5}'.
                    format(dataName, leftDataName, leftDataFlowJoinColumn,
                           rightDataName, rightDataFlowJoinColumn, joinType))

                # Load right hand data flow
                rightDataFlow, fullPackagePath = openDataFlowPackage(
                    rightDataName, previousStageNumber, qualityFlag)
                print('{0}: loaded package from path {1}'.format(
                    rightDataName, fullPackagePath))

                # We always perform the inner "MATCH" style join
                join_builder = dataFlow.builders.join(
                    right_dataflow=rightDataFlow,
                    left_column_prefix=dataName + '_',
                    right_column_prefix=rightDataName + '_')
                join_builder.detect_column_info()
                join_builder.join_key_pairs = [(leftDataFlowJoinColumn,
                                                rightDataFlowJoinColumn)]
                # Setting up join type:
                # NONE = 0
                # MATCH = 2
                # UNMATCHLEFT = 4
                # UNMATCHRIGHT = 8
                join_builder.join_type = 2
                innerDataFlow = join_builder.to_dataflow()
                print('{0} created inner dataflow : Columns : {1}, Rows : {2}'.
                      format(dataName,
                             len(innerDataFlow.get_profile().columns),
                             innerDataFlow.row_count))

                if joinType == "LEFT":
                    # Use the "UNMATCHLEFT" setting to grab the rows that haven't been joined from the left data flow
                    join_builder.join_type = 4
                    leftUnmatchedDataFlow = join_builder.to_dataflow()
                    print(
                        '{0} created left unmatched dataflow : Columns : {1}, Rows : {2}'
                        .format(
                            dataName,
                            len(leftUnmatchedDataFlow.get_profile().columns),
                            leftUnmatchedDataFlow.row_count))

                    # Now append this dataflow to the original inner join dataflow, to create a "left outer join"
                    newDataFlow = innerDataFlow.append_rows(
                        [leftUnmatchedDataFlow])
                else:
                    newDataFlow = innerDataFlow

                # Create a new name for this data flow based on concatenation of left dataflow and right
                newDataName = dataName + '_' + rightDataName

                # Output key stats
                print('{0} left table : {0}, Columns : {1}, Rows : {2}'.format(
                    leftDataName, len(dataFlow.get_profile().columns),
                    dataFlow.row_count))
                print(
                    '{0} right table : {0}, Columns : {1}, Rows : {2}'.format(
                        rightDataName,
                        len(rightDataFlow.get_profile().columns),
                        rightDataFlow.row_count))

                newDataProfile = newDataFlow.get_profile()

                print(
                    '{0} joined table : {0}, Columns : {1}, Rows : {2}'.format(
                        newDataName, len(newDataProfile.columns),
                        newDataFlow.row_count))

                # Now generate column and data flow inventories
                columnInventory = getColumnStats(newDataProfile, newDataName,
                                                 thisStageNumber,
                                                 operatorToUse, operationFlag)
                dataFlowInventory = getDataFlowStats(
                    newDataFlow, newDataProfile, newDataName, thisStageNumber,
                    operatorToUse, operationFlag)

                # Capture the column inventory for the new dataflow
                columnInventoryIntermediate = columnInventoryIntermediate.append(
                    columnInventory)

                # Capture the data flow inventory for the new data flow
                dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
                    dataFlowInventory)

                # Finally save the data flow so it can be passed onto the next stage of the process...
                targetPackagePath = saveDataFlowPackage(
                    newDataFlow, newDataName, thisStageNumber, 'A')
                print('{0}: saved package to {1}'.format(
                    newDataName, targetPackagePath))

        else:
            print('{0}: no joining of tables required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        columnInventoryIntermediate = columnInventoryIntermediate.append(
            columnInventory)

        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)
        dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
            dataFlowInventory)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved source package to {1}'.format(
            dataName, targetPackagePath))

        return dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None