Example #1
import azureml.dataprep as dprep

# Helper functions used below (openDataFlowPackage, getColumnStats,
# getDataFlowStats, saveDataFlowPackage) are assumed to be defined elsewhere
# in the project.


def removeDuplicates(dataName, previousStageNumber, thisStageNumber,
                     qualityFlag, operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':

            columnsToKeep = operationFlag

            numberOfRowsBefore = dataFlow.row_count

            dataFlow = dataFlow.distinct(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=False))
            print(
                '{0}: removed duplicates from column {1} rows before {2} rows after {3}'
                .format(dataName, operationFlag, numberOfRowsBefore,
                        dataFlow.row_count))

        else:
            print('{0}: no duplicate processing required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
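A hedged usage sketch for the stage function above; every argument value is hypothetical and the helper functions it calls are assumed to exist as used in the snippet.

# Hypothetical invocation of removeDuplicates within a staged pipeline.
# All literal values below are illustrative only.
dataFlow, columnInventory, dataFlowInventory = removeDuplicates(
    dataName='People',                 # hypothetical package name
    previousStageNumber='10',          # stage the package was saved under
    thisStageNumber='20',              # stage this step saves to
    qualityFlag='A',
    operatorToUse='removeDuplicates',
    operationFlag='FullName')          # column(s) used to detect duplicates

if dataFlowInventory is not None:
    print(dataFlowInventory)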
Example #2

import logging

import azureml.dataprep as dprep
from azureml.core import Workspace
from azureml.train.automl import AutoMLConfig

target = "utilization"
ws = Workspace(
    workspace_name=dbutils.secrets.get("azureml",
                                       "AML_WORKSPACE_NAME"),  # noqa
    subscription_id=dbutils.secrets.get("azureml",
                                        "AML_SUBSCRIPTION_ID"),  # noqa
    resource_group=dbutils.secrets.get("azureml",
                                       "AML_RESOURCE_GROUP"),  # noqa
)
ds = ws.get_default_datastore()

x = dprep.read_parquet_file(ds.path('model_data_x.parquet'))
y = dprep.read_parquet_file(ds.path('model_data_y.parquet')).to_long(
    dprep.ColumnSelector(term='.*', use_regex=True))

project_folder = './automl'
automl_config = AutoMLConfig(
    task="regression",
    iteration_timeout_minutes=10,
    iterations=10,
    primary_metric="r2_score",
    n_cross_validations=5,
    debug_log="automl.log",
    verbosity=logging.INFO,
    spark_context=sc,  # noqa
    whitelist_models=[
        "GradientBoosting",
        "DecisionTree",
        "RandomForest",
Example #3

import azureml.dataprep as dprep

# Helper functions used below (openDataFlowPackage, createNewPackageDirectory,
# getColumnStats, getDataFlowStats, saveDataFlowPackage) are assumed to be
# defined elsewhere in the project.


def createUPMDataflow(dataName, previousStageNumber, thisStageNumber,
                      qualityFlag, operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':

            mappingConfig = dprep.read_csv(
                './Config/' + operationFlag).to_pandas_dataframe()

            targetDataFlow = dataFlow
            columnsToKeep = ''

            for sourceTable in mappingConfig[mappingConfig.SourceTable ==
                                             dataName]['SourceTable'].unique():
                for sourceColumn, targetColumn in mappingConfig[
                        mappingConfig.SourceTable == sourceTable][[
                            'SourceColumn', 'TargetColumn'
                        ]].values:
                    if columnsToKeep == '':
                        columnsToKeep = targetColumn
                    else:
                        columnsToKeep = columnsToKeep + '|' + targetColumn

                    targetDataFlow = targetDataFlow.rename_columns(
                        {sourceColumn: targetColumn})

            targetDataFlow = targetDataFlow.drop_columns(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=True))
            newPackageName = next(
                iter(mappingConfig[mappingConfig.SourceTable == dataName]
                     ['TargetTable'].unique()))

            createNewPackageDirectory(newPackageName)
            saveDataFlowPackage(targetDataFlow, newPackageName,
                                thisStageNumber, 'A')

        else:
            print('{0}: no column mapping required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
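For reference, a minimal sketch of the mapping configuration the function above expects to read from ./Config/; the column names follow how mappingConfig is indexed in the code, while the rows themselves are purely illustrative.

# Hypothetical mapping config with the columns used above:
# SourceTable, SourceColumn, TargetColumn, TargetTable.
import pandas as pd

exampleMappingConfig = pd.DataFrame([
    {'SourceTable': 'People', 'SourceColumn': 'FULL_NAME',
     'TargetColumn': 'FullName', 'TargetTable': 'UPMPerson'},
    {'SourceTable': 'People', 'SourceColumn': 'DOB',
     'TargetColumn': 'DateOfBirth', 'TargetTable': 'UPMPerson'},
])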
Example #4
print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse)

raw_df = dprep.read_csv(path=args.input_cleanse,
                        header=dprep.PromoteHeadersMode.GROUPED)

# These functions ensure that null data is removed from the data set,
# which will help increase machine learning model accuracy.
# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep
# for more details

# Raw string keeps the original "\;" separator while avoiding Python's
# invalid-escape-sequence warning.
useful_columns = [
    s.strip().strip("'") for s in args.useful_columns.strip("[]").split(r"\;")
]
columns = get_dict(args.columns)

all_columns = dprep.ColumnSelector(term=".*", use_regex=True)
drop_if_all_null = [
    all_columns,
    dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)
]

new_df = (raw_df.replace_na(columns=all_columns).drop_nulls(
    *drop_if_all_null).rename_columns(column_pairs=columns).keep_columns(
        columns=useful_columns))

if args.output_cleanse is not None:
    os.makedirs(args.output_cleanse, exist_ok=True)
    print("%s created" % args.output_cleanse)
    write_df = new_df.write_to_csv(
        directory_path=dprep.LocalFileOutput(args.output_cleanse))
    write_df.run_local()
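The snippet relies on a get_dict helper that is not shown; here is a minimal sketch of one possible implementation, assuming the columns argument is serialized as "{'old1': 'new1'; 'old2': 'new2'}".

# Hedged sketch of get_dict; the serialization format it parses is an assumption.
def get_dict(dict_str):
    pairs = dict_str.strip("{}").split(";")
    new_dict = {}
    for pair in pairs:
        key, value = pair.strip().split(":")
        new_dict[key.strip().strip("'")] = value.strip().strip("'")
    return new_dict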