Example #1
    def driver_wrapper(script_directory, automl_settings, run_id,
                       training_percent, iteration, pipeline_spec, pipeline_id,
                       dataprep_json, entry_point, **kwargs):
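        """Entry point for a remote AutoML child run.

        Parses the AutoML settings, wires the parent/child run ids into the
        logger, loads any cached transformed data, executes one pipeline
        iteration via _start_run, and validates the result in _post_run.
        """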
        automl_settings_obj = _AutoMLSettings.from_string_or_dict(
            automl_settings)
        logger, sdk_has_custom_dimension_logger = _init_logger(
            automl_settings_obj)
        if sdk_has_custom_dimension_logger:
            logger.update_default_properties({
                "parent_run_id": _get_parent_run_id(run_id),
                "child_run_id": run_id
            })
        logger.info("[RunId:{}]: remote automl driver begins.".format(run_id))

        try:
            script_directory = _init_directory(directory=script_directory,
                                               logger=logger)

            automl_settings_obj, found_data_store, data_store = _get_automl_settings(
                automl_settings=automl_settings, logger=logger)

            transformed_data_context = _load_transformed_data_context_from_cache(
                automl_settings_obj=automl_settings_obj,
                parent_run_id=_get_parent_run_id(run_id),
                found_data_store=found_data_store,
                data_store=data_store,
                logger=logger)
            result = _start_run(
                automl_settings_obj=automl_settings_obj,
                run_id=run_id,
                training_percent=training_percent,
                iteration=iteration,
                pipeline_spec=pipeline_spec,
                pipeline_id=pipeline_id,
                dataprep_json=dataprep_json,
                script_directory=script_directory,
                entry_point=entry_point,
                logger=logger,
                transformed_data_context=transformed_data_context)
            _post_run(result=result,
                      run_id=run_id,
                      automl_settings=automl_settings,
                      logger=logger)
        except Exception as e:
            logger.error("driver_wrapper meets exceptions. {}".format(e))
            log_traceback(e, logger)
            raise Exception(e)

        logger.info(
            "[RunId:{}]: remote automl driver finishes.".format(run_id))
        return result
Example #2
    def setup_wrapper(script_directory, dataprep_json, entry_point,
                      automl_settings, task_type, preprocess,
                      enable_subsampling, num_iterations, **kwargs):
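        """Entry point for the remote AutoML setup run.

        Prepares the training data from the dataprep payload, resolves the
        auto-CV settings, and sets problem info on the setup run before
        finishing with _post_setup.
        """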
        automl_settings_obj = _AutoMLSettings.from_string_or_dict(
            automl_settings)

        logger, sdk_has_custom_dimension_logger = _init_logger(
            automl_settings_obj)
        try:
            child_run_id = Run.get_submitted_run()._run_id
            parent_run_id = _get_parent_run_id(child_run_id)
            if sdk_has_custom_dimension_logger:
                logger.update_default_properties({
                    "parent_run_id": parent_run_id,
                    "child_run_id": child_run_id
                })
            logger.info("[ParentRunId:{}]: remote setup script begins.".format(
                parent_run_id))
            script_directory = _init_directory(directory=script_directory,
                                               logger=logger)

            logger.info("Preparing data for set problem info now.")

            fit_iteration_parameters_dict = _prepare_data(
                dataprep_json=dataprep_json,
                automl_settings_obj=automl_settings_obj,
                script_directory=script_directory,
                entry_point=entry_point,
                logger=logger)
            fit_iteration_parameters_dict = _get_auto_cv_dict(
                fit_iteration_parameters_dict, automl_settings_obj, logger)

            print("Setting Problem Info now.")
            _set_problem_info_for_setup(
                fit_iteration_parameters_dict=fit_iteration_parameters_dict,
                automl_settings_obj=automl_settings_obj,
                task_type=task_type,
                preprocess=preprocess,
                enable_subsampling=enable_subsampling,
                num_iterations=num_iterations,
                logger=logger)
        except Exception as e:
            logger.error("setup_wrapper meets exceptions. {}".format(e))
            log_traceback(e, logger)
            raise Exception(e)

        _post_setup(logger=logger)
        logger.info("[ParentRunId:{}]: remote setup script finishes.".format(
            parent_run_id))
        return  # PLACEHOLDER for RemoteScript helper functions
Example #3
    def _post_run(result, run_id, automl_settings, logger):
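        """Validate a child run result: re-raise the first recorded error,
        otherwise log the primary-metric score and fit duration."""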
        print("for Run Id : ", run_id)
        print("result : ", result)
        if len(result['errors']) > 0:
            err_type = next(iter(result['errors']))
            inner_ex = result['errors'][err_type]['exception']
            inner_ex.error_type = ErrorTypes.Client
            log_traceback(inner_ex, logger)
            raise RuntimeError(inner_ex) from inner_ex

        score = result[automl_settings['primary_metric']]
        duration = result['fit_time']
        print("Score : ", score)
        print("Duration : ", duration)
        print("Childrun completed successfully!")
        logger.info("Childrun completed successfully!")
Example #4
    def _set_problem_info_for_setup(fit_iteration_parameters_dict,
                                    automl_settings_obj, task_type, preprocess,
                                    enable_subsampling, num_iterations,
                                    logger):
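        """Validate the training data and set problem info on the current run.

        When the SDK supports caching and a default datastore is available,
        the data is transformed and cached first; otherwise, or when that
        path fails for a reason other than data-split validation, problem
        info is set directly from the raw fit parameters.
        """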
        current_run = Run.get_submitted_run()
        logger.info("Start to set problem info for the setup for run id {}.".format(current_run._run_id))
        logger.info("Setup experiment.")
        try:
            experiment = current_run.experiment
            parent_run_id = _get_parent_run_id(current_run._run_id)
            data_store = experiment.workspace.get_default_datastore()
            found_data_store = True
            logger.info("Using data store.")
        except Exception as e:
            logger.warning("Getting data store, fallback to default {}".format(e))
            found_data_store = False

        logger.info("Caching supported {}.".format(sdk_has_cache_capability and found_data_store))
        print("caching supported {}".format(sdk_has_cache_capability and found_data_store))
        if sdk_has_validate_data_dict:
            # Newer versions of validate_training_data_dict include the check_x_y validation
            logger.info("Using validate_training_data_dict now.")
            validate_training_data_dict(data_dict=fit_iteration_parameters_dict, automl_settings=automl_settings_obj)
        else:
            logger.info("Using validate_training_data now.")
            validate_training_data(X=fit_iteration_parameters_dict.get('X'),
                                   y=fit_iteration_parameters_dict.get('y'),
                                   X_valid=fit_iteration_parameters_dict.get('X_valid'),
                                   y_valid=fit_iteration_parameters_dict.get('y_valid'),
                                   sample_weight=fit_iteration_parameters_dict.get('sample_weight'),
                                   sample_weight_valid=fit_iteration_parameters_dict.get('sample_weight_valid'),
                                   cv_splits_indices=fit_iteration_parameters_dict.get('cv_splits_indices'),
                                   automl_settings=automl_settings_obj)
            check_x_y(fit_iteration_parameters_dict.get('X'), fit_iteration_parameters_dict.get('y'), automl_settings_obj)
        if sdk_has_cache_capability and found_data_store:
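            # Preferred path: transform the data up front and cache it in the datastore.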
            data_splits_validated = True
            try:
                start = time.time()
                transformed_data_context = _get_transformed_data_context(
                    X=fit_iteration_parameters_dict.get('X'),
                    y=fit_iteration_parameters_dict.get('y'),
                    X_valid=fit_iteration_parameters_dict.get('X_valid'),
                    y_valid=fit_iteration_parameters_dict.get('y_valid'),
                    sample_weight=fit_iteration_parameters_dict.get('sample_weight'),
                    sample_weight_valid=fit_iteration_parameters_dict.get('sample_weight_valid'),
                    x_raw_column_names=fit_iteration_parameters_dict.get('x_raw_column_names'),
                    cv_splits_indices=fit_iteration_parameters_dict.get('cv_splits_indices'),
                    automl_settings_obj=automl_settings_obj,
                    data_store=data_store,
                    run_target='remote',
                    parent_run_id=parent_run_id,
                    logger=logger
                )
                end = time.time()
                print("time taken for transform {}".format(end-start))
                logger.info("time taken for transform {}".format(end-start))
                if sdk_has_validate_data_splits:
                    try:
                        logger.info("Validating data splits now.")
                        _validate_data_splits(X=transformed_data_context.X,
                                              y=transformed_data_context.y,
                                              X_valid=transformed_data_context.X_valid,
                                              y_valid=transformed_data_context.y_valid,
                                              cv_splits=transformed_data_context.cv_splits,
                                              automl_settings=automl_settings_obj)
                        data_splits_validated = True
                    except Exception as data_split_exception:
                        data_splits_validated = False
                        logger.error("Meeting validation errors {}.".format(data_split_exception))
                        log_traceback(data_split_exception, logger)
                        raise data_split_exception
                logger.info("Start setting problem info.")
                automl.set_problem_info(transformed_data_context.X, transformed_data_context.y,
                                        automl_settings_obj.task_type,
                                        current_run=current_run,
                                        preprocess=automl_settings_obj.preprocess,
                                        lag_length=automl_settings_obj.lag_length,
                                        transformed_data_context=transformed_data_context,
                                        enable_cache=automl_settings_obj.enable_cache,
                                        subsampling=enable_subsampling)
            except Exception as e:
                if sdk_has_validate_data_splits and not data_splits_validated:
                    logger.error("sdk_has_validate_data_splits is True and data_splits_validated is False {}.".format(e))
                    log_traceback(e, logger)
                    raise e
                else:
                    logger.warning("Setup failed, fall back to old model {}".format(e))
                    print("Setup failed, fall back to old model {}".format(e))
                    automl.set_problem_info(
                        X=fit_iteration_parameters_dict.get('X'),
                        y=fit_iteration_parameters_dict.get('y'),
                        task_type=task_type, current_run=current_run,
                        preprocess=preprocess, subsampling=enable_subsampling
                    )
        else:
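            # Fallback path: no caching support or no datastore; use the raw fit parameters.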
            logger.info("Start setting problem info using old model.")
            if sdk_has_validate_data_splits:
                _validate_data_splits(X=fit_iteration_parameters_dict.get('X'),
                                      y=fit_iteration_parameters_dict.get('y'),
                                      X_valid=fit_iteration_parameters_dict.get('X_valid'),
                                      y_valid=fit_iteration_parameters_dict.get('y_valid'),
                                      cv_splits=fit_iteration_parameters_dict.get('cv_splits_indices'),
                                      automl_settings=automl_settings_obj)
            automl.set_problem_info(
                X=fit_iteration_parameters_dict.get('X'),
                y=fit_iteration_parameters_dict.get('y'),
                task_type=task_type, current_run=current_run,
                preprocess=preprocess, subsampling=enable_subsampling
            )
Example #5
    def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
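        """Materialize training data from a dataprep JSON payload.

        The payload is either a set of serialized Dataflows (an 'activities'
        key is present) or a dictionary of dataprep options describing a CSV
        file in a datastore; both paths fill and return a dict of fit
        iteration parameters.
        """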
        current_run = Run.get_submitted_run()
        parent_run_id = _get_parent_run_id(current_run._run_id)
        print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        try:
            import azureml.train.automl._dataprep_utilities as dataprep_utilities
        except Exception as e:
            e.error_type = ErrorTypes.Unclassified
            log_traceback(e, logger)
            logger.error(e)
            raise e

        fit_iteration_parameters_dict = dict()

        class RetrieveNumpyArrayError(Exception):
            """Raised when the label column cannot be retrieved as a numpy array."""

        try:
            print("Resolving Dataflows...")
            logger.info("Resolving Dataflows...")
            dataprep_json_obj = json.loads(dataprep_json)
            if 'activities' in dataprep_json_obj: # json is serialized dataflows
                dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                    dataprep_json)
                for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']:
                    fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k))
                for k in ['y', 'y_valid']:
                    try:
                        fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k))
                    except IndexError:
                        raise RetrieveNumpyArrayError()

                cv_splits_dataflows = []
                i = 0
                while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                    cv_splits_dataflows.append(
                        dataflow_dict['cv_splits_indices_{0}'.format(i)])
                    i = i + 1
                fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                    else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
            else: # json is dataprep options
                print('Creating Dataflow from options...\r\nOptions:')
                logger.info('Creating Dataflow from options...')
                print(dataprep_json_obj)
                datastore_name = dataprep_json_obj['datastoreName'] # mandatory
                data_path = dataprep_json_obj['dataPath'] # mandatory
                label_column = dataprep_json_obj['label'] # mandatory
                separator = dataprep_json_obj.get('columnSeparator', ',')
                header = dataprep_json_obj.get('promoteHeader', True)
                encoding = dataprep_json_obj.get('encoding', None)
                quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
                skip_rows = dataprep_json_obj.get('skipRows', 0)
                feature_columns = dataprep_json_obj.get('features', [])

                from azureml.core import Datastore
                import azureml.dataprep as dprep
                if header:
                    header = dprep.PromoteHeadersMode.CONSTANTGROUPED
                else:
                    header = dprep.PromoteHeadersMode.NONE
                try:
                    encoding = dprep.FileEncoding[encoding]
                except Exception:
                    # Unknown or missing encoding name; default to UTF-8.
                    encoding = dprep.FileEncoding.UTF8

                ws = Run.get_context().experiment.workspace
                datastore = Datastore(ws, datastore_name)
                dflow = dprep.read_csv(path=datastore.path(data_path),
                                       separator=separator,
                                       header=header,
                                       encoding=encoding,
                                       quoting=quoting,
                                       skip_rows=skip_rows)

                if len(feature_columns) == 0:
                    X = dflow.drop_columns(label_column)
                else:
                    X = dflow.keep_columns(feature_columns)

                print('Inferring types for feature columns...')
                logger.info('Inferring types for feature columns...')
                sct = X.builders.set_column_types()
                sct.learn()
                sct.ambiguous_date_conversions_drop()
                X = sct.to_dataflow()

                y = dflow.keep_columns(label_column)
                if automl_settings_obj.task_type.lower() == 'regression':
                    y = y.to_number(label_column)

                print('X:')
                print(X)
                logger.info('X:')
                logger.info(X)

                print('y:')
                print(y)
                logger.info('y:')
                logger.info(y)

                try:
                    from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                    _X = try_retrieve_pandas_dataframe_adb(X)
                    fit_iteration_parameters_dict['X'] = _X.values
                    fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values
                except ImportError:
                    logger.info("SDK version does not support column names extraction, fallback to old path")
                    fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X)

                try:
                    fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y)
                except IndexError:
                    raise RetrieveNumpyArrayError()

            logger.info("Finish getting data using dataprep.")
            return fit_iteration_parameters_dict
        except Exception as e:
            print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            if isinstance(e, RetrieveNumpyArrayError):
                logger.debug("Label column (y) does not exist in user's data.")
                e.error_type = ErrorTypes.User
            elif "The provided path is not valid." in str(e):
                logger.debug("User's data is not accessible from remote run.")
                e.error_type = ErrorTypes.User
            elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e):
                logger.debug("User should use Datastore to data that requires secrets.")
                e.error_type = ErrorTypes.User
            else:
                e.error_type = ErrorTypes.Client
            log_traceback(e, logger)
            raise RuntimeError("Error during extracting Dataflows")