def driver_wrapper(script_directory, automl_settings, run_id, training_percent,
                   iteration, pipeline_spec, pipeline_id, dataprep_json,
                   entry_point, **kwargs):
    automl_settings_obj = _AutoMLSettings.from_string_or_dict(automl_settings)
    logger, sdk_has_custom_dimension_logger = _init_logger(automl_settings_obj)
    if sdk_has_custom_dimension_logger:
        logger.update_default_properties({
            "parent_run_id": _get_parent_run_id(run_id),
            "child_run_id": run_id
        })
    logger.info("[RunId:{}]: remote automl driver begins.".format(run_id))
    try:
        script_directory = _init_directory(directory=script_directory,
                                           logger=logger)
        automl_settings_obj, found_data_store, data_store = _get_automl_settings(
            automl_settings=automl_settings, logger=logger)
        transformed_data_context = _load_transformed_data_context_from_cache(
            automl_settings_obj=automl_settings_obj,
            parent_run_id=_get_parent_run_id(run_id),
            found_data_store=found_data_store,
            data_store=data_store,
            logger=logger)
        result = _start_run(
            automl_settings_obj=automl_settings_obj,
            run_id=run_id,
            training_percent=training_percent,
            iteration=iteration,
            pipeline_spec=pipeline_spec,
            pipeline_id=pipeline_id,
            dataprep_json=dataprep_json,
            script_directory=script_directory,
            entry_point=entry_point,
            logger=logger,
            transformed_data_context=transformed_data_context)
        _post_run(result=result, run_id=run_id,
                  automl_settings=automl_settings, logger=logger)
    except Exception as e:
        logger.error("driver_wrapper failed with an exception. {}".format(e))
        log_traceback(e, logger)
        # Re-raise as-is to preserve the original exception type and traceback.
        raise
    logger.info("[RunId:{}]: remote automl driver finishes.".format(run_id))
    return result
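# Usage sketch for driver_wrapper (hypothetical values -- in practice the
# service-generated remote entry script templates these arguments in):
#
#   result = driver_wrapper(
#       script_directory='.',
#       automl_settings='{"task_type": "classification", "primary_metric": "AUC_weighted"}',
#       run_id='AutoML_<guid>_1',
#       training_percent=100,
#       iteration=1,
#       pipeline_spec='<serialized pipeline json>',
#       pipeline_id='<pipeline id>',
#       dataprep_json='<dataprep json>',
#       entry_point='get_data.py',
#   )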
def setup_wrapper(script_directory, dataprep_json, entry_point, automl_settings,
                  task_type, preprocess, enable_subsampling, num_iterations,
                  **kwargs):
    automl_settings_obj = _AutoMLSettings.from_string_or_dict(automl_settings)
    logger, sdk_has_custom_dimension_logger = _init_logger(automl_settings_obj)
    try:
        child_run_id = Run.get_submitted_run()._run_id
        parent_run_id = _get_parent_run_id(child_run_id)
        if sdk_has_custom_dimension_logger:
            logger.update_default_properties({
                "parent_run_id": parent_run_id,
                "child_run_id": child_run_id
            })
        logger.info("[ParentRunId:{}]: remote setup script begins.".format(parent_run_id))
        script_directory = _init_directory(directory=script_directory,
                                           logger=logger)
        logger.info("Preparing data for set problem info now.")
        fit_iteration_parameters_dict = _prepare_data(
            dataprep_json=dataprep_json,
            automl_settings_obj=automl_settings_obj,
            script_directory=script_directory,
            entry_point=entry_point,
            logger=logger)
        fit_iteration_parameters_dict = _get_auto_cv_dict(
            fit_iteration_parameters_dict, automl_settings_obj, logger)
        print("Setting Problem Info now.")
        _set_problem_info_for_setup(
            fit_iteration_parameters_dict=fit_iteration_parameters_dict,
            automl_settings_obj=automl_settings_obj,
            task_type=task_type,
            preprocess=preprocess,
            enable_subsampling=enable_subsampling,
            num_iterations=num_iterations,
            logger=logger)
    except Exception as e:
        logger.error("setup_wrapper failed with an exception. {}".format(e))
        log_traceback(e, logger)
        # Re-raise as-is to preserve the original exception type and traceback.
        raise
    _post_setup(logger=logger)
    logger.info("[ParentRunId:{}]: remote setup script finishes.".format(parent_run_id))
    return


# PLACEHOLDER for RemoteScript helper functions
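# Minimal sketch of a settings payload that _AutoMLSettings.from_string_or_dict
# might accept. Only fields actually read in this module (task_type, preprocess,
# lag_length, enable_cache, primary_metric) are shown; the full schema is owned
# by _AutoMLSettings and may differ.
#
#   example_automl_settings = {
#       "task_type": "classification",
#       "primary_metric": "AUC_weighted",
#       "preprocess": True,
#       "lag_length": 0,
#       "enable_cache": True,
#   }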
def _post_run(result, run_id, automl_settings, logger):
    print("for Run Id : ", run_id)
    print("result : ", result)
    if len(result['errors']) > 0:
        # Surface the first recorded error as the run failure.
        err_type = next(iter(result['errors']))
        inner_ex = result['errors'][err_type]['exception']
        inner_ex.error_type = ErrorTypes.Client
        log_traceback(inner_ex, logger)
        raise RuntimeError(inner_ex) from inner_ex
    score = result[automl_settings['primary_metric']]
    duration = result['fit_time']
    print("Score : ", score)
    print("Duration : ", duration)
    print("Child run completed successfully!")
    logger.info("Child run completed successfully!")
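# Shape of the `result` dict that _post_run expects, reconstructed from the
# accesses above (values are illustrative):
#
#   result = {
#       'errors': {},            # maps an error type to {'exception': <Exception>}
#       'fit_time': 12.3,        # fit duration in seconds
#       'AUC_weighted': 0.91,    # keyed by automl_settings['primary_metric']
#   }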
def _set_problem_info_for_setup(fit_iteration_parameters_dict, automl_settings_obj,
                                task_type, preprocess, enable_subsampling,
                                num_iterations, logger):
    current_run = Run.get_submitted_run()
    logger.info("Start to set problem info for the setup for run id {}.".format(current_run._run_id))
    logger.info("Setup experiment.")
    try:
        experiment = current_run.experiment
        parent_run_id = _get_parent_run_id(current_run._run_id)
        data_store = experiment.workspace.get_default_datastore()
        found_data_store = True
        logger.info("Using data store.")
    except Exception as e:
        logger.warning("Failed to get the data store, falling back to the default behavior. {}".format(e))
        found_data_store = False
    logger.info("Caching supported {}.".format(sdk_has_cache_capability and found_data_store))
    print("caching supported {}".format(sdk_has_cache_capability and found_data_store))
    if sdk_has_validate_data_dict:
        # Newer versions of validate_training_data_dict already include the
        # check_x_y logic, so it is only called separately on the old path.
        logger.info("Using validate_training_data_dict now.")
        validate_training_data_dict(data_dict=fit_iteration_parameters_dict,
                                    automl_settings=automl_settings_obj)
    else:
        logger.info("Using validate_training_data now.")
        validate_training_data(X=fit_iteration_parameters_dict.get('X'),
                               y=fit_iteration_parameters_dict.get('y'),
                               X_valid=fit_iteration_parameters_dict.get('X_valid'),
                               y_valid=fit_iteration_parameters_dict.get('y_valid'),
                               sample_weight=fit_iteration_parameters_dict.get('sample_weight'),
                               sample_weight_valid=fit_iteration_parameters_dict.get('sample_weight_valid'),
                               cv_splits_indices=fit_iteration_parameters_dict.get('cv_splits_indices'),
                               automl_settings=automl_settings_obj)
        check_x_y(fit_iteration_parameters_dict.get('X'),
                  fit_iteration_parameters_dict.get('y'),
                  automl_settings_obj)
    if sdk_has_cache_capability and found_data_store:
        data_splits_validated = True
        try:
            start = time.time()
            transformed_data_context = _get_transformed_data_context(
                X=fit_iteration_parameters_dict.get('X'),
                y=fit_iteration_parameters_dict.get('y'),
                X_valid=fit_iteration_parameters_dict.get('X_valid'),
                y_valid=fit_iteration_parameters_dict.get('y_valid'),
                sample_weight=fit_iteration_parameters_dict.get('sample_weight'),
                sample_weight_valid=fit_iteration_parameters_dict.get('sample_weight_valid'),
                x_raw_column_names=fit_iteration_parameters_dict.get('x_raw_column_names'),
                cv_splits_indices=fit_iteration_parameters_dict.get('cv_splits_indices'),
                automl_settings_obj=automl_settings_obj,
                data_store=data_store,
                run_target='remote',
                parent_run_id=parent_run_id,
                logger=logger)
            end = time.time()
            print("time taken for transform {}".format(end - start))
            logger.info("time taken for transform {}".format(end - start))
            if sdk_has_validate_data_splits:
                try:
                    logger.info("Validating data splits now.")
                    _validate_data_splits(X=transformed_data_context.X,
                                          y=transformed_data_context.y,
                                          X_valid=transformed_data_context.X_valid,
                                          y_valid=transformed_data_context.y_valid,
                                          cv_splits=transformed_data_context.cv_splits,
                                          automl_settings=automl_settings_obj)
                    data_splits_validated = True
                except Exception as data_split_exception:
                    data_splits_validated = False
                    logger.error("Data split validation failed. {}".format(data_split_exception))
                    log_traceback(data_split_exception, logger)
                    raise
            logger.info("Start setting problem info.")
            automl.set_problem_info(transformed_data_context.X,
                                    transformed_data_context.y,
                                    automl_settings_obj.task_type,
                                    current_run=current_run,
                                    preprocess=automl_settings_obj.preprocess,
                                    lag_length=automl_settings_obj.lag_length,
                                    transformed_data_context=transformed_data_context,
                                    enable_cache=automl_settings_obj.enable_cache,
                                    subsampling=enable_subsampling)
        except Exception as e:
            if sdk_has_validate_data_splits and not data_splits_validated:
                # A data split validation failure is a user-facing error; do
                # not fall back, re-raise it instead.
                logger.error("Data split validation failed; not falling back. {}".format(e))
                log_traceback(e, logger)
                raise
            else:
                logger.warning("Setup failed, falling back to the old code path. {}".format(e))
                print("Setup failed, falling back to the old code path. {}".format(e))
                automl.set_problem_info(
                    X=fit_iteration_parameters_dict.get('X'),
                    y=fit_iteration_parameters_dict.get('y'),
                    task_type=task_type,
                    current_run=current_run,
                    preprocess=preprocess,
                    subsampling=enable_subsampling)
    else:
        logger.info("Start setting problem info using the old code path.")
        if sdk_has_validate_data_splits:
            _validate_data_splits(X=fit_iteration_parameters_dict.get('X'),
                                  y=fit_iteration_parameters_dict.get('y'),
                                  X_valid=fit_iteration_parameters_dict.get('X_valid'),
                                  y_valid=fit_iteration_parameters_dict.get('y_valid'),
                                  cv_splits=fit_iteration_parameters_dict.get('cv_splits_indices'),
                                  automl_settings=automl_settings_obj)
        automl.set_problem_info(
            X=fit_iteration_parameters_dict.get('X'),
            y=fit_iteration_parameters_dict.get('y'),
            task_type=task_type,
            current_run=current_run,
            preprocess=preprocess,
            subsampling=enable_subsampling)
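# Keys of fit_iteration_parameters_dict consumed above (and produced by
# _get_data_from_dataprep below); values are illustrative:
#
#   fit_iteration_parameters_dict = {
#       'X': ...,                     # training features
#       'y': ...,                     # training labels (numpy array)
#       'X_valid': ...,               # optional validation features
#       'y_valid': ...,               # optional validation labels
#       'sample_weight': ...,         # optional per-sample weights
#       'sample_weight_valid': ...,   # optional validation sample weights
#       'cv_splits_indices': ...,     # optional CV split indices
#       'x_raw_column_names': ...,    # optional raw feature column names
#   }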
def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
    current_run = Run.get_submitted_run()
    parent_run_id = _get_parent_run_id(current_run._run_id)
    print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
    logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
    try:
        import azureml.train.automl._dataprep_utilities as dataprep_utilities
    except Exception as e:
        e.error_type = ErrorTypes.Unclassified
        log_traceback(e, logger)
        logger.error(e)
        raise

    fit_iteration_parameters_dict = dict()

    class RetrieveNumpyArrayError(Exception):
        """Raised when the label column cannot be retrieved as a numpy array."""
        pass

    try:
        print("Resolving Dataflows...")
        logger.info("Resolving Dataflows...")
        dataprep_json_obj = json.loads(dataprep_json)
        if 'activities' in dataprep_json_obj:
            # The JSON contains serialized dataflows.
            dataflow_dict = dataprep_utilities.load_dataflows_from_json(dataprep_json)
            for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']:
                fit_iteration_parameters_dict[k] = \
                    dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k))
            for k in ['y', 'y_valid']:
                try:
                    fit_iteration_parameters_dict[k] = \
                        dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k))
                except IndexError:
                    raise RetrieveNumpyArrayError()
            cv_splits_dataflows = []
            i = 0
            while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                cv_splits_dataflows.append(dataflow_dict['cv_splits_indices_{0}'.format(i)])
                i = i + 1
            fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
        else:
            # The JSON contains dataprep options.
            print('Creating Dataflow from options...\r\nOptions:')
            logger.info('Creating Dataflow from options...')
            print(dataprep_json_obj)
            datastore_name = dataprep_json_obj['datastoreName']  # mandatory
            data_path = dataprep_json_obj['dataPath']  # mandatory
            label_column = dataprep_json_obj['label']  # mandatory
            separator = dataprep_json_obj.get('columnSeparator', ',')
            header = dataprep_json_obj.get('promoteHeader', True)
            encoding = dataprep_json_obj.get('encoding', None)
            quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
            skip_rows = dataprep_json_obj.get('skipRows', 0)
            feature_columns = dataprep_json_obj.get('features', [])
            from azureml.core import Datastore
            import azureml.dataprep as dprep
            if header:
                header = dprep.PromoteHeadersMode.CONSTANTGROUPED
            else:
                header = dprep.PromoteHeadersMode.NONE
            try:
                encoding = dprep.FileEncoding[encoding]
            except Exception:
                encoding = dprep.FileEncoding.UTF8
            ws = Run.get_context().experiment.workspace
            datastore = Datastore(ws, datastore_name)
            dflow = dprep.read_csv(path=datastore.path(data_path),
                                   separator=separator,
                                   header=header,
                                   encoding=encoding,
                                   quoting=quoting,
                                   skip_rows=skip_rows)
            if len(feature_columns) == 0:
                X = dflow.drop_columns(label_column)
            else:
                X = dflow.keep_columns(feature_columns)
            print('Inferring types for feature columns...')
            logger.info('Inferring types for feature columns...')
            sct = X.builders.set_column_types()
            sct.learn()
            sct.ambiguous_date_conversions_drop()
            X = sct.to_dataflow()
            y = dflow.keep_columns(label_column)
            if automl_settings_obj.task_type.lower() == 'regression':
                y = y.to_number(label_column)
            print('X:')
            print(X)
            logger.info('X:')
            logger.info(X)
            print('y:')
            print(y)
            logger.info('y:')
            logger.info(y)
            try:
                from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                _X = try_retrieve_pandas_dataframe_adb(X)
                fit_iteration_parameters_dict['X'] = _X.values
                fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values
            except ImportError:
                logger.info("SDK version does not support column name extraction, falling back to the old path.")
                fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X)
            try:
                fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y)
            except IndexError:
                raise RetrieveNumpyArrayError()
        logger.info("Finished getting data using dataprep.")
        return fit_iteration_parameters_dict
    except Exception as e:
        print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
        logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
        # Classify the failure so telemetry distinguishes user errors from client errors.
        if isinstance(e, RetrieveNumpyArrayError):
            logger.debug("Label column (y) does not exist in the user's data.")
            e.error_type = ErrorTypes.User
        elif "The provided path is not valid." in str(e):
            logger.debug("User's data is not accessible from the remote run.")
            e.error_type = ErrorTypes.User
        elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e):
            logger.debug("User should use a Datastore to access data that requires secrets.")
            e.error_type = ErrorTypes.User
        else:
            e.error_type = ErrorTypes.Client
        log_traceback(e, logger)
        raise RuntimeError("Error while extracting Dataflows") from e
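# Example of the "dataprep options" JSON handled in the else-branch above. The
# keys are exactly the ones _get_data_from_dataprep reads; values are illustrative:
#
#   {
#       "datastoreName": "workspaceblobstore",   <- mandatory
#       "dataPath": "data/train.csv",            <- mandatory
#       "label": "target",                       <- mandatory
#       "columnSeparator": ",",
#       "promoteHeader": true,
#       "encoding": "UTF8",
#       "ignoreNewlineInQuotes": false,
#       "skipRows": 0,
#       "features": ["feature_1", "feature_2"]
#   }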