# subscription_id = config.subscription_id
# resource_group = config.resource_group
# workspace_name = config.workspace_name
ws = Workspace(subscription_id=subscription_id,
               resource_group=resource_group,
               workspace_name=workspace_name)

# cluster_name = config.cluster_name
ct = ComputeTarget(workspace=ws, name=cluster_name)

# datastore_name = config.datastore_name
ds = Datastore(workspace=ws, name=datastore_name)

workdir = os.path.realpath('.')[os.path.realpath('.').find('FixMatch-pytorch'):]
workdir = workdir.replace('\\', '/')

script_params = {
    "--workdir": ds.path('/projects/' + workdir).as_mount(),  # REQUIRED !!!
    "--cxk_volna": ds.path('/').as_mount(),
    "--exp_name": workdir.split('/')[-1],
}


def make_container_registry(address, username, password):
    cr = ContainerRegistry()
    cr.address = address
    cr.username = username
    cr.password = password
    return cr


estimator = PyTorch(source_directory='./',
                    script_params=script_params,
                    compute_target=ct,
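# --- Hedged usage sketch (added for illustration, not part of the original
# script): once the PyTorch(...) call above is completed, a run is usually
# launched through an Experiment. The experiment name below is a placeholder,
# and image_registry_details is only needed when the training image lives in
# a private registry.
# cr = make_container_registry(address='myregistry.azurecr.io',   # hypothetical registry
#                              username='<username>', password='<password>')
# The estimator could then also be given custom_docker_image=... and
# image_registry_details=cr.
from azureml.core import Experiment

experiment = Experiment(workspace=ws, name='fixmatch-remote')  # hypothetical experiment name
run = experiment.submit(estimator)
run.wait_for_completion(show_output=True)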
def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
    current_run = Run.get_submitted_run()
    parent_run_id = _get_parent_run_id(current_run._run_id)
    print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
    logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
    try:
        import azureml.train.automl._dataprep_utilities as dataprep_utilities
    except Exception as e:
        e.error_type = ErrorTypes.Unclassified
        log_traceback(e, logger)
        logger.error(e)
        raise e

    fit_iteration_parameters_dict = dict()

    class RetrieveNumpyArrayError(Exception):
        def __init__(self):
            super().__init__()

    try:
        print("Resolving Dataflows...")
        logger.info("Resolving Dataflows...")
        dataprep_json_obj = json.loads(dataprep_json)
        if 'activities' in dataprep_json_obj:
            # json is serialized dataflows
            dataflow_dict = dataprep_utilities.load_dataflows_from_json(dataprep_json)
            for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']:
                fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k))
            for k in ['y', 'y_valid']:
                try:
                    fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k))
                except IndexError:
                    raise RetrieveNumpyArrayError()

            cv_splits_dataflows = []
            i = 0
            while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                cv_splits_dataflows.append(dataflow_dict['cv_splits_indices_{0}'.format(i)])
                i = i + 1
            fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
        else:
            # json is dataprep options
            print('Creating Dataflow from options...\r\nOptions:')
            logger.info('Creating Dataflow from options...')
            print(dataprep_json_obj)
            datastore_name = dataprep_json_obj['datastoreName']  # mandatory
            data_path = dataprep_json_obj['dataPath']  # mandatory
            label_column = dataprep_json_obj['label']  # mandatory
            separator = dataprep_json_obj.get('columnSeparator', ',')
            header = dataprep_json_obj.get('promoteHeader', True)
            encoding = dataprep_json_obj.get('encoding', None)
            quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
            skip_rows = dataprep_json_obj.get('skipRows', 0)
            feature_columns = dataprep_json_obj.get('features', [])

            from azureml.core import Datastore
            import azureml.dataprep as dprep

            if header:
                header = dprep.PromoteHeadersMode.CONSTANTGROUPED
            else:
                header = dprep.PromoteHeadersMode.NONE
            try:
                encoding = dprep.FileEncoding[encoding]
            except Exception:
                encoding = dprep.FileEncoding.UTF8

            ws = Run.get_context().experiment.workspace
            datastore = Datastore(ws, datastore_name)
            dflow = dprep.read_csv(path=datastore.path(data_path),
                                   separator=separator,
                                   header=header,
                                   encoding=encoding,
                                   quoting=quoting,
                                   skip_rows=skip_rows)

            if len(feature_columns) == 0:
                X = dflow.drop_columns(label_column)
            else:
                X = dflow.keep_columns(feature_columns)

            print('Inferring types for feature columns...')
            logger.info('Inferring types for feature columns...')
            sct = X.builders.set_column_types()
            sct.learn()
            sct.ambiguous_date_conversions_drop()
            X = sct.to_dataflow()

            y = dflow.keep_columns(label_column)
            if automl_settings_obj.task_type.lower() == 'regression':
                y = y.to_number(label_column)

            print('X:')
            print(X)
            logger.info('X:')
            logger.info(X)
            print('y:')
            print(y)
            logger.info('y:')
            logger.info(y)

            try:
                from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                _X = try_retrieve_pandas_dataframe_adb(X)
                fit_iteration_parameters_dict['X'] = _X.values
                fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values
            except ImportError:
                logger.info("SDK version does not support column names extraction, falling back to old path")
                fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X)

            try:
                fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y)
            except IndexError:
                raise RetrieveNumpyArrayError()

        logger.info("Finish getting data using dataprep.")
        return fit_iteration_parameters_dict
    except Exception as e:
        print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
        logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
        if isinstance(e, RetrieveNumpyArrayError):
            logger.debug("Label column (y) does not exist in user's data.")
            e.error_type = ErrorTypes.User
        elif "The provided path is not valid." in str(e):
            logger.debug("User's data is not accessible from remote run.")
            e.error_type = ErrorTypes.User
        elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e):
            logger.debug("User should use Datastore to access data that requires secrets.")
            e.error_type = ErrorTypes.User
        else:
            e.error_type = ErrorTypes.Client
        log_traceback(e, logger)
        raise RuntimeError("Error while extracting Dataflows")
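# --- Hedged illustration (added; not part of the function above): a minimal
# dataprep_json payload for the "options" branch, built only from the keys the
# function reads. Datastore name, data path and column names are placeholders.
import json

example_dataprep_json = json.dumps({
    "datastoreName": "workspaceblobstore",   # hypothetical datastore
    "dataPath": "training/iris.csv",         # hypothetical path on the datastore
    "label": "species",                      # hypothetical label column
    "columnSeparator": ",",
    "promoteHeader": True,
    "ignoreNewlineInQuotes": False,
    "skipRows": 0,
    "features": []                           # empty -> all non-label columns are used
})
# fit_params = _get_data_from_dataprep(example_dataprep_json, automl_settings_obj, logger)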
#%% now you can upload that directory to blob storage
# I use the date to differentiate the different versions
blob_path = f"Campus_Recruitment/{datetime.now().strftime('%Y-%m-%d')}"  # if None, uploads to the root
local_path = "./Upload/Data"
blob_store.upload(src_dir=local_path,
                  target_path=blob_path,
                  overwrite=True,
                  show_progress=True)

#%%
# ** Register the data as a dataset **
# %% now that the data is up on the blobstore we can register it as a dataset
# to keep track of its versions and make it easily accessible
dataset = Dataset.File.from_files(
    blob_store.path(blob_path + "/data.csv")
)
dataset.register(ws,
                 name="Campus_Recruitment_PCA_Training_Data",
                 create_new_version=True)

#%%
# ** Upload and register the model as a Model **
#%%
model = Model.register(workspace=ws,
                       model_name='Campus_Recruitment_PCA',  # Name of the registered model in your workspace.
                       model_path='./Upload/Model/model.pkl',  # Local file to upload and register as a model.
                       sample_input_dataset=dataset,
                       sample_output_dataset=None,
                       description='PCA model for dimension reduction of the Campus Recruitment Dataset',
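#%%
# --- Hedged sketch (added for illustration): how the registered artifacts can
# be retrieved again later. Names match the registrations above; the download
# folder is a placeholder.
from azureml.core import Dataset, Model

training_data = Dataset.get_by_name(ws, name="Campus_Recruitment_PCA_Training_Data")  # latest version
pca_model = Model(ws, name="Campus_Recruitment_PCA")
# pca_model.download(target_dir="./Download/Model", exist_ok=True)  # hypothetical local folder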
compute_target = ComputeTarget(workspace=ws, name="cpucluster-II")
ds = Datastore(workspace=ws, name="hellotfstore")

#
# Create an estimator.
#

# Single node
est_1 = Estimator(
    compute_target=compute_target,
    use_gpu=False,
    node_count=1,
    pip_packages=['tensorflow==1.13.1'],
    source_directory="../",
    entry_script="mnist-mlp.py",
    script_params={"--data-dir": ds.path("data/mnist").as_mount()})

# Distributed with PS architecture
#est_2 = ...

# Distributed with Horovod
est_3 = Estimator(
    compute_target=compute_target,
    use_gpu=False,
    node_count=2,
    process_count_per_node=2,
    distributed_backend='mpi',
    pip_packages=['tensorflow==1.13.1', 'horovod'],
    source_directory="../",
    entry_script="mnist-mlp-dist-hvd.py",
    script_params={"--data-dir": ds.path("data/mnist").as_mount()})
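# --- Hedged sketch (added; the entry scripts themselves are not part of this
# snippet): inside mnist-mlp.py / mnist-mlp-dist-hvd.py the mounted datastore
# path passed via --data-dir would typically be read with argparse, e.g.:
import argparse


def parse_entry_script_args(argv=None):
    # argv is injectable so the sketch can be exercised outside an actual run
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", type=str, required=True,
                        help="mounted path of data/mnist on the datastore")
    return parser.parse_args(argv)

# args = parse_entry_script_args()
# data_dir = args.data_dir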
# In[ ]:

# Module
select_columns_in_dataset = Module.load(ws, namespace='azureml', name='Select Columns in Dataset')
clean_missing_data = Module.load(ws, namespace='azureml', name='Clean Missing Data')
split_data = Module.load(ws, namespace='azureml', name='Split Data')
join_data = Module.load(ws, namespace='azureml', name='Join Data')

# Dataset
try:
    dset = Dataset.get_by_name(ws, 'Automobile_price_data_(Raw)')
except Exception:
    global_datastore = Datastore(ws, name="azureml_globaldatasets")
    dset = Dataset.File.from_files(global_datastore.path('GenericCSV/Automobile_price_data_(Raw)'))
    dset.register(workspace=ws, name='Automobile_price_data_(Raw)', create_new_version=True)
blob_input_data = dset


# In[ ]:

# sub pipeline: TODO improve this experience
@dsl.pipeline(name='sub sub', description='sub')
def sub_sub_pipeline(minimum_missing_value_ratio):
    module1 = select_columns_in_dataset(
        dataset=blob_input_data,
        select_columns="{\"isFilter\":true,\"rules\":[{\"exclude\":false,\"ruleType\":\"AllColumns\"},"
ejoin_module_func = Module.register(
    ws, os.path.join('modules', 'ejoin', 'amlmodule.yaml'))
eselect_module_func = Module.register(
    ws, os.path.join('modules', 'eselect', 'amlmodule.yaml'))
join_data_module_func = Module.load(ws, namespace='azureml', name='Join Data')
train_svd_recommender_module_func = Module.load(ws, namespace='azureml', name='Train SVD Recommender')

# datasets
input1 = Dataset.get_by_name(ws, 'query data (large)')
input2 = Dataset.get_by_name(ws, 'query data (small)')
global_datastore = Datastore(ws, name="azureml_globaldatasets")
movie_ratings_data = Dataset.File.from_files(
    global_datastore.path('GenericCSV/Movie_Ratings')).as_named_input(
    'Movie_Ratings')
imdb_movie_titles_data = Dataset.File.from_files(
    global_datastore.path('GenericCSV/IMDB_Movie_Titles')).as_named_input(
    'IMDB_Movie_Titles')


# In[ ]:

# steps
ejoin = ejoin_module_func().set_parameters(
    leftcolumns='m:query;querId',
    # missing 'rightcolumns' parameter
    leftkeys='m:query',
    rightkeys='m:Query',
    jointype='HashInner').set_inputs(left_input=input1, right_input=input2)
def main():
    # Get our configs
    with open("ptgnn/authentication.json") as jsonFile:
        authData = json.load(jsonFile)[args.auth_cluster]

    # Copy the convertCorpus script here. Done so we don't upload the corpus to Azure,
    # or keep a copy of the script in here.
    # (It's weird, I know. It works and has a purpose though)
    convertCorpusLocation = Path("../convertCorpusForML.py")
    convertCorpusAzureLocation = Path("./convertCorpusForML.py")
    shutil.copy(convertCorpusLocation, convertCorpusAzureLocation)

    # Grab the authentication data from the JSON file
    subID = authData["subID"]  # Get from Azure Portal; used for billing
    resGroup = authData["resGroup"]  # Name for the resource group
    wsName = authData["wsName"]  # Name for the workspace, which is the collection of compute clusters + experiments
    computeName = authData["computeName"]  # Name for the compute cluster
    datastoreName = authData["datastoreName"]

    # Get the workspace, the compute target and the datastore
    ws = Workspace.get(wsName, subscription_id=subID, resource_group=resGroup)
    computeTarget = ComputeTarget(ws, computeName)
    datastore = Datastore(ws, name=datastoreName)

    # Download the entire corpus to the compute target. Save the DataReference obj here.
    # as_mount is also possible, but slows things down due to network opening of files.
    corpus_location = datastore.path(args.aml_location).as_download()
    output_location = "./"

    # The files that will be uploaded for usage by our script (everything in the azure folder)
    source_directory = "."

    # params for the script
    params = {
        "--corpus_location": corpus_location,
        "--output_folder": output_location,
        "--aml": "",
        "--training_percent": args.training_percent,
        "--validation_percent": args.validation_percent,
        "-c": ""
    }
    if args.log_num is not None:
        params["-l"] = args.log_num
        tags = {
            "logs": str(args.log_num)
        }
    else:
        tags = {
            "logs": "MAX"
        }

    if args.statement_generation:
        params["-s"] = ""
        tags["generationType"] = "Statement"
    else:
        tags["generationType"] = "Severity"

    # Set up the estimator object. Note the inputs element: it tells Azure that
    # corpus_location in params will be a DataReference object.
    est = Estimator(source_directory=source_directory,
                    compute_target=computeTarget,
                    entry_script='convertCorpusForML.py',
                    script_params=params,
                    inputs=[corpus_location],
                    conda_packages=["pip"],
                    pip_packages=["azureml-core", "tqdm", "numpy", "protobuf"],
                    use_docker=True,
                    use_gpu=False)

    # Start the experiment
    run = Experiment(ws, args.exp_name).submit(config=est, tags=tags)

    # Remove the copy of convertCorpus (remember, don't question this)
    convertCorpusAzureLocation.unlink()

    # print out the portal URL
    # print("Portal URL: ", run.get_portal_url())

    # This will stream everything that the compute target does.
    print("Experiment started. Remember you can exit out of this program but the experiment will still run on Azure!")
    run.wait_for_completion(show_output=True)
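# --- Hedged illustration (added; not part of the original script): the shape of
# ptgnn/authentication.json that main() expects, keyed by the --auth_cluster
# argument. All values below are placeholders.
# {
#     "my-cluster": {
#         "subID": "<azure-subscription-id>",
#         "resGroup": "<resource-group-name>",
#         "wsName": "<workspace-name>",
#         "computeName": "<compute-cluster-name>",
#         "datastoreName": "<datastore-name>"
#     }
# }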