Example #1
    def __exit__(self, *exc_details):
        """Upload files for datastore.

        :param exc_details:
        :return:
        """
        from azureml.core.datastore import Datastore
        from azureml.data._dataprep_helper import dataprep

        module_logger.debug("Enter __exit__ function of datastore cmgr")
        # walk the configured data references and upload the ones marked for upload
        for key, value in self._config.items():
            df_config, force_read = self._to_data_reference_config(value)
            if self._is_upload(df_config):
                self._validate_config(df_config, key)
                ds = Datastore(workspace=self._workspace,
                               name=df_config.data_store_name)
                if os.path.isdir(df_config.path_on_compute):
                    if self._is_datastore_adlsgen1(ds):
                        module_logger.debug(
                            "AzureDataLake Gen1 used as Datastore for upload dir."
                        )
                        dataprep().api.engineapi.api.get_engine_api(
                        ).upload_directory(
                            dataprep().api.engineapi.typedefinitions.
                            UploadDirectoryMessageArguments(
                                base_path=df_config.path_on_compute,
                                folder_path=df_config.path_on_compute,
                                destination=dataprep(
                                ).api._datastore_helper._to_stream_info_value(
                                    ds, df_config.path_on_data_store),
                                force_read=force_read,
                                overwrite=df_config.overwrite,
                                concurrent_task_count=1))
                    else:
                        ds.upload(src_dir=df_config.path_on_compute,
                                  target_path=df_config.path_on_data_store,
                                  overwrite=df_config.overwrite)
                elif os.path.isfile(df_config.path_on_compute):
                    if self._is_datastore_adlsgen1(ds):
                        module_logger.debug(
                            "AzureDataLake Gen1 used as Datastore for upload file."
                        )
                        dataprep().api.engineapi.api.get_engine_api(
                        ).upload_file(
                            dataprep().api.engineapi.typedefinitions.
                            UploadFileMessageArguments(
                                base_path=os.path.dirname(
                                    df_config.path_on_compute),
                                local_path=df_config.path_on_compute,
                                destination=dataprep(
                                ).api._datastore_helper._to_stream_info_value(
                                    ds, df_config.path_on_data_store),
                                force_read=force_read,
                                overwrite=df_config.overwrite))
                    else:
                        ds.upload_files(
                            files=[df_config.path_on_compute],
                            target_path=df_config.path_on_data_store,
                            overwrite=df_config.overwrite)
        module_logger.debug("Exit __exit__ function of datastore cmgr")
Example #2
    def upload_dataset(self, dataset_name: str, local_folder: str, datastore_name: str = None, overwrite: bool = False, tags: dict = None) -> FileDataset:
        '''
        Uploads data from a local directory into an AzureML Datastore that points to Azure Data Lake
        Args:
            dataset_name (str): The name of the dataset to register
            local_folder (str): The location of the local directory to take files from
            datastore_name (str): The name of the Datastore that will contain the dataset
            overwrite (bool): Whether existing files on the Datastore should be overwritten
            tags (dict): Optional tags to attach to the registered dataset
        Returns:
            FileDataset: The registered dataset, containing the files
        '''
        if not datastore_name:
            # No datastore name is given, so we'll take the default one
            datastore_name = self.__datastore_path

        # Connecting data store
        datastore = Datastore(self.__workspace, name=datastore_name)

        # TODO : check type of datastore
        datastore.upload(src_dir=local_folder,
                         target_path=dataset_name,
                         overwrite=overwrite,
                         show_progress=True)

        datastore_paths = [(datastore, dataset_name)]
        file_ds = Dataset.File.from_files(path=datastore_paths)

        file_ds = file_ds.register(workspace=self.__workspace,
                                   name=dataset_name,
                                   description=dataset_name,
                                   tags=tags,
                                   create_new_version=True)

        return file_ds
Example #3
# In[ ]:


# Default datastore (Azure file storage)
def_file_store = ws.get_default_datastore() 
print("Default datastore's name: {}".format(def_file_store.name))
def_blob_store = Datastore(ws, "workspaceblobstore")
print("Blobstore's name: {}".format(def_blob_store.name))


# In[ ]:


# Upload the raw training data to the blob storage
def_blob_store.upload(src_dir=data_location, 
                      target_path='nyc-taxi-raw-features', 
                      overwrite=True, 
                      show_progress=True)

raw_train_data = DataReference(datastore=def_blob_store,
                               data_reference_name="nyc_taxi_raw_features",
                               path_on_datastore="nyc-taxi-raw-features/nyc-taxi-sample-data.csv")
print("DataReference object created")


# ### Create the Process Training Data Pipeline Step

# The intermediate data (the output of a step) is represented by a PipelineData object. A PipelineData object is produced as the output of one step and consumed as the input of one or more later steps.
#
# The process training data pipeline step takes the raw_train_data DataReference object as input, and it outputs an intermediate PipelineData object that holds the processed training data with the newly engineered features for the datetime components: hour of the day and day of the week.
#
# Review and run the cell below to construct the PipelineData objects and the PythonScriptStep pipeline step:
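
# The original notebook cell is not included in this excerpt, so the sketch below shows
# one way such a step could be constructed. The script name process.py, the
# source_directory, and the compute_target are assumptions, not values from the notebook.

# In[ ]:


from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# intermediate output produced by the processing step, stored on the blob datastore
processed_train_data = PipelineData("processed_train_data", datastore=def_blob_store)

process_step = PythonScriptStep(
    name="process_train_data",
    script_name="process.py",            # hypothetical feature-engineering script
    source_directory="scripts",          # hypothetical folder containing process.py
    arguments=["--input", raw_train_data, "--output", processed_train_data],
    inputs=[raw_train_data],
    outputs=[processed_train_data],
    compute_target=compute_target,       # assumed AmlCompute target defined earlier in the notebook
    allow_reuse=True)
print("process_step created")
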
Example #4
#%% first save the files to disk
if (not os.path.exists("./Upload")):
    os.mkdir("./Upload")
    os.mkdir("./Upload/Data")
    os.mkdir("./Upload/Model")
    
df_pca.to_csv("./Upload/Data/data.csv", index=False)
with open("./Upload/Model/model.pkl", "wb") as model_file:
    pickle.dump(pca_model, model_file)

#%% now you can upload that directory to blob storage
# I use the date to differentiate the different versions;
# if blob_path were None the upload would go to the datastore root
blob_path = f"Campus_Recruitment/{datetime.now().strftime('%Y-%m-%d')}"
local_path = "./Upload/Data"

blob_store.upload(src_dir=local_path, 
                  target_path=blob_path,
                  overwrite=True, 
                  show_progress=True)

#%% 
# ** Register the data as a dataset **
# now that the data is up on the blob store we can register it as a dataset
# to keep track of its versions and make it easily accessible
dataset = Dataset.File.from_files(blob_store.path(blob_path + "/data.csv"))
dataset.register(ws, 
                 name="Campus_Recruitment_PCA_Training_Data",
                 create_new_version=True)
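
#%%
# A quick check, not part of the original example: the registered dataset can be
# retrieved later by the name used in the register() call above.
retrieved = Dataset.get_by_name(ws, name="Campus_Recruitment_PCA_Training_Data")
print(retrieved.name, retrieved.version)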

#%% 
# ** Upload and register the model as a Model **
#%% 
model = Model.register(workspace=ws,
Example #5
# split the data using scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=float(test_size),
                                                    random_state=101)
# join train and train label; same for test
train = pd.concat([X_train, y_train], axis=1)
validation = pd.concat([X_test, y_test], axis=1)

# make sure folder_name was passed in as an argument
if folder_name is not None:
    os.makedirs("files", exist_ok=True)
    print("local staging folder 'files' created for upload to %s" % folder_name)

    # set the target path of the datastore to hold
    # test and validation datasets
    current_folder = str(datetime.now().date())

    target_path = os.path.join(folder_name, current_folder)

    train_file = os.path.join("files", train_file_name)
    val_file = os.path.join("files", val_file_name)
    # save the dataframes to the local drive, then upload the contents of the folder
    train.to_csv(train_file, header=True, index=False)
    validation.to_csv(val_file, header=True, index=False)
    datastore.upload("files",
                     target_path=target_path,
                     overwrite=True,
                     show_progress=False)
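
    # A possible follow-up, not part of the original snippet: register the uploaded
    # CSVs as a tabular dataset so later steps can consume them by name. `ws` is
    # assumed to be the current Workspace and the dataset name is a placeholder.
    from azureml.core import Dataset

    # datastore paths use forward slashes regardless of the local OS separator
    train_blob_path = target_path.replace(os.sep, "/") + "/" + train_file_name
    train_ds = Dataset.Tabular.from_delimited_files(path=[(datastore, train_blob_path)])
    train_ds.register(workspace=ws,
                      name="train_dataset",        # placeholder dataset name
                      create_new_version=True)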