def __init__(self):
    """Parse the training CLI options and bind the Azure ML run context.

    Builds the ``train`` argument parser, parses the process arguments, and
    then resolves the current run, its experiment and workspace, a
    Tensorboard wrapper around that run, and the datastore named by
    ``--datastore``.
    """
    cli = argparse.ArgumentParser("train")
    # (flag, help text, extra keyword arguments); every option is a string.
    option_table = (
        ("--release_id",
         "The ID of the release triggering this pipeline run", {}),
        ("--model_name", "Name of the tf model", {}),
        ("--ckpt_path", "Chekpoint path",
         {"default": "checkpoint/yolov3.ckpt"}),
        ("--datastore", "Name of the datastore",
         {"default": "epis_datastore"}),
        ("--storage_container", "Name of the storage container",
         {"default": "ppe"}),
    )
    for flag, help_text, extra in option_table:
        cli.add_argument(flag, type=str, help=help_text, **extra)
    self._parser = cli
    self._args = cli.parse_args()

    current_run = Run.get_context()
    self._run = current_run
    self._exp = current_run.experiment
    self._ws = current_run.experiment.workspace
    # Tensorboard is given a list of runs, here a single-element one.
    self._tb = Tensorboard([current_run])
    self._datastore = Datastore.get(self._ws,
                                    datastore_name=self._args.datastore)
def monitor_run_tensorboard(
        run=None,
        local_directory=None,
        logger=None):
    """Stream a run's logs into a local TensorBoard and block until it exits.

    Args:
        run: Run (or runs) handed unchanged to
            ``azureml.tensorboard.Tensorboard``.
        local_directory: Folder used to stage the TensorBoard files. It is
            made absolute before use. When ``None`` no folder is forced and
            ``local_root=None`` is passed through (presumably letting
            Tensorboard pick its own location -- confirm against its docs).
        logger: Logger used for debug output. When omitted, this module's
            logger is used so the function never dereferences ``None``.

    Returns:
        None

    Raises:
        ImportError: If the ``azureml-tensorboard`` package is missing. The
            original import error is attached as the cause.
    """
    import logging

    if logger is None:
        logger = logging.getLogger(__name__)

    try:
        from azureml.tensorboard import Tensorboard
    except ImportError as e:
        logger.debug("tensorboard import exception: %s", e)
        raise ImportError("Couldn't import the tensorboard functionality. "
                          "Please ensure 'azureml-tensorboard' is installed"
                          ) from e

    local_root = (None if local_directory is None
                  else os.path.abspath(local_directory))
    logger.debug("Staging tensorboard files in %s", local_root)

    tb = Tensorboard(run, local_root=local_root)
    tb.start(start_browser=True)
    try:
        tb._tb_proc.communicate()  # don't use wait() to avoid deadlock
    finally:
        # Release TensorBoard even when the wait is interrupted (Ctrl-C).
        tb.stop()
    return None
def tensorboard(runs):
    """
    Returns Tensorboard object instantiated with one or more runs

    You can start Tensorboard session by calling start on Tensorboard object
    To stop simply call stop on same object

    Args:
        runs (azureml.core.script_run.ScriptRun or list or tuple): a single
            run, or a list/tuple of runs. A single run is wrapped in a
            one-element list; a tuple is converted to a list; a list is
            passed through unchanged.

    Returns:
        azureml.tensorboard.Tensorboard

    Examples:
        >>> tb = tensorboard(runs)
        >>> tb.start() # Start Tensorboard
        >>> tb.stop() # Stop Tensorboard
    """
    logger = logging.getLogger(__name__)
    # Lazy %-formatting: the message is only built if INFO is enabled.
    logger.info("Starting tensorboard %s", pformat(runs))

    if isinstance(runs, tuple):
        # A tuple of runs is a collection too; do not wrap it as one run.
        runs = list(runs)
    elif not isinstance(runs, list):
        runs = [runs]
    return Tensorboard(runs)
def monitor(monitor_config: AMLTensorBoardMonitorConfig,
            azure_config: AzureConfig) -> None:
    """
    Starts TensorBoard monitoring as per the provided arguments.

    Runs are selected either by explicit ``run_ids`` or, when those are not
    given, by experiment name with an optional run-status filter. The process
    exits with status 1 when no run can be selected. TensorBoard is stopped
    when the user presses Enter, and also if waiting for that input fails.

    :param monitor_config: The config containing information on which runs that need be monitored.
    :param azure_config: An AzureConfig object with secrets/keys to access the workspace.
    """
    # Fetch AzureML workspace and the experiment runs in it
    workspace = azure_config.get_workspace()

    if monitor_config.run_ids is not None:
        if len(monitor_config.run_ids) == 0:
            print("At least one run_recovery_id must be given for monitoring.")
            sys.exit(1)
        exp_runs = [
            azure_util.fetch_run(workspace, run_id)
            for run_id in monitor_config.run_ids
        ]
    else:
        if monitor_config.experiment_name not in workspace.experiments:
            print(f"The experiment: {monitor_config.experiment_name} doesn't "
                  f"exist in the {monitor_config.workspace_name} workspace.")
            sys.exit(1)

        experiment = Experiment(workspace, monitor_config.experiment_name)
        filters = common_util.get_items_from_string(
            monitor_config.run_status) if monitor_config.run_status else []

        exp_runs = azure_util.fetch_runs(experiment, filters)

        if len(exp_runs) == 0:
            _msg = "No runs to monitor"
            if monitor_config.run_status:
                # Leading space keeps the two message parts from running
                # together ("monitorwith").
                _msg += f" with status [{monitor_config.run_status}]."
            print(_msg)
            sys.exit(1)

    # Start TensorBoard on executing machine
    ts = Tensorboard(exp_runs,
                     local_root=str(monitor_config.local_root),
                     port=monitor_config.port)

    print(
        "=============================================================================="
    )
    for run in exp_runs:
        print(f"Run URL: {run.get_portal_url()}")
    print("TensorBoard URL: ")
    ts.start()
    try:
        print(
            "==============================================================================\n\n"
        )
        input("Press Enter to close TensorBoard...")
    finally:
        # Stop TensorBoard even if input() raises (EOF on a non-interactive
        # stdin, or Ctrl-C).
        ts.stop()
class Train():
    """Train the YOLOv3 model inside an Azure ML run and upload the results.

    The constructor parses the command line and binds the run context.
    :meth:`training` downloads the dataset index files, runs the training
    loop while TensorBoard is started on the run, then zips and uploads the
    checkpoints, logs and saved model to the run.
    """

    def __init__(self):
        """Parse CLI options and resolve the run, workspace and datastore."""
        self._parser = argparse.ArgumentParser("train")
        self._parser.add_argument(
            "--release_id",
            type=str,
            help="The ID of the release triggering this pipeline run")
        self._parser.add_argument("--model_name",
                                  type=str,
                                  help="Name of the tf model")
        self._parser.add_argument("--ckpt_path",
                                  type=str,
                                  help="Chekpoint path",
                                  default="checkpoint/yolov3.ckpt")
        self._parser.add_argument("--datastore",
                                  type=str,
                                  help="Name of the datastore",
                                  default="epis_datastore")
        self._parser.add_argument("--storage_container",
                                  type=str,
                                  help="Name of the storage container",
                                  default="ppe")
        self._args = self._parser.parse_args()
        self._run = Run.get_context()
        self._exp = self._run.experiment
        self._ws = self._run.experiment.workspace
        self._tb = Tensorboard([self._run])
        self._datastore = Datastore.get(self._ws,
                                        datastore_name=self._args.datastore)

    def __get_mime_type(self, file_path):
        """Return whatever ``mime_content_type`` reports for ``file_path``."""
        return mime_content_type(file_path)

    def training(self):
        """Run the full training job and upload its artefacts to the run.

        Steps: download the dataset index files, build the model, train for
        ``cfg.TRAIN.EPOCHS`` epochs (saving weights to ``--ckpt_path`` after
        each epoch), save the model, zip and upload checkpoints/logs/model,
        tag the run with ``release_id`` and mark it complete.

        TensorBoard is stopped even when training raises.
        """
        self.__getDataset()
        trainset = Dataset('train')
        logdir = "./data/log"
        steps_per_epoch = len(trainset)
        global_steps = tf.Variable(1, trainable=False, dtype=tf.int64)
        warmup_steps = cfg.TRAIN.WARMUP_EPOCHS * steps_per_epoch
        total_steps = cfg.TRAIN.EPOCHS * steps_per_epoch

        input_tensor = tf.keras.layers.Input([416, 416, 3])
        conv_tensors = YOLOv3(input_tensor)

        # The model outputs alternate raw conv output and decoded prediction;
        # __train_step reads them back as pairs (i * 2, i * 2 + 1).
        output_tensors = []
        for i, conv_tensor in enumerate(conv_tensors):
            pred_tensor = decode(conv_tensor, i)
            output_tensors.append(conv_tensor)
            output_tensors.append(pred_tensor)

        model = tf.keras.Model(input_tensor, output_tensors)
        optimizer = tf.keras.optimizers.Adam()

        # Start from an empty log directory.
        if os.path.exists(logdir):
            shutil.rmtree(logdir)
        writer = tf.summary.create_file_writer(logdir)

        self._tb.start()
        try:
            for epoch in range(cfg.TRAIN.EPOCHS):
                print(epoch)
                for image_data, target in trainset:
                    self.__train_step(image_data, target, model, global_steps,
                                      writer, optimizer, warmup_steps,
                                      total_steps)
                model.save_weights(self._args.ckpt_path)
        finally:
            # Do not leave TensorBoard running when training fails.
            self._tb.stop()

        model.save("./models")

        # (archive name, folder to zip, label used in the upload message)
        artefacts = (
            ("check.zip", "checkpoint", "checkpoints"),
            ("log.zip", "data/log", "tfruns"),
            ("model.zip", "models", "model"),
        )
        for archive, folder, _ in artefacts:
            zipFolder(archive, folder)
        for archive, _, label in artefacts:
            self._run.upload_file(name=archive, path_or_stream=archive)
            print(f"Uploaded the {label} to experiment "
                  f"{self._run.experiment.name}")

        print("Following files are uploaded")
        print(self._run.get_file_names())

        self._run.add_properties({
            "release_id": self._args.release_id,
            "run_type": "train"
        })
        print(f"added properties: {self._run.properties}")

        self._run.complete()

    def __getDataset(self):
        """Download ``voc_train.txt`` and ``voc_test.txt`` from blob storage.

        For each file the first blob matching the prefix in the configured
        storage container is downloaded to ``./data/dataset/<blob name>``.
        The target directory is created when missing.

        Raises:
            IndexError: If no blob matches one of the prefixes.
        """
        blob_service = self._datastore.blob_service
        container = self._args.storage_container

        selected = []
        for prefix in ('voc_train.txt', 'voc_test.txt'):
            blobs = list(blob_service.list_blobs(container, prefix=prefix))
            if not blobs:
                # Fail with a message naming the missing file instead of a
                # bare "list index out of range" further down.
                raise IndexError(
                    f"No blob with prefix '{prefix}' found in storage "
                    f"container '{container}'")
            print(f"Succesfully get {prefix}")
            selected.append(blobs[0])

        for blob in selected:
            destination = f'./data/dataset/{blob.name}'
            # The download needs the destination directory to exist.
            os.makedirs(os.path.dirname(destination), exist_ok=True)
            blob_service.get_blob_to_path(container, blob.name, destination)

    def __train_step(self, image_data, target, model, global_steps, writer,
                     optimizer, warmup_steps, total_steps):
        """Run one optimisation step, update the learning rate and log scalars.

        The three loss components are summed over the three output pairs of
        the model. After the weights are updated, ``global_steps`` is
        incremented and the learning rate is set: proportional to the step
        count while ``global_steps < warmup_steps``, cosine-shaped between
        ``cfg.TRAIN.LR_INIT`` and ``cfg.TRAIN.LR_END`` afterwards.
        """
        with tf.GradientTape() as tape:
            pred_result = model(image_data, training=True)
            giou_loss = conf_loss = prob_loss = 0

            for i in range(3):
                conv, pred = pred_result[i * 2], pred_result[i * 2 + 1]
                loss_items = compute_loss(pred, conv, *target[i], i)
                giou_loss += loss_items[0]
                conf_loss += loss_items[1]
                prob_loss += loss_items[2]

            total_loss = giou_loss + conf_loss + prob_loss

        gradients = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        tf.print(
            "=> STEP %4d lr: %.6f giou_loss: %4.2f conf_loss: %4.2f "
            "prob_loss: %4.2f total_loss: %4.2f" %
            (global_steps, optimizer.lr.numpy(), giou_loss, conf_loss,
             prob_loss, total_loss))

        # Update the learning rate for the next step.
        global_steps.assign_add(1)
        if global_steps < warmup_steps:
            lr = global_steps / warmup_steps * cfg.TRAIN.LR_INIT
        else:
            lr = cfg.TRAIN.LR_END + 0.5 * (
                cfg.TRAIN.LR_INIT - cfg.TRAIN.LR_END) * ((1 + tf.cos(
                    (global_steps - warmup_steps) /
                    (total_steps - warmup_steps) * np.pi)))
        optimizer.lr.assign(lr.numpy())

        # Write the summary scalars for this step.
        with writer.as_default():
            tf.summary.scalar("lr", optimizer.lr, step=global_steps)
            tf.summary.scalar("loss/total_loss", total_loss, step=global_steps)
            tf.summary.scalar("loss/giou_loss", giou_loss, step=global_steps)
            tf.summary.scalar("loss/conf_loss", conf_loss, step=global_steps)
            tf.summary.scalar("loss/prob_loss", prob_loss, step=global_steps)
        writer.flush()
nargs='+', default=None, help='runids to create')
    return parser.parse_args()


# Script entry: parse the command line and echo the parsed arguments.
args = parse_args()
print(args)

# Nothing is started unless at least one run id was supplied.
if args.runids:
    # get workspace
    ws = Workspace.from_config()
    # set the experiment (the name is hard-coded to 'test')
    experiment_name = 'test'
    exp = Experiment(workspace=ws, name=experiment_name)
    # Build one Run object per requested run id.
    runs = []
    for idx in args.runids:
        run = Run(exp, idx)
        runs.append(run)
    # One Tensorboard instance is created over all collected runs.
    tb = Tensorboard(runs)
    tb.start()
    ## Wait for input to stop tensorboard.
    print('Enter to stop tensorboard')
    input()
    tb.stop()
def main():
    """
    Run the experiment for training

    Submits ``train_keras.py`` to the ``cpu-cluster`` compute target with the
    MNIST file dataset mounted, streams the run into TensorBoard until it
    completes and the user presses Enter, then registers the produced model
    together with its final train/validation metrics.

    TensorBoard is stopped even when waiting for the run fails.
    """
    work_space = Workspace.from_config()

    # Set up the dataset for training
    datastore = work_space.get_default_datastore()
    dataset = Dataset.File.from_files(path=(datastore, "datasets/mnist"))

    # Set up the experiment for training
    experiment = Experiment(workspace=work_space, name="keras-lenet-train")
    # azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 2000000000
    config = ScriptRunConfig(
        source_directory=".",
        script="train_keras.py",
        compute_target="cpu-cluster",
        arguments=[
            "--data_folder",
            dataset.as_named_input("input").as_mount(),
        ],
    )

    # Set up the TensorFlow/Keras environment
    environment = Environment("keras-environment")
    environment.python.conda_dependencies = CondaDependencies.create(
        python_version="3.7.7",
        pip_packages=["azureml-defaults", "numpy", "tensorflow==2.3.1"])
    config.run_config.environment = environment

    # Run the experiment for training
    run = experiment.submit(config)
    aml_url = run.get_portal_url()
    print(
        "Submitted to an Azure Machine Learning compute cluster. Click on the link below"
    )
    print("")
    print(aml_url)

    tboard = Tensorboard([run])
    # If successful, start() returns a string with the URI of the instance.
    tboard.start(start_browser=True)
    try:
        run.wait_for_completion(show_output=True)
        print("Press enter to stop")
        input()
    finally:
        # Stop the streaming even when the run fails or the wait is
        # interrupted; otherwise it will continue to run.
        tboard.stop()

    # Register Model
    metrics = run.get_metrics()
    run.register_model(
        model_name="keras_mnist",
        tags={
            "data": "mnist",
            "model": "classification"
        },
        model_path="outputs/keras_lenet.h5",
        model_framework=Model.Framework.TENSORFLOW,
        model_framework_version="2.3.1",
        properties={
            "train_loss": _latest_metric(metrics, "train_loss"),
            "train_accuracy": _latest_metric(metrics, "train_accuracy"),
            "val_loss": _latest_metric(metrics, "val_loss"),
            "val_accuracy": _latest_metric(metrics, "val_accuracy"),
        },
    )


def _latest_metric(metrics, name):
    """Return the most recently logged value of metric ``name``.

    A list (or tuple) of values yields its last element. Any other value is
    returned as is -- NOTE(review): this covers a metric that was logged only
    once and is presumably reported as a bare scalar; confirm against the
    ``Run.get_metrics`` documentation. A missing metric raises ``KeyError``.
    """
    value = metrics[name]
    if isinstance(value, (list, tuple)):
        return value[-1]
    return value
def tensorboard():
    """Start a Tensorboard instance for the run taken from the parsed args."""
    cli_args = parse_args()
    # The single run is wrapped in a one-element list for the constructor.
    board = Tensorboard([cli_args.run])
    # If successful, start() returns a string with the URI of the instance;
    # the value is not used here.
    board.start()
# callbacks=[ # AmlLogger(), # tf.keras.callbacks.TensorBoard(update_freq='batch')] # ) # ``` # # #### Launch Tensorboard # Azure ML service provides built-in integration with Tensorboard through **tensorboard** package. # # While the run is in progress (or after it has completed), we can start Tensorboard with the run as its target, and it will begin streaming logs. # %% from azureml.tensorboard import Tensorboard # The Tensorboard constructor takes an array of runs, so be sure and pass it in as a single-element array here tb = Tensorboard([run2]) # If successful, start() returns a string with the URI of the instance. tb.start() # %% [markdown] # #### Stop Tensorboard # When you're done, make sure to call the stop() method of the Tensorboard object, or it will stay running even after your job completes. # %% tb.stop() # %% [markdown] # ## Check the model performance # # Last training run produced model of decent accuracy. Let's test it out and see what it does. First, let's check what files our latest training run produced and download the model files.
from azure.common.client_factory import get_client_from_cli_profile
from azure.mgmt.resource import SubscriptionClient
from azureml.core import Experiment
from azureml.core import Workspace
from azureml.core.authentication import AzureCliAuthentication
from azureml.tensorboard import Tensorboard

# Authenticate through the Azure CLI profile and take the first subscription
# the listing yields.
cli_auth = AzureCliAuthentication()
subscription_client = get_client_from_cli_profile(SubscriptionClient)
subscription_id = next(
    subscription_client.subscriptions.list()).subscription_id

ws = Workspace(
    subscription_id=subscription_id,
    resource_group="ds_envs_RG",
    workspace_name="ds_envs_ws",
    auth=cli_auth,
)

experiment_name = "my_experiment"
run_id = "my_experiment_1603471452_ed6739ca"
experiment = Experiment(workspace=ws, name=experiment_name)

# Stop scanning at the first matching run instead of materialising every run
# of the experiment, and fail with a readable message when the id is unknown.
run = next((i for i in experiment.get_runs() if i.id == run_id), None)
if run is None:
    raise SystemExit(
        f"Run '{run_id}' was not found in experiment '{experiment_name}'.")

tb = Tensorboard([run])
tb.start(start_browser=True)
try:
    input("Press Enter to continue...")
finally:
    # Stop TensorBoard even if input() raises (EOF or Ctrl-C).
    tb.stop()