示例#1
0
    def __init__(self):
        """Parse the training options and bind the Azure ML run context.

        The options are read from the process command line. The current run,
        its experiment and workspace, a Tensorboard wrapper around the run,
        and the datastore named by ``--datastore`` are stored on the instance.
        """
        # (flag, help text, default) for every string-valued option.
        option_specs = (
            ("--release_id",
             "The ID of the release triggering this pipeline run", None),
            ("--model_name", "Name of the tf model", None),
            ("--ckpt_path", "Chekpoint path", "checkpoint/yolov3.ckpt"),
            ("--datastore", "Name of the datastore", "epis_datastore"),
            ("--storage_container", "Name of the storage container", "ppe"),
        )
        cli = argparse.ArgumentParser("train")
        for flag, description, fallback in option_specs:
            cli.add_argument(flag,
                             type=str,
                             help=description,
                             default=fallback)
        self._parser = cli
        self._args = cli.parse_args()

        current_run = Run.get_context()
        self._run = current_run
        self._exp = current_run.experiment
        self._ws = current_run.experiment.workspace
        self._tb = Tensorboard([current_run])
        self._datastore = Datastore.get(
            self._ws, datastore_name=self._args.datastore)
示例#2
0
def monitor_run_tensorboard(
        run=None,
        local_directory=None,
        # Optional: a module-level logger is used when omitted
        logger=None):
    """Stage a run's TensorBoard files locally and serve them until exit.

    Args:
        run: Forwarded unchanged as the first argument of
            ``azureml.tensorboard.Tensorboard``.
        local_directory: Directory in which the files are staged. It is made
            absolute and passed on as ``local_root``.
        logger: Logger used for debug output. When ``None``, a logger named
            after this module is used instead.

    Returns:
        None, once the TensorBoard process has finished.

    Raises:
        ImportError: If ``azureml.tensorboard`` cannot be imported. The
            original import error is attached as the cause.
    """
    if logger is None:
        # Without this fallback a missing logger would turn the ImportError
        # below into an unrelated AttributeError.
        import logging
        logger = logging.getLogger(__name__)

    try:
        from azureml.tensorboard import Tensorboard
    except ImportError as e:
        logger.debug("tensorboard import exception: %s", e)
        raise ImportError("Couldn't import the tensorboard functionality. "
                          "Please ensure 'azureml-tensorboard' is installed"
                          ) from e

    local_root = os.path.abspath(local_directory)
    logger.debug("Staging tensorboard files in %s", local_root)
    tb = Tensorboard(run, local_root=local_root)
    tb.start(start_browser=True)

    # Block until the TensorBoard process ends.
    tb._tb_proc.communicate()  # don't use wait() to avoid deadlock

    return None
示例#3
0
def tensorboard(runs):
    """Build a Tensorboard object for one run or for a list of runs.

    The returned object is not started. Call ``start()`` on it to open a
    Tensorboard session and ``stop()`` on the same object to end it.

    Args:
        runs (azureml.core.script_run.ScriptRun or list): A single run or a
            list of runs. Anything that is not a ``list`` is treated as one
            run and wrapped in a single-element list.

    Returns:
        azureml.tensorboard.Tensorboard

    Examples:
        >>> tb = tensorboard(runs)
        >>> tb.start() # Start Tensorboard
        >>> tb.stop() # Stop Tensorboard
    """
    log = logging.getLogger(__name__)
    log.info(f"Starting tensorboard {pformat(runs)}")
    run_list = runs if isinstance(runs, list) else [runs]
    return Tensorboard(run_list)
def monitor(monitor_config: AMLTensorBoardMonitorConfig,
            azure_config: AzureConfig) -> None:
    """
    Starts TensorBoard monitoring as per the provided arguments.

    Runs are selected by explicit run ids when ``monitor_config.run_ids`` is set; otherwise all runs of
    ``monitor_config.experiment_name`` are fetched, optionally filtered by ``monitor_config.run_status``.
    The process exits with status 1 when the run id list is empty, the experiment does not exist in the
    workspace, or no runs are found. TensorBoard is stopped when the user presses Enter, and also when
    the prompt is interrupted.

    :param monitor_config: The config containing information on which runs that need be monitored.
    :param azure_config: An AzureConfig object with secrets/keys to access the workspace.
    """
    # Fetch AzureML workspace and the experiment runs in it
    workspace = azure_config.get_workspace()

    if monitor_config.run_ids is not None:
        if not monitor_config.run_ids:
            print("At least one run_recovery_id must be given for monitoring.")
            sys.exit(1)
        exp_runs = [
            azure_util.fetch_run(workspace, run_id)
            for run_id in monitor_config.run_ids
        ]
    else:
        if monitor_config.experiment_name not in workspace.experiments:
            print(f"The experiment: {monitor_config.experiment_name} doesn't "
                  f"exist in the {monitor_config.workspace_name} workspace.")
            sys.exit(1)

        experiment = Experiment(workspace, monitor_config.experiment_name)
        filters = common_util.get_items_from_string(
            monitor_config.run_status) if monitor_config.run_status else []

        exp_runs = azure_util.fetch_runs(experiment, filters)

        if not exp_runs:
            _msg = "No runs to monitor"
            if monitor_config.run_status:
                # Leading space keeps the two message parts from running together.
                _msg += f" with status [{monitor_config.run_status}]."
            print(_msg)
            sys.exit(1)

    # Start TensorBoard on executing machine
    ts = Tensorboard(exp_runs,
                     local_root=str(monitor_config.local_root),
                     port=monitor_config.port)

    print(
        "=============================================================================="
    )
    for run in exp_runs:
        print(f"Run URL: {run.get_portal_url()}")
    print("TensorBoard URL: ")
    ts.start()
    try:
        print(
            "==============================================================================\n\n"
        )
        input("Press Enter to close TensorBoard...")
    finally:
        # Always shut TensorBoard down, even if the prompt is interrupted.
        ts.stop()
示例#5
0
class Train():
    """Train a YOLOv3 model inside an Azure ML run and upload the results.

    Constructing the object parses the command line and binds the current
    run, experiment, workspace, a Tensorboard wrapper and the datastore.
    ``training()`` then performs the whole job.
    """

    def __init__(self):
        """Parse the CLI options and resolve the Azure ML run context."""
        self._parser = argparse.ArgumentParser("train")
        self._parser.add_argument(
            "--release_id",
            type=str,
            help="The ID of the release triggering this pipeline run")
        self._parser.add_argument("--model_name",
                                  type=str,
                                  help="Name of the tf model")
        self._parser.add_argument("--ckpt_path",
                                  type=str,
                                  help="Chekpoint path",
                                  default="checkpoint/yolov3.ckpt")
        self._parser.add_argument("--datastore",
                                  type=str,
                                  help="Name of the datastore",
                                  default="epis_datastore")
        self._parser.add_argument("--storage_container",
                                  type=str,
                                  help="Name of the storage container",
                                  default="ppe")

        self._args = self._parser.parse_args()
        self._run = Run.get_context()
        self._exp = self._run.experiment
        self._ws = self._run.experiment.workspace
        self._tb = Tensorboard([self._run])
        self._datastore = Datastore.get(self._ws,
                                        datastore_name=self._args.datastore)

    def __get_mime_type(self, file_path):
        """Return the result of ``mime_content_type`` for ``file_path``."""
        return mime_content_type(file_path)

    def training(self):
        """Run the full training job and publish its artifacts to the run.

        Steps: download the image-set files, build the model, train for
        ``cfg.TRAIN.EPOCHS`` epochs (saving weights to ``--ckpt_path`` after
        each epoch), save the model to ``./models``, zip and upload the
        checkpoint, log and model folders, tag the run with the release id
        and mark it complete. Tensorboard is started before the training
        loop and stopped afterwards, even when the loop raises.
        """
        self.__getDataset()

        trainset = Dataset('train')
        logdir = "./data/log"
        steps_per_epoch = len(trainset)
        global_steps = tf.Variable(1, trainable=False, dtype=tf.int64)
        warmup_steps = cfg.TRAIN.WARMUP_EPOCHS * steps_per_epoch
        total_steps = cfg.TRAIN.EPOCHS * steps_per_epoch

        input_tensor = tf.keras.layers.Input([416, 416, 3])
        conv_tensors = YOLOv3(input_tensor)

        # The model outputs alternate (raw conv tensor, decoded prediction);
        # __train_step relies on this ordering.
        output_tensors = []
        for i, conv_tensor in enumerate(conv_tensors):
            pred_tensor = decode(conv_tensor, i)
            output_tensors.append(conv_tensor)
            output_tensors.append(pred_tensor)

        model = tf.keras.Model(input_tensor, output_tensors)
        optimizer = tf.keras.optimizers.Adam()
        # Start from an empty log directory.
        if os.path.exists(logdir):
            shutil.rmtree(logdir)
        writer = tf.summary.create_file_writer(logdir)

        self._tb.start()
        try:
            for epoch in range(cfg.TRAIN.EPOCHS):
                print(epoch)
                for image_data, target in trainset:
                    self.__train_step(image_data, target, model, global_steps,
                                      writer, optimizer, warmup_steps,
                                      total_steps)
                model.save_weights(self._args.ckpt_path)
        finally:
            # Do not leave Tensorboard running when a training step fails.
            self._tb.stop()
        model.save("./models")

        # (archive name, folder to zip, label used in the upload message)
        artifacts = (
            ("check.zip", "checkpoint", "checkpoints"),
            ("log.zip", "data/log", "tfruns"),
            ("model.zip", "models", "model"),
        )
        for archive, folder, _ in artifacts:
            zipFolder(archive, folder)

        for archive, _, label in artifacts:
            self._run.upload_file(name=archive, path_or_stream=archive)
            print(f"Uploaded the {label} to experiment "
                  f"{self._run.experiment.name}")

        print("Following files are uploaded")
        print(self._run.get_file_names())

        self._run.add_properties({
            "release_id": self._args.release_id,
            "run_type": "train"
        })
        print(f"added properties: {self._run.properties}")

        self._run.complete()

    def __getDataset(self):
        """Download the train and test image-set files from the datastore.

        The first blob whose name starts with ``voc_train.txt`` and the first
        whose name starts with ``voc_test.txt`` are written below
        ``./data/dataset/``. The target directory is created when missing.

        Raises:
            IndexError: If either blob listing is empty.
        """
        blob_service = self._datastore.blob_service
        container = self._args.storage_container

        voc_train = blob_service.list_blobs(container, prefix='voc_train.txt')
        voc_test = blob_service.list_blobs(container, prefix='voc_test.txt')

        voc_train_imagesets = list(voc_train)
        print("Succesfully get voc_train.txt")
        voc_test_imagesets = list(voc_test)
        print("Succesfully get voc_test.txt")

        for blob_name in (voc_train_imagesets[0].name,
                          voc_test_imagesets[0].name):
            target_path = f'./data/dataset/{blob_name}'
            # Make sure the target directory exists before downloading.
            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            blob_service.get_blob_to_path(container, blob_name, target_path)

    def __train_step(self, image_data, target, model, global_steps, writer,
                     optimizer, warmup_steps, total_steps):
        """Run one optimisation step and log its learning rate and losses.

        Args:
            image_data: Batch passed to ``model``.
            target: Indexable with 0..2; each entry is unpacked into the
                arguments of ``compute_loss``.
            model: Model whose outputs alternate (conv, pred), three pairs.
            global_steps: Integer ``tf.Variable`` step counter; incremented
                by one here.
            writer: Summary writer that receives the scalar summaries.
            optimizer: Optimizer whose ``lr`` is overwritten every step.
            warmup_steps: Number of steps of linear learning-rate warm-up.
            total_steps: Step count at which the cosine schedule reaches
                ``cfg.TRAIN.LR_END``.
        """
        # Only the forward pass and the loss run under the tape.
        with tf.GradientTape() as tape:
            pred_result = model(image_data, training=True)
            giou_loss = conf_loss = prob_loss = 0

            for i in range(3):
                conv, pred = pred_result[i * 2], pred_result[i * 2 + 1]
                loss_items = compute_loss(pred, conv, *target[i], i)
                giou_loss += loss_items[0]
                conf_loss += loss_items[1]
                prob_loss += loss_items[2]

            total_loss = giou_loss + conf_loss + prob_loss

        gradients = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        tf.print(
            "=> STEP %4d   lr: %.6f   giou_loss: %4.2f   conf_loss: %4.2f   "
            "prob_loss: %4.2f   total_loss: %4.2f" %
            (global_steps, optimizer.lr.numpy(), giou_loss, conf_loss,
             prob_loss, total_loss))

        # Learning rate for the next step: linear warm-up up to LR_INIT,
        # then cosine decay from LR_INIT to LR_END.
        global_steps.assign_add(1)
        if global_steps < warmup_steps:
            lr = global_steps / warmup_steps * cfg.TRAIN.LR_INIT
        else:
            lr = cfg.TRAIN.LR_END + 0.5 * (
                cfg.TRAIN.LR_INIT - cfg.TRAIN.LR_END) * ((1 + tf.cos(
                    (global_steps - warmup_steps) /
                    (total_steps - warmup_steps) * np.pi)))
        optimizer.lr.assign(lr.numpy())

        with writer.as_default():
            tf.summary.scalar("lr", optimizer.lr, step=global_steps)
            tf.summary.scalar("loss/total_loss",
                              total_loss,
                              step=global_steps)
            tf.summary.scalar("loss/giou_loss",
                              giou_loss,
                              step=global_steps)
            tf.summary.scalar("loss/conf_loss",
                              conf_loss,
                              step=global_steps)
            tf.summary.scalar("loss/prob_loss",
                              prob_loss,
                              step=global_steps)
        writer.flush()
                        nargs='+',
                        default=None,
                        help='runids to create')

    return parser.parse_args()


args = parse_args()

print(args)

if args.runids:
    # get workspace
    ws = Workspace.from_config()

    # set the experiment
    experiment_name = 'test'
    exp = Experiment(workspace=ws, name=experiment_name)

    # One Run handle per requested run id.
    runs = [Run(exp, idx) for idx in args.runids]
    tb = Tensorboard(runs)
    tb.start()

    try:
        ## Wait for input to stop tensorboard.
        print('Enter to stop tensorboard')
        input()
    finally:
        # Stop tensorboard even if the prompt is interrupted.
        tb.stop()
def main():
    """
    Run the experiment for training

    Submits ``train_keras.py`` to the ``cpu-cluster`` compute target with
    the ``datasets/mnist`` files mounted as input, streams the run's logs to
    Tensorboard until the run completes and the user presses Enter, then
    registers ``outputs/keras_lenet.h5`` as the ``keras_mnist`` model.
    Tensorboard is stopped even when waiting for the run raises.
    """
    work_space = Workspace.from_config()

    # Set up the dataset for training
    datastore = work_space.get_default_datastore()
    dataset = Dataset.File.from_files(path=(datastore, "datasets/mnist"))

    # Set up the experiment for training
    experiment = Experiment(workspace=work_space, name="keras-lenet-train")
    # NOTE: the snapshot size override below was left disabled.
    #     azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 2000000000
    config = ScriptRunConfig(
        source_directory=".",
        script="train_keras.py",
        compute_target="cpu-cluster",
        arguments=[
            "--data_folder",
            dataset.as_named_input("input").as_mount(),
        ],
    )

    # Set up the TensorFlow/Keras environment
    environment = Environment("keras-environment")
    environment.python.conda_dependencies = CondaDependencies.create(
        python_version="3.7.7",
        pip_packages=["azureml-defaults", "numpy", "tensorflow==2.3.1"])
    config.run_config.environment = environment

    # Run the experiment for training
    run = experiment.submit(config)
    aml_url = run.get_portal_url()
    print(
        "Submitted to an Azure Machine Learning compute cluster. Click on the link below"
    )
    print("")
    print(aml_url)

    tboard = Tensorboard([run])
    # If successful, start() returns a string with the URI of the instance.
    tboard.start(start_browser=True)
    try:
        run.wait_for_completion(show_output=True)
        print("Press enter to stop")
        input()
    finally:
        # Be sure to stop() the streaming, otherwise it will continue to
        # run; this also covers a failed run or an interrupted prompt.
        tboard.stop()

    # Register Model
    metrics = run.get_metrics()
    # The last value of each metric is stored with the model.
    # NOTE(review): assumes every metric was logged as a sequence, so that
    # indexing with [-1] is valid; confirm against the training script.
    run.register_model(
        model_name="keras_mnist",
        tags={
            "data": "mnist",
            "model": "classification"
        },
        model_path="outputs/keras_lenet.h5",
        model_framework=Model.Framework.TENSORFLOW,
        model_framework_version="2.3.1",
        properties={
            "train_loss": metrics["train_loss"][-1],
            "train_accuracy": metrics["train_accuracy"][-1],
            "val_loss": metrics["val_loss"][-1],
            "val_accuracy": metrics["val_accuracy"][-1],
        },
    )
示例#8
0
def tensorboard():
    """Start Tensorboard for the run named in the parsed arguments."""
    # If successful, start() returns a string with the URI of the instance.
    Tensorboard([parse_args().run]).start()
#               callbacks=[
#                   AmlLogger(),
#                   tf.keras.callbacks.TensorBoard(update_freq='batch')]
#              )
# ```
#
# #### Launch Tensorboard
# The Azure ML service provides built-in integration with TensorBoard through the **azureml-tensorboard** package.
#
# While the run is in progress (or after it has completed), we can start TensorBoard with the run as its target, and it will begin streaming logs.

# %%
from azureml.tensorboard import Tensorboard

# The Tensorboard constructor takes a list of runs, so the single run is passed in as a one-element list here.
tb = Tensorboard([run2])

# If successful, start() returns a string with the URI of the instance.
# It is left as the last expression of the cell.
tb.start()

# %% [markdown]
# #### Stop Tensorboard
# When you're done, make sure to call the stop() method of the Tensorboard object; otherwise it will keep running even after your job completes.

# %%
tb.stop()

# %% [markdown]
# ## Check the model performance
#
# The last training run produced a model of decent accuracy. Let's test it out and see what it does. First, let's check what files our latest training run produced and download the model files.
示例#10
0
from azure.common.client_factory import get_client_from_cli_profile
from azure.mgmt.resource import SubscriptionClient
from azureml.core import Experiment
from azureml.core import Workspace
from azureml.core.authentication import AzureCliAuthentication
from azureml.tensorboard import Tensorboard

# Authenticate through the Azure CLI and use the first subscription that the
# CLI profile lists.
cli_auth = AzureCliAuthentication()
subscription_client = get_client_from_cli_profile(SubscriptionClient)
subscription_id = next(
    subscription_client.subscriptions.list()).subscription_id

ws = Workspace(
    subscription_id=subscription_id,
    resource_group="ds_envs_RG",
    workspace_name="ds_envs_ws",
    auth=cli_auth,
)
experiment_name = "my_experiment"
run_id = "my_experiment_1603471452_ed6739ca"
experiment = Experiment(workspace=ws, name=experiment_name)

# Stop scanning at the first matching run instead of listing all of them.
run = next((i for i in experiment.get_runs() if i.id == run_id), None)
if run is None:
    raise SystemExit(
        f"Run {run_id!r} was not found in experiment {experiment_name!r}")

tb = Tensorboard([run])
tb.start(start_browser=True)
try:
    input("Press Enter to continue...")
finally:
    # Stop Tensorboard even if the prompt is interrupted.
    tb.stop()