def model_main(activation='relu', width=1000, depth=2, lr=0.5, **kwargs):
  """Main method; this sequences the basic steps:

  - create the model and optimizer,
  - train and record metrics,
  - plot all metrics.
  """
  print(f'Building model with width = {width}, learning rate = {lr}')
  model = build_model(activation, width, depth)
  optimizer = tf.optimizers.SGD(lr)

  with uv.start_run() as r:
    MLFlowReporter().report_params({
        **kwargs,
        **{
            "depth": depth,
            "width": width,
            "lr": lr,
            "activation": activation
        }
    })
    train_and_log(model, optimizer, **kwargs)


def test_report_param(mock_pubsub, reporter):
  with tempfile.TemporaryDirectory() as tmpdir:
    mlf.set_tracking_uri(f'file:{tmpdir}/foo')

    _reset_experiment()

    mlflow_cfg = {
        'experiment_name': 'foo',
        'run_name': 'bar',
        'artifact_location': '/foo/bar',
    }

    with uv.start_run(**mlflow_cfg) as active_run, uv.active_reporter(
        reporter()) as r:
      assert r is not None

      param = {'a': 3.14159}
      r.report_param(k='a', v=param['a'])

      assert mlf.active_run() == active_run

      # we need to access the run via a client to inspect most
      # params/tags/metrics
      client = mlf.tracking.MlflowClient()
      run = client.get_run(active_run.info.run_id)
      assert run is not None

      for k, v in param.items():
        p = run.data.params
        assert k in p
        assert p[k] == str(v)


def test_report_invalid(mock_pubsub, reporter, value):
  with tempfile.TemporaryDirectory() as tmpdir:
    mlf.set_tracking_uri(f'file:{tmpdir}/foo')

    _reset_experiment()

    mlflow_cfg = {
        'experiment_name': 'foo',
        'run_name': 'bar',
        'artifact_location': '/foo/bar',
    }

    with uv.start_run(**mlflow_cfg) as active_run, uv.active_reporter(
        reporter()) as r:
      assert r is not None

      steps = [{
          'step': 1,
          'm': {
              'a': value,
          }
      }]

      for p in steps:
        for k, v in p['m'].items():
          with pytest.raises(ValueError):
            r.report(step=p['step'], k=k, v=v)


def test_report(mock_pubsub, reporter):
  with tempfile.TemporaryDirectory() as tmpdir:
    mlf.set_tracking_uri(f'file:{tmpdir}/foo')

    _reset_experiment()

    mlflow_cfg = {
        'experiment_name': 'foo',
        'run_name': 'bar',
        'artifact_location': '/foo/bar',
    }

    with uv.start_run(**mlflow_cfg) as active_run, uv.active_reporter(
        reporter()) as r:
      assert r is not None

      steps = [{
          'step': 1,
          'm': {
              'a': 3,
              'b': 3.141
          }
      }, {
          'step': 2,
          'm': {
              'a': 6,
              'b': 6.282
          }
      }]

      for p in steps:
        for k, v in p['m'].items():
          r.report(step=p['step'], k=k, v=v)

      assert mlf.active_run() == active_run

      # we need to access the run via a client to inspect most
      # params/tags/metrics
      client = mlf.tracking.MlflowClient()
      run = client.get_run(active_run.info.run_id)
      assert run is not None

      metrics = run.data.metrics
      metric_data = {}

      # check that the metrics are in the run data
      for k, v in steps[0]['m'].items():
        assert k in metrics
        metric_data[k] = {
            x.step: x.value
            for x in client.get_metric_history(active_run.info.run_id, k)
        }

      for s in steps:
        cur_step = s['step']
        for k, v in s['m'].items():
          assert metric_data[k][cur_step] == v


def _run_experiments(experiment_name: str):
  '''performs a run for each of our parameter settings, creating the
  associated mlflow objects and logging parameters'''

  for i, p in enumerate(PARAMETERS):
    with uv.start_run(
        experiment_name=experiment_name,
        run_name=f'run_{i}',
    ):
      uv.report_params(p)
      _compute(**p)


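# `_run_experiments` above relies on two module-level names defined elsewhere:
# `PARAMETERS` (an iterable of keyword-argument dicts, one per run) and
# `_compute` (the work performed for each setting). A minimal, hypothetical
# sketch of what they could look like follows; the argument names and values
# here are illustrative assumptions, not taken from the original source.
PARAMETERS = [
    {'learning_rate': 0.1, 'num_steps': 100},
    {'learning_rate': 0.01, 'num_steps': 100},
]


def _compute(learning_rate, num_steps):
  # Placeholder computation; a real `_compute` would run the experiment and
  # report metrics for the active run.
  return learning_rate * num_steps

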
def test_report_params(mock_pubsub, reporter):
  with tempfile.TemporaryDirectory() as tmpdir:
    mlf.set_tracking_uri(f'file:{tmpdir}/foo')

    _reset_experiment()

    mlflow_cfg = {
        'experiment_name': 'foo',
        'run_name': 'bar',
        'artifact_location': '/foo/bar',
    }

    with uv.start_run(**mlflow_cfg) as active_run, uv.active_reporter(
        reporter()) as r:
      assert r is not None

      params = {
          'a': 3,
          'b': 'string_param',
          INVALID_KEY: INVALID_PARAM_VALUE,
      }

      r.report_params(params)

      assert mlf.active_run() == active_run

      # we need to access the run via a client to inspect most
      # params/tags/metrics
      client = mlf.tracking.MlflowClient()
      run = client.get_run(active_run.info.run_id)
      assert run is not None

      for k, v in params.items():
        if k == INVALID_KEY:
          k = SANITIZED_KEY
        if v == INVALID_PARAM_VALUE:
          v = SANITIZED_PARAM_VALUE
        p = run.data.params
        assert k in p
        assert p[k] == str(v)


def test_start_run(monkeypatch):
  with tempfile.TemporaryDirectory() as tmpdir:
    mlf.set_tracking_uri(f'file:{tmpdir}/foo')

    # no run should be active initially
    assert mlf.active_run() is None

    # test default args
    with uv.start_run() as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r

    # test explicit experiment name, run name, artifact location
    cfg = {
        'experiment_name': 'foo',
        'run_name': 'bar',
        'artifact_location': 'gs://foo/bar',
    }

    with uv.start_run(**cfg) as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r
      assert r.data.tags['mlflow.runName'] == cfg['run_name']
      assert mlf.get_experiment_by_name(cfg['experiment_name']) is not None
      assert mlf.get_artifact_uri().startswith(cfg['artifact_location'])

    # test env var experiment name, run name, artifact location
    cfg = {
        'MLFLOW_EXPERIMENT_NAME': 'env_foo',
        'MLFLOW_RUN_NAME': 'env_bar',
        'MLFLOW_ARTIFACT_ROOT': 'gs://env/foo/bar'
    }

    for k, v in cfg.items():
      monkeypatch.setenv(k, v)

    with uv.start_run() as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r
      assert r.data.tags['mlflow.runName'] == cfg['MLFLOW_RUN_NAME']
      assert mlf.get_experiment_by_name(
          cfg['MLFLOW_EXPERIMENT_NAME']) is not None
      assert mlf.get_artifact_uri().startswith(cfg['MLFLOW_ARTIFACT_ROOT'])

    for k, v in cfg.items():
      monkeypatch.delenv(k)

    # test env var tags
    cfg = {
        'tag0': 'foo',
        'tag1': 'bar',
    }

    for k, v in cfg.items():
      monkeypatch.setenv(f'ENVVAR_{k}', v)

    with uv.start_run() as r:
      client = mlf.tracking.MlflowClient()
      tags = client.get_run(r.info.run_id).data.tags
      for k, v in cfg.items():
        assert k in tags, pp.pformat(tags)
        assert tags[k] == v, pp.pformat(tags)

    for k in cfg:
      monkeypatch.delenv(f'ENVVAR_{k}')

    # test CAIP tags
    monkeypatch.setenv('CLOUD_ML_JOB_ID', 'foo_cloud_job')

    with uv.start_run() as r:
      client = mlf.tracking.MlflowClient()
      tags = client.get_run(r.info.run_id).data.tags
      assert 'cloud_ml_job_details' in tags, pp.pformat(tags)
      assert tags['cloud_ml_job_details'] == (
          'https://console.cloud.google.com/ai-platform/jobs/foo_cloud_job')
      assert 'cloud_ml_job_id' in tags, pp.pformat(tags)
      assert tags['cloud_ml_job_id'] == 'foo_cloud_job'

    monkeypatch.delenv('CLOUD_ML_JOB_ID')


def main(**kwargs):
  with uv.start_run(), uv.active_reporter(MLFlowReporter()):
    uv.report_params(kwargs)
    _run_training(**kwargs)


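# Hypothetical invocation of `main` above, for illustration only: whatever
# keyword arguments the caller passes are logged as mlflow parameters via
# `uv.report_params` and then forwarded to `_run_training`. The argument
# names below are assumptions about `_run_training`'s signature.
if __name__ == '__main__':
  main(learning_rate=0.01, batch_size=32, epochs=5)

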
def test_start_run(monkeypatch):
  _reset_experiment()

  with tempfile.TemporaryDirectory() as tmpdir:
    mlf.set_tracking_uri(f'file:{tmpdir}/foo')

    # no run should be active initially
    assert mlf.active_run() is None

    # test default args
    with uv.start_run() as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r

    # test explicit experiment name, run name, artifact location
    cfg = {
        'experiment_name': 'experiment_0',
        'run_name': 'bar',
        'artifact_location': '/foo/bar',
    }

    with uv.start_run(**cfg) as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r
      assert r.data.tags['mlflow.runName'] == cfg['run_name']
      assert mlf.get_experiment_by_name(cfg['experiment_name']) is not None
      assert mlf.get_artifact_uri().startswith(cfg['artifact_location'])

    # test env var experiment name, run name, path-based artifact location
    cfg = {
        'MLFLOW_EXPERIMENT_NAME': 'env_foo',
        'MLFLOW_RUN_NAME': 'env_bar',
        'MLFLOW_ARTIFACT_ROOT': '/tmp/foo/bar'
    }

    for k, v in cfg.items():
      monkeypatch.setenv(k, v)

    with uv.start_run() as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r
      assert r.data.tags['mlflow.runName'] == cfg['MLFLOW_RUN_NAME']
      assert mlf.get_experiment_by_name(
          cfg['MLFLOW_EXPERIMENT_NAME']) is not None
      assert mlf.get_artifact_uri().startswith(cfg['MLFLOW_ARTIFACT_ROOT'])

    for k, v in cfg.items():
      monkeypatch.delenv(k)

    # test env var tags
    cfg = {
        'tag0': 'foo',
        'tag1': 'bar',
    }

    for k, v in cfg.items():
      monkeypatch.setenv(f'ENVVAR_{k}', v)

    with uv.start_run() as r:
      client = mlf.tracking.MlflowClient()
      tags = client.get_run(r.info.run_id).data.tags
      for k, v in cfg.items():
        assert k in tags, pp.pformat(tags)
        assert tags[k] == v, pp.pformat(tags)

    for k in cfg:
      monkeypatch.delenv(f'ENVVAR_{k}')

    # test CAIP tags
    monkeypatch.setenv('CLOUD_ML_JOB_ID', 'foo_cloud_job')

    with uv.start_run() as r:
      client = mlf.tracking.MlflowClient()
      tags = client.get_run(r.info.run_id).data.tags
      assert 'cloud_ml_job_details' in tags, pp.pformat(tags)
      assert tags['cloud_ml_job_details'] == (
          'https://console.cloud.google.com/ai-platform/jobs/foo_cloud_job')
      assert 'cloud_ml_job_id' in tags, pp.pformat(tags)
      assert tags['cloud_ml_job_id'] == 'foo_cloud_job'

    monkeypatch.delenv('CLOUD_ML_JOB_ID')

    # test case where no gcp project is set with gcs artifact store
    def mock_default(scopes=None, request=None, quota_project_id=None):
      return (google.auth.credentials.AnonymousCredentials(), None)

    monkeypatch.setattr('google.auth.default', mock_default)

    cfg = {
        'experiment_name': 'experiment_1',
        'run_name': 'bar',
        'artifact_location': 'gs://foo/bar',
    }

    with uv.start_run(**cfg) as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r
      assert r.data.tags['mlflow.runName'] == cfg['run_name']
      assert mlf.get_experiment_by_name(cfg['experiment_name']) is not None
      assert mlf.get_artifact_uri().startswith(
          cfg['artifact_location']), mlf.get_artifact_uri()
      assert os.environ.get('GOOGLE_CLOUD_PROJECT') is not None

    # test case where gcp project is set with gcs artifact storage
    def mock_default(scopes=None, request=None, quota_project_id=None):
      return (google.auth.credentials.AnonymousCredentials(), 'test_project')

    monkeypatch.setattr('google.auth.default', mock_default)

    cfg = {
        'experiment_name': 'experiment_2',
        'run_name': 'bar',
        'artifact_location': 'gs://foo/bar',
    }

    with uv.start_run(**cfg) as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r
      assert r.data.tags['mlflow.runName'] == cfg['run_name']
      assert mlf.get_experiment_by_name(cfg['experiment_name']) is not None
      assert mlf.get_artifact_uri().startswith(
          cfg['artifact_location']), mlf.get_artifact_uri()

    # test using existing experiment with different artifact location
    # - this should use original artifact location
    cfg = {
        'experiment_name': 'experiment_2',
        'run_name': 'bar2',
        'artifact_location': '/a/b/c',
    }

    with uv.start_run(**cfg) as r:
      active_run = mlf.active_run()
      assert active_run is not None
      assert active_run == r
      assert r.data.tags['mlflow.runName'] == cfg['run_name']
      assert mlf.get_experiment_by_name(cfg['experiment_name']) is not None
      assert not mlf.get_artifact_uri().startswith(
          cfg['artifact_location']), mlf.get_artifact_uri()


def test_log_metrics(mock_pubsub):
  '''tests the log_metrics() function in mlflow_subscriber'''

  # test exception handling by passing invalid message and mlflow client
  log_metrics(None, None, True)

  with tempfile.TemporaryDirectory() as tmpdir:
    mlf.set_tracking_uri(f'file:{tmpdir}/foo')

    _reset_experiment()

    mlflow_cfg = {
        'experiment_name': 'foo',
        'run_name': 'bar',
        'artifact_location': '/foo/bar',
    }

    with uv.start_run(**mlflow_cfg), uv.active_reporter(
        MLFlowPubsubReporter('p', 't')) as r:
      active_run = mlf.active_run()

      steps = [{
          'step': 1,
          'm': {
              'a': 3,
              'b': 3.141
          }
      }, {
          'step': 2,
          'm': {
              'a': 6,
              'b': 6.282
          }
      }]

      client = mlf.tracking.MlflowClient()

      assert len(_MSG_QUEUE) == 0

      for s in steps:
        r.report_all(**s)

      assert len(_MSG_QUEUE) == len(steps)

      for m in _MSG_QUEUE:
        log_metrics(client, m, True)

      run = client.get_run(active_run.info.run_id)
      assert run is not None

      metrics = run.data.metrics
      metric_data = {}

      # check that the metrics are in the run data
      for k, v in steps[0]['m'].items():
        assert k in metrics
        metric_data[k] = {
            x.step: x.value
            for x in client.get_metric_history(active_run.info.run_id, k)
        }

      for s in steps:
        cur_step = s['step']
        for k, v in s['m'].items():
          assert metric_data[k][cur_step] == v


def main(_):
  """Builds and trains a sentiment classification RNN."""

  # prevent tf from accessing GPU
  tf.config.experimental.set_visible_devices([], "GPU")

  # Get and save config
  config = argparser.parse_args('main')
  logging.info(json.dumps(config, indent=2))

  with uv.start_run(
      experiment_name=config['save']['mlflow_expname'],
      run_name=config['save']['mlflow_runname']), uv.active_reporter(
          MLFlowReporter()):

    reporters.save_config(config)
    uv.report_params(reporters.flatten(config))

    prng_key = random.PRNGKey(config['run']['seed'])

    # Load data.
    vocab_size, train_dset, test_dset = data.get_dataset(config['data'])

    # Build network.
    cell = model_utils.get_cell(config['model']['cell_type'],
                                num_units=config['model']['num_units'])

    init_fun, apply_fun, _, _ = network.build_rnn(
        vocab_size, config['model']['emb_size'], cell,
        config['model']['num_outputs'])

    loss_fun, acc_fun = optim_utils.loss_and_accuracy(apply_fun,
                                                      config['model'],
                                                      config['optim'])

    _, initial_params = init_fun(
        prng_key, (config['data']['batch_size'], config['data']['max_pad']))

    initial_params = model_utils.initialize(initial_params, config['model'])

    # get optimizer
    opt, get_params, opt_state, step_fun = optim_utils.optimization_suite(
        initial_params, loss_fun, config['optim'])

    ## Scope setup
    # Reporter setup
    data_store = {}
    reporter = reporters.build_reporters(config['save'], data_store)

    # Static state for scope
    static_state = {
        'acc_fun': acc_fun,
        'loss_fun': loss_fun,
        'param_extractor': get_params,
        'test_set': test_dset
    }

    oscilloscope = m.MetricCallback(static_state)

    def interval_trigger(interval):

      def function_to_return(x):
        return x % interval == 0

      return function_to_return

    oscilloscope.add_measurement({
        'name': 'test_acc',
        'trigger': interval_trigger(config['save']['measure_test']),
        'function': measurements.measure_test_acc
    })

    oscilloscope.add_measurement({
        'name': 'shuffled_test_acc',
        'trigger': interval_trigger(config['save']['measure_test']),
        'function': measurements.measure_shuffled_acc
    })

    oscilloscope.add_measurement({
        'name': 'train_acc',
        'trigger': interval_trigger(config['save']['measure_train']),
        'function': measurements.measure_batch_acc
    })

    oscilloscope.add_measurement({
        'name': 'train_loss',
        'trigger': interval_trigger(config['save']['measure_train']),
        'function': measurements.measure_batch_loss
    })

    oscilloscope.add_measurement({
        'name': 'l2_norm',
        'trigger': interval_trigger(config['save']['measure_test']),
        'function': measurements.measure_l2_norm
    })

    # Train
    global_step = 0
    loss = np.nan
    for epoch in range(config['optim']['num_epochs']):
      for batch_num, batch in enumerate(tfds.as_numpy(train_dset)):

        dynamic_state = {
            'opt_state': opt_state,
            'batch_train_loss': loss,
            'batch': batch
        }

        step_measurements = oscilloscope.measure(int(global_step),
                                                 dynamic_state)
        if step_measurements is not None:
          reporter.report_all(int(global_step), step_measurements)

        global_step, opt_state, loss = step_fun(global_step, opt_state, batch)

        if global_step % config['save']['checkpoint_interval'] == 0:
          params = get_params(opt_state)
          np_params = np.asarray(params, dtype=object)
          reporters.save_dict(config, np_params, f'checkpoint_{global_step}')

    final_measurements = oscilloscope.measure(
        int(global_step),
        dynamic_state,
        measurement_list=['test_acc', 'shuffled_test_acc'])
    reporter.report_all(int(global_step), final_measurements)

    final_params = {'params': np.asarray(get_params(opt_state), dtype=object)}
    reporters.save_dict(config, final_params, 'final_params')