def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=2, validation_freq=1, batches_per_step=1)
    yield workload.terminate_workload(), [], workload.ignore_workload_response

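# Each item yielded by a workload.Stream is a (workload, args, response_fn)
# tuple: the harness runs the workload, then feeds the result back through
# response_fn (e.g. workload.ignore_workload_response simply discards it).
# A minimal sketch of a driver loop, with the executor passed in because the
# real one is harness-internal (drain and run_workload are illustrative
# names, not part of the actual API):
def drain(stream, run_workload):
    for wkld, args, respond in stream:
        respond(run_workload(wkld, *args))
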
def make_workloads_2() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=1, validation_freq=1)
    yield workload.checkpoint_workload(), [checkpoint_dir], workload.ignore_workload_response
    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=3, validation_freq=3, scheduling_unit=10)
    training_metrics = trainer.get_avg_training_metrics()
    _, validation_metrics = trainer.result()

    batch_size = self.hparams["global_batch_size"]

    for i, metrics in enumerate(training_metrics):
        expect = pytorch_onevar_model.TriangleLabelSum.expect(batch_size, 10 * i, 10 * (i + 1))
        assert "cls_reducer" in metrics
        assert metrics["cls_reducer"] == expect
        assert "fn_reducer" in metrics
        assert metrics["fn_reducer"] == expect

    for metrics in validation_metrics:
        num_batches = len(pytorch_onevar_model.OnesDataset()) // batch_size
        expect = pytorch_onevar_model.TriangleLabelSum.expect(batch_size, 0, num_batches)
        assert "cls_reducer" in metrics
        assert metrics["cls_reducer"] == expect
        assert "fn_reducer" in metrics
        assert metrics["fn_reducer"] == expect

    yield workload.terminate_workload(), [], workload.ignore_workload_response

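# TriangleLabelSum.expect presumably computes the label total over a range of
# batches in closed form. A sketch under the assumption that each record's
# label equals its dataset index, which is what makes the totals differences
# of triangle numbers (triangle_label_sum and tri are illustrative names):
def triangle_label_sum(batch_size, start_batch, end_batch):
    def tri(n):
        # Sum of 0 + 1 + ... + (n - 1).
        return n * (n - 1) // 2

    return tri(batch_size * end_batch) - tri(batch_size * start_batch)
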
def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=10, validation_freq=1, batches_per_step=100)
    training_metrics, validation_metrics = trainer.result()

    # We expect the validation error and training loss to be
    # monotonically decreasing.
    # TODO(DET-1597): actually use a model and optimizer where the losses
    # are monotonically decreasing.
    for older, newer in zip(training_metrics[::100], training_metrics[::100][1:]):
        assert newer["loss"] <= older["loss"]

    for older, newer in zip(validation_metrics, validation_metrics[1:]):
        assert newer["val_categorical_error"] <= older["val_categorical_error"]

    assert validation_metrics[-1]["val_categorical_error"] == pytest.approx(0.0)

    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=1, validation_freq=1, scheduling_unit=1)
    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=10, validation_freq=1, scheduling_unit=100)
    training_metrics, validation_metrics = trainer.result()

    # We expect the validation error and training loss to be
    # monotonically decreasing.
    # TODO(DET-1597): actually use a model and optimizer where the losses
    # are monotonically decreasing.
    for older, newer in zip(training_metrics[::100], training_metrics[::100][1:]):
        assert newer["loss"] <= older["loss"]

    for older, newer in zip(validation_metrics, validation_metrics[1:]):
        assert newer["val_categorical_error"] <= older["val_categorical_error"]

    epsilon = 0.0001
    assert abs(validation_metrics[-1]["val_categorical_error"]) < epsilon

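# The monotonicity assertions above are repeated verbatim across several of
# these snippets; a hypothetical helper (assert_monotonic is an invented
# name, not a utility in the test suite) could express them once:
def assert_monotonic(metrics, key, decreasing=True):
    for older, newer in zip(metrics, metrics[1:]):
        if decreasing:
            assert newer[key] <= older[key], f"{key} rose: {older[key]} -> {newer[key]}"
        else:
            assert newer[key] >= older[key], f"{key} fell: {older[key]} -> {newer[key]}"

# For example, mirroring the checks above:
#   assert_monotonic(training_metrics[::100], "loss")
#   assert_monotonic(validation_metrics, "val_categorical_error")
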
def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate(request_stop_step_id=request_stop_step_id)

    yield from trainer.send(steps=2, validation_freq=2, scheduling_unit=5)
    tm, vm = trainer.result()

def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=15, validation_freq=4, scheduling_unit=5)
    training_metrics, validation_metrics = trainer.result()

def make_workloads_2() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(
        steps=1,
        validation_freq=1,
        train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
    )

def make_workloads(tag: str) -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=1000, validation_freq=100)
    tm, vm = trainer.result()
    training_metrics[tag] = tm
    validation_metrics[tag] = vm

def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=1, validation_freq=1, train_batch_calls=1)
    training_metrics, validation_metrics = trainer.result()

    for metrics in validation_metrics:
        assert "loss" in metrics

def make_workloads() -> workload.Stream: trainer = utils.TrainAndValidate() yield from trainer.send(steps=10, validation_freq=10) training_metrics, validation_metrics = trainer.result() for metrics in training_metrics: assert "accuracy" in metrics
def make_workloads(checkpoint_dir: pathlib.Path) -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=10, validation_freq=5, batches_per_step=5)
    yield workload.checkpoint_workload(), [checkpoint_dir], workload.ignore_workload_response
    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads_1() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=1, validation_freq=1)

    interceptor = workload.WorkloadResponseInterceptor()
    yield from interceptor.send(workload.checkpoint_workload())
    nonlocal latest_checkpoint, steps_completed
    latest_checkpoint = interceptor.metrics_result()["uuid"]
    steps_completed = trainer.get_steps_completed()

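# WorkloadResponseInterceptor captures the response to a single workload
# instead of discarding it, which is how the snippet above recovers the
# checkpoint UUID. A simplified sketch of that pattern (an illustration of
# the idea, not Determined's actual implementation):
class SimpleInterceptor:
    def __init__(self):
        self._response = None

    def send(self, wkld):
        # Yield the workload with a response_fn that stores the result.
        yield wkld, [], self._capture

    def _capture(self, response):
        self._response = response

    def metrics_result(self):
        assert self._response is not None, "workload response not received yet"
        return self._response
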
def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate(request_stop_step_id=1)

    yield from trainer.send(steps=100, validation_freq=2, scheduling_unit=5)
    tm, vm = trainer.result()
    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads(tag: str) -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=900, validation_freq=100)
    tm, vm = trainer.result()
    training_metrics[tag] = tm
    validation_metrics[tag] = vm
    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads() -> workload.Stream: trainer = utils.TrainAndValidate() yield from trainer.send(steps=10, validation_freq=10) training_metrics, validation_metrics = trainer.result() for metrics in validation_metrics: assert "binary_error" in metrics assert "predictions" in metrics
def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate(request_stop_step_id=request_stop_step_id)

    yield from trainer.send(steps=2, validation_freq=2, batches_per_step=5)
    tm, vm = trainer.result()
    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=1, validation_freq=1, scheduling_unit=10)

    interceptor = workload.WorkloadResponseInterceptor()
    yield from interceptor.send(workload.checkpoint_workload())
    nonlocal latest_checkpoint
    latest_checkpoint = interceptor.metrics_result()["uuid"]

def make_workloads() -> workload.Stream: trainer = utils.TrainAndValidate() yield from trainer.send(steps=10, validation_freq=10) training_metrics, validation_metrics = trainer.result() for metrics in training_metrics: assert "categorical_accuracy" in metrics assert "predictions" in metrics yield workload.terminate_workload(), [], workload.ignore_workload_response
def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=1000, validation_freq=100)
    training_metrics, validation_metrics = trainer.result()

    # We expect the training loss to be monotonically decreasing.
    for older, newer in zip(training_metrics, training_metrics[1:]):
        assert newer["loss"] <= older["loss"]

    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(
        steps=10,
        validation_freq=10,
        train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
    )
    training_metrics, validation_metrics = trainer.result()

    for metrics in validation_metrics:
        assert "loss" in metrics

def make_workloads_1() -> workload.Stream: nonlocal old_loss trainer = utils.TrainAndValidate() yield from trainer.send(steps=10, validation_freq=10) training_metrics, validation_metrics = trainer.result() old_loss = validation_metrics[-1]["val_loss"] interceptor = workload.WorkloadResponseInterceptor() yield from interceptor.send(workload.checkpoint_workload()) nonlocal latest_checkpoint, steps_completed latest_checkpoint = interceptor.metrics_result()["uuid"] steps_completed = trainer.get_steps_completed()
def make_workloads_1() -> workload.Stream: nonlocal old_loss trainer = utils.TrainAndValidate() yield from trainer.send(steps=10, validation_freq=10) training_metrics, validation_metrics = trainer.result() old_loss = validation_metrics[-1]["val_loss"] yield workload.checkpoint_workload(), [ checkpoint_dir ], workload.ignore_workload_response yield workload.terminate_workload(), [], workload.ignore_workload_response
def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=100, validation_freq=10)
    training_metrics, validation_metrics = trainer.result()

    # Check the gradient update at every step.
    for idx, batch_metrics in enumerate(training_metrics):
        pytorch_onevar_model.OneVarTrial.check_batch_metrics(batch_metrics, idx)

    # We expect the training loss to be monotonically decreasing.
    for older, newer in zip(training_metrics, training_metrics[1:]):
        assert newer["loss"] <= older["loss"]

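# check_batch_metrics validates the per-batch values reported by the
# one-variable model. For plain SGD the weight update can be checked exactly;
# the sketch below assumes the metrics carry the weight before and after the
# step and the gradient (check_sgd_update and the key names are assumptions,
# not the actual implementation):
def check_sgd_update(batch_metrics, lr):
    w_before = batch_metrics["w_before"]
    w_after = batch_metrics["w_after"]
    grad = batch_metrics["grad"]
    # w' = w - lr * grad, up to floating-point error.
    assert abs(w_after - (w_before - lr * grad)) < 1e-6
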
def make_workloads(checkpoint_dir: str = "") -> workload.Stream: nonlocal training_metrics trainer = utils.TrainAndValidate() yield from trainer.send(steps=10, validation_freq=10, batches_per_step=1) tm, _ = trainer.result() training_metrics += tm if checkpoint_dir: yield workload.checkpoint_workload(), [ checkpoint_dir ], workload.ignore_workload_response yield workload.terminate_workload(), [], workload.ignore_workload_response
def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    # Test >1 validation to ensure that resetting the allgather_op list is working.
    yield from trainer.send(steps=2, validation_freq=1, scheduling_unit=1)
    training_metrics, validation_metrics = trainer.result()

    label_sum = estimator_linear_model.validation_label_sum()
    for metrics in validation_metrics:
        assert metrics["label_sum_tensor_fn"] == label_sum
        assert metrics["label_sum_tensor_cls"] == label_sum
        assert metrics["label_sum_list_fn"] == 2 * label_sum
        assert metrics["label_sum_list_cls"] == 2 * label_sum
        assert metrics["label_sum_dict_fn"] == 2 * label_sum
        assert metrics["label_sum_dict_cls"] == 2 * label_sum

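# The label_sum_* metrics come from custom reducers that receive their inputs
# as a tensor, a list, or a dict; the 2 * label_sum expectations suggest the
# list and dict variants carry two copies of the labels. A minimal sum-style
# reducer sketch, assuming an accumulate / cross_slot_reduce contract
# (LabelSumReducer is an illustrative name, not the test model's class):
import numpy as np

class LabelSumReducer:
    def __init__(self):
        self.total = 0.0

    def accumulate(self, labels):
        # Called per batch on each slot with that slot's labels.
        self.total += float(np.sum(labels))
        return self.total

    def cross_slot_reduce(self, per_slot_totals):
        # Called once with the final accumulated value from every slot.
        return sum(per_slot_totals)
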
def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=10, validation_freq=5, batches_per_step=1000)
    training_metrics, validation_metrics = trainer.result()

    # We expect the training loss to be monotonically decreasing and the
    # accuracy to be monotonically increasing.
    for older, newer in zip(training_metrics, training_metrics[1:]):
        assert newer["loss"] < older["loss"]

    for older, newer in zip(validation_metrics, validation_metrics[1:]):
        assert newer["accuracy"] >= older["accuracy"]

    # The final accuracy should be 100%.
    assert validation_metrics[-1]["accuracy"] == pytest.approx(1.0)

    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    # Test >1 validation to ensure that resetting the allgather_op list is working.
    yield from trainer.send(steps=2, validation_freq=1, batches_per_step=1)
    training_metrics, validation_metrics = trainer.result()

    for metrics in validation_metrics:
        assert metrics["label_sum_fn"] == estimator_linear_model.validation_label_sum()
        assert metrics["label_sum_cls"] == estimator_linear_model.validation_label_sum()

    yield workload.terminate_workload(), [], workload.ignore_workload_response

def make_workloads() -> workload.Stream:
    trainer = utils.TrainAndValidate()

    yield from trainer.send(steps=1, validation_freq=1, scheduling_unit=1)