Example #1
    def test_end_to_end(self):
        with open("keepsake.yaml", "w") as f:
            f.write('repository: "file://.keepsake"')

        with open("foo.txt", "w") as f:
            f.write("foo")
        with open("bar.txt", "w") as f:
            f.write("bar")

        experiment = keepsake.init(path=".",
                                   params={
                                       "myint": 10,
                                       "myfloat": 0.1
                                   })

        with open("bar.txt", "w") as f:
            f.write("barrrr")

        experiment.checkpoint(path="bar.txt", metrics={"value": 123.45})

        experiment = keepsake.experiments.get(experiment.id)
        self.assertEqual(10, experiment.params["myint"])
        self.assertEqual(0.1, experiment.params["myfloat"])
        self.assertEqual(123.45, experiment.checkpoints[0].metrics["value"])

        foo = experiment.checkpoints[0].open("foo.txt")
        self.assertEqual("foo", foo.read().decode("utf-8"))
        bar = experiment.checkpoints[0].open("bar.txt")
        self.assertEqual("barrrr", bar.read().decode("utf-8"))

        with self.assertRaises(ImportError):
            experiment.plot("value")
Example #2
    def test_real_example(self) -> None:
        experiment = keepsake.init()
        checkpoints = [
            (10000, 0.42, 1.34),
            (20000, 0.56, 0.17),
            (30000, 0.59363, 0.10),
            (40000, 0.58, 0.076),
            (50000, 0.61, 0.06),
            (60000, 0.61, 0.04),
            (70000, 0.61, 0.04),
            (80000, 0.61, 0.03),
            (90000, 0.62, 0.02),
            (100000, 0.61, 0.02),
        ]
        for step, eval_score, train_score in checkpoints:
            experiment.checkpoint(
                step=step,
                metrics={"eval": eval_score, "train": train_score},
                primary_metric=("eval", "maximize"),
            )

        self.assertEqual(
            train_model.checkpoints_to_delete(experiment),
            [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],
        )
        experiment.delete()
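This test exercises train_model.checkpoints_to_delete without showing it. A minimal sketch that is consistent with the assertions above (a hypothetical reimplementation, not the project's actual code; it assumes the primary metric is "eval", maximized, as in the test) keeps the checkpoint with the best primary metric plus the most recent one and returns every other step:

def checkpoints_to_delete(experiment):
    # Hypothetical sketch: keep the best checkpoint (by primary metric) and
    # the most recent checkpoint; every other step may be deleted.
    checkpoints = experiment.checkpoints
    if len(checkpoints) < 2:
        return []
    best = max(checkpoints, key=lambda c: c.metrics["eval"])
    latest = checkpoints[-1]
    keep = {best.id, latest.id}
    return [c.step for c in checkpoints if c.id not in keep]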
Example #3
    def test_only_one_checkpoint(self) -> None:
        experiment = keepsake.init(params={})
        experiment.checkpoint(
            step=100,
            metrics={"eval_execution_f1": 0.7},
            primary_metric=("eval_execution_f1", "maximize"),
        )
        self.assertEqual(train_model.checkpoints_to_delete(experiment), [])
        experiment.delete()
Example #4
def test_project_repository_version(temp_workdir):
    with open("keepsake.yaml", "w") as f:
        f.write("repository: file://.keepsake")
    experiment = keepsake.init()

    expected = """{"version":1}"""
    with open(".keepsake/repository.json") as f:
        assert f.read() == expected

    # no error on second init
    experiment = keepsake.init()
    with open(".keepsake/repository.json") as f:
        # repository.json shouldn't have changed
        assert f.read() == expected

    with open(".keepsake/repository.json", "w") as f:
        f.write("""{"version":2}""")
    with pytest.raises(IncompatibleRepositoryVersion):
        keepsake.init()
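The behaviour this test pins down can be summarised in a small sketch (hypothetical, with a stand-in exception class; keepsake's real implementation lives elsewhere): the first init writes {"version":1} to repository.json, later inits leave it unchanged, and a higher version is rejected.

import json
import os


class IncompatibleRepositoryVersion(Exception):
    """Stand-in for keepsake's exception of the same name."""


def check_repository_version(repository_root=".keepsake"):
    # Hypothetical sketch of the check exercised above.
    path = os.path.join(repository_root, "repository.json")
    if not os.path.exists(path):
        os.makedirs(repository_root, exist_ok=True)
        with open(path, "w") as f:
            f.write('{"version":1}')
        return
    with open(path) as f:
        version = json.load(f)["version"]
    if version > 1:
        raise IncompatibleRepositoryVersion(
            "repository version {} is not supported".format(version))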
Example #5
def test_is_running(temp_workdir):
    with open("keepsake.yaml", "w") as f:
        f.write("repository: file://.keepsake/")

    experiment = keepsake.init()

    heartbeat_path = f".keepsake/metadata/heartbeats/{experiment.id}.json"

    assert wait(lambda: os.path.exists(heartbeat_path),
                timeout_seconds=10,
                sleep_seconds=0.01)

    # Check whether experiment is running after heartbeats are started
    assert experiment.is_running()

    # Heartbeats stopped
    experiment.stop()
    assert not experiment.is_running()
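The wait helper used here (and again in Example #9) is not shown; the tests presumably import it from a utility such as the waiting package. A self-contained stand-in that matches how it is called:

import time


def wait(predicate, timeout_seconds, sleep_seconds):
    # Poll `predicate` until it returns something truthy or the timeout
    # elapses; the real helper may raise on timeout instead of returning False.
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(sleep_seconds)
    return False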
Example #6
    def test_integration(self) -> None:
        experiment = keepsake.init()
        checkpoints = [
            (10000, 0.42, 1.34),
            (20000, 0.56, 0.17),
            (30000, 0.59363, 0.10),
            (40000, 0.58, 0.076),
            (50000, 0.61, 0.06),
            (60000, 0.61, 0.04),
            (70000, 0.61, 0.04),
            (80000, 0.61, 0.03),
            (90000, 0.62, 0.02),
            (100000, 0.61, 0.02),
        ]
        for step, eval_score, train_score in checkpoints:
            experiment.checkpoint(
                step=step,
                metrics={"eval": eval_score, "train": train_score},
                primary_metric=("eval", "maximize"),
            )

        self.assertEqual(
            train_model.checkpoints_to_delete(experiment),
            [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            self._make_checkpoint_helper(tmpdir, 10000)
            self._make_checkpoint_helper(tmpdir, 20000)
            self._make_checkpoint_helper(tmpdir, 30000)
            self._make_checkpoint_helper(tmpdir, 40000)
            self._make_checkpoint_helper(tmpdir, 50000)
            self._make_checkpoint_helper(tmpdir, 60000)
            self._make_checkpoint_helper(tmpdir, 70000)
            self._make_checkpoint_helper(tmpdir, 80000)
            files = self._make_checkpoint_helper(tmpdir, 90000)
            files += self._make_checkpoint_helper(tmpdir, 100000)

            for step in train_model.checkpoints_to_delete(experiment):
                train_model.delete_checkpoint(tmpdir, step)

            self.assertEqual(set(os.listdir(tmpdir)), set(files))

        experiment.delete()
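test_integration also depends on a checkpoint-file helper and on train_model.delete_checkpoint, neither of which is shown. Hypothetical sketches consistent with the assertions above (the ckpt-<step>.* naming is an assumption, and `import os` is assumed at module level):

    def _make_checkpoint_helper(self, tmpdir, step):
        # Hypothetical: write the files a checkpoint for `step` would produce
        # in tmpdir and return their names.
        files = ["ckpt-{}.index".format(step),
                 "ckpt-{}.data-00000-of-00001".format(step)]
        for name in files:
            with open(os.path.join(tmpdir, name), "w") as f:
                f.write("weights")
        return files

# In train_model (hypothetical):
def delete_checkpoint(model_dir, step):
    # Remove every file belonging to the checkpoint saved at `step`.
    prefix = "ckpt-{}.".format(step)
    for name in os.listdir(model_dir):
        if name.startswith(prefix):
            os.remove(os.path.join(model_dir, name))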
Example #7
def train(learning_rate, num_epochs):
    # highlight-start
    # Create an "experiment". This represents a run of your training script.
    # It saves the training code at the given path and any hyperparameters.
    experiment = keepsake.init(
        path=".",
        params={
            "learning_rate": learning_rate,
            "num_epochs": num_epochs
        },
    )
    # highlight-end

    print("Downloading data set...")
    iris = load_iris()
    train_features, val_features, train_labels, val_labels = train_test_split(
        iris.data,
        iris.target,
        train_size=0.8,
        test_size=0.2,
        random_state=0,
        stratify=iris.target,
    )
    train_features = torch.FloatTensor(train_features)
    val_features = torch.FloatTensor(val_features)
    train_labels = torch.LongTensor(train_labels)
    val_labels = torch.LongTensor(val_labels)

    torch.manual_seed(0)
    model = nn.Sequential(
        nn.Linear(4, 15),
        nn.ReLU(),
        nn.Linear(15, 3),
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(train_features)
        loss = criterion(outputs, train_labels)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            model.eval()
            output = model(val_features)
            acc = (output.argmax(1)
                   == val_labels).float().sum() / len(val_labels)

        print(
            "Epoch {}, train loss: {:.3f}, validation accuracy: {:.3f}".format(
                epoch, loss.item(), acc))
        torch.save(model, "model.pth")
        # highlight-start
        # Create a checkpoint within the experiment.
        # This saves the metrics at that point, and makes a copy of the file
        # or directory given, which could include weights and any other artifacts.
        experiment.checkpoint(
            path="model.pth",
            step=epoch,
            metrics={
                "loss": loss.item(),
                "accuracy": acc
            },
            primary_metric=("loss", "minimize"),
        )
        # highlight-end
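After a run of this script, the saved experiment can be read back with the same Python API used in Example #1 (the experiment ID below is a placeholder for the value of experiment.id):

import keepsake

# Hypothetical follow-up, reusing only calls shown in Example #1.
experiment = keepsake.experiments.get("<experiment id>")
print(experiment.params["learning_rate"])
for checkpoint in experiment.checkpoints:
    print(checkpoint.step,
          checkpoint.metrics["loss"],
          checkpoint.metrics["accuracy"])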
Example #8
def main(unused_argv: Any) -> None:
    tf.logging.info("Saving model saves and results to " + FLAGS.model_dir)

    global_seed(42)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train`, `do_eval` must be True.")

    config = model_config.load_config(FLAGS.config)

    if FLAGS.do_train:
        tf.logging.info("Training with train filenames: " +
                        str(FLAGS.training_filename))

    # Training allows noisy examples so do not use clean output vocab
    model_fn = model_builder.build_model_fn(config,
                                            FLAGS.output_vocab_filepath,
                                            clean_output_vocab_path="")

    # region training
    if FLAGS.do_train:
        # for keepsake CLI (helps track experiment results)
        experiment = keepsake.init(params={
            "learning_rate": config.training_options.optimizer_learning_rate,
            "batch_size": config.training_options.batch_size,
            "training_steps": config.training_options.training_steps,
            "eval_batch_size": FLAGS.eval_batch_size,
            "training_data": FLAGS.training_filename,
            "eval_data": FLAGS.eval_filename,
        }, )

        train_input_fn = input_pipeline.create_training_input_fn(
            config,
            FLAGS.tf_examples_dir,
            [name for name in FLAGS.training_filename if name],
        )

        train_features, train_labels = train_input_fn()
        train_model = model_fn(train_features, train_labels,
                               tf.estimator.ModeKeys.TRAIN)

        tf.get_variable_scope().reuse_variables()

        inference_config = inference.Config(
            FLAGS.eval_dataset_name,
            FLAGS.eval_splits.split(","),
            FLAGS.output_vocab_filepath,
            FLAGS.clean_output_vocab_filepath,
            FLAGS.eval_beam_size,
            FLAGS.using_abstract_sql,
            FLAGS.database_directory,
            FLAGS.empty_database_directory,
            FLAGS.original_data_directory,
            model_config.load_config(FLAGS.config),
        )

        saver = tf.train.Saver(max_to_keep=None)

        global_step = 0
        checkpoint = checkpoint_path(FLAGS.model_dir, global_step)

        validation_query_cache: Dict[str, Any] = {}

        with tf.Session() as init_sess:
            init_sess.run(tf.global_variables_initializer())
            saver.save(init_sess, checkpoint)

        while global_step < config.training_options.training_steps:
            # region training loop
            with tf.Session() as train_sess:
                tf.logging.info(
                    "Training from step %s to step %s",
                    global_step,
                    global_step + FLAGS.steps_between_saves,
                )
                saver.restore(train_sess, checkpoint)

                train_losses = []

                for step in range(FLAGS.steps_between_saves):
                    _, train_loss = train_sess.run(
                        [train_model.train_op, train_model.loss])

                    train_losses.append(train_loss)

                    if step % 100 == 0:
                        tf.logging.info(
                            "Step %s's training loss: %s",
                            global_step + step,
                            train_loss,
                        )

                train_loss = statistics.mean(train_losses)

                global_step += FLAGS.steps_between_saves
                checkpoint = checkpoint_path(FLAGS.model_dir, global_step)
                saver.save(train_sess, checkpoint)
            # endregion

            # region eval loop
            tf.logging.info("Evaluating checkpoint %s", checkpoint)

            examples = inference.load_tf_examples(
                os.path.join(FLAGS.tf_examples_dir, FLAGS.eval_filename))
            random.shuffle(examples)

            tf.logging.info("Running inference on %s", FLAGS.eval_filename)
            predictions = inference.inference(
                examples,
                checkpoint,
                inference_config,
            )

            examples_to_execute = get_examples_to_execute(
                predictions, inference_config)

            # Only update cache when it's empty
            should_update_cache = len(validation_query_cache) == 0

            # Scholar is the only dataset that is not case sensitive
            case_sensitive = "scholar" not in FLAGS.eval_dataset_name.lower()

            results, validation_query_cache = official_evaluation.execute_predictions(
                instructions=examples_to_execute,
                cache_dict=validation_query_cache,
                case_sensitive=case_sensitive,
                verbose=False,
                update_cache=should_update_cache,
            )

            metrics = official_evaluation.aggregate_metrics(
                results, FLAGS.use_empty_tables)
            tf.logging.info("Validation Results:\n\tExecution F1: %s",
                            metrics.execution_f1)
            # endregion

            experiment.checkpoint(
                step=global_step,
                metrics={
                    "train_loss": train_loss,
                    "eval_execution_f1": metrics.execution_f1,
                    "eval_string_match": metrics.string_same,
                },
                primary_metric=("eval_execution_f1", "maximize"),
            )

            # region disk management

            for step in checkpoints_to_delete(experiment):
                assert (
                    step != global_step
                ), f"Can't delete step {step}; need it for next training epoch starting at step {global_step}"
                print(f"Deleting checkpoint {step}")
                delete_checkpoint(FLAGS.model_dir, step)
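The checkpoint_path helper used throughout this script is not shown; a minimal sketch of what the script needs from it (the exact naming scheme is an assumption):

def checkpoint_path(model_dir, step):
    # Hypothetical: a deterministic per-step prefix for tf.train.Saver.
    return os.path.join(model_dir, "ckpt-{}".format(step))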
Example #9
def test_init_and_checkpoint(temp_workdir):
    with open("keepsake.yaml", "w") as f:
        f.write("repository: file://.keepsake/")

    with open("train.py", "w") as fh:
        fh.write("print(1 + 1)")

    with open("README.md", "w") as fh:
        fh.write("Hello")

    # basic experiment
    experiment = keepsake.init(path=".",
                               params={"learning_rate": 0.002},
                               disable_heartbeat=True)

    experiment_tar_path = ".keepsake/experiments/{}.tar.gz".format(
        experiment.id)
    wait(
        lambda: os.path.exists(experiment_tar_path),
        timeout_seconds=5,
        sleep_seconds=0.01,
    )
    time.sleep(0.1)  # wait for file to be written

    assert len(experiment.id) == 64
    with open(".keepsake/metadata/experiments/{}.json".format(
            experiment.id)) as fh:
        metadata = json.load(fh)
    assert metadata["id"] == experiment.id
    assert metadata["params"] == {"learning_rate": 0.002}
    assert metadata["host"] == ""
    assert metadata["user"] != ""
    # FIXME: this is broken https://github.com/replicate/keepsake/issues/492
    assert metadata["config"]["repository"].startswith("file://")
    assert metadata["command"] != ""
    assert metadata["path"] == "."
    assert metadata["python_version"] != ""
    assert len(metadata["python_packages"]) > 0
    assert metadata["keepsake_version"] != ""

    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(experiment_tar_path) as tar:
            tar.extractall(tmpdir)

        assert (open(os.path.join(tmpdir, experiment.id,
                                  "train.py")).read() == "print(1 + 1)")
        assert os.path.exists(os.path.join(tmpdir, experiment.id, "README.md"))

    # checkpoint with a file
    with open("weights", "w") as fh:
        fh.write("1.2kg")

    checkpoint = experiment.checkpoint(path="weights",
                                       step=1,
                                       metrics={"validation_loss": 0.123})

    checkpoint_tar_path = ".keepsake/checkpoints/{}.tar.gz".format(
        checkpoint.id)
    wait(
        lambda: os.path.exists(checkpoint_tar_path),
        timeout_seconds=5,
        sleep_seconds=0.01,
    )
    time.sleep(0.1)  # wait for file to be written

    assert len(checkpoint.id) == 64
    with open(".keepsake/metadata/experiments/{}.json".format(
            experiment.id)) as fh:
        metadata = json.load(fh)
    assert len(metadata["checkpoints"]) == 1
    checkpoint_metadata = metadata["checkpoints"][0]
    assert checkpoint_metadata["id"] == checkpoint.id
    assert checkpoint_metadata["step"] == 1
    assert checkpoint_metadata["metrics"] == {"validation_loss": 0.123}

    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(checkpoint_tar_path) as tar:
            tar.extractall(tmpdir)

        assert open(os.path.join(tmpdir, checkpoint.id,
                                 "weights")).read() == "1.2kg"
        assert not os.path.exists(
            os.path.join(tmpdir, checkpoint.id, "train.py"))

    # checkpoint with a directory
    os.mkdir("data")
    with open("data/weights", "w") as fh:
        fh.write("1.3kg")

    checkpoint = experiment.checkpoint(path="data",
                                       step=1,
                                       metrics={"validation_loss": 0.123})

    checkpoint_tar_path = ".keepsake/checkpoints/{}.tar.gz".format(
        checkpoint.id)
    wait(
        lambda: os.path.exists(checkpoint_tar_path),
        timeout_seconds=5,
        sleep_seconds=0.01,
    )
    time.sleep(0.1)  # wait for file to be written

    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(checkpoint_tar_path) as tar:
            tar.extractall(tmpdir)

        assert (open(os.path.join(tmpdir, checkpoint.id,
                                  "data/weights")).read() == "1.3kg")
        assert not os.path.exists(
            os.path.join(tmpdir, checkpoint.id, "train.py"))

    # checkpoint with no path
    checkpoint = experiment.checkpoint(path=None,
                                       step=1,
                                       metrics={"validation_loss": 0.123})

    # wait in case async process tries to create a path anyway
    time.sleep(0.5)

    with open(".keepsake/metadata/experiments/{}.json".format(
            experiment.id)) as fh:
        metadata = json.load(fh)
    assert metadata["checkpoints"][-1]["id"] == checkpoint.id
    assert not os.path.exists(".keepsake/checkpoints/{}.tar.gz".format(
        checkpoint.id))

    # experiment with file
    experiment = keepsake.init(path="train.py",
                               params={"learning_rate": 0.002},
                               disable_heartbeat=True)

    experiment_tar_path = ".keepsake/experiments/{}.tar.gz".format(
        experiment.id)
    wait(
        lambda: os.path.exists(experiment_tar_path),
        timeout_seconds=5,
        sleep_seconds=0.01,
    )
    time.sleep(0.1)  # wait for file to be written

    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(experiment_tar_path) as tar:
            tar.extractall(tmpdir)

        assert (open(os.path.join(tmpdir, experiment.id,
                                  "train.py")).read() == "print(1 + 1)")
        assert not os.path.exists(
            os.path.join(tmpdir, experiment.id, "README.md"))

    # experiment with no path!
    experiment = keepsake.init(path=None,
                               params={"learning_rate": 0.002},
                               disable_heartbeat=True)

    # wait in case async process tries to create a path anyway
    time.sleep(0.5)

    with open(".keepsake/metadata/experiments/{}.json".format(
            experiment.id)) as fh:
        metadata = json.load(fh)
    assert metadata["id"] == experiment.id
    assert metadata["params"] == {"learning_rate": 0.002}
    assert not os.path.exists(".keepsake/experiments/{}.tar.gz".format(
        experiment.id))
Example #10
def test_init_without_config_file(temp_workdir):
    with pytest.raises(ConfigNotFound):
        keepsake.init()
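Several of these tests take a temp_workdir fixture. A hypothetical sketch of such a fixture, consistent with how the tests write keepsake.yaml and the .keepsake repository relative to the current directory:

import os
import tempfile

import pytest


@pytest.fixture
def temp_workdir():
    # Run each test in a fresh temporary directory so keepsake.yaml and the
    # .keepsake repository don't leak between tests.
    previous = os.getcwd()
    with tempfile.TemporaryDirectory() as tmpdir:
        os.chdir(tmpdir)
        try:
            yield tmpdir
        finally:
            os.chdir(previous)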
Example #11
def test_init_with_config_file(temp_workdir):
    with open("keepsake.yaml", "w") as f:
        f.write("repository: file://.keepsake/")
    experiment = keepsake.init()
    assert isinstance(experiment, Experiment)
    experiment.stop()
Example #12
    def on_pretrain_routine_start(self, trainer, pl_module):
        self.experiment = keepsake.init(path=".", params=self.params)