Пример #1
0
def predict_tags(
    text: str = "Transfer learning with BERT for self-supervised learning",
    run_id: str = "",
) -> Dict:
    """Predict tags for a give input text using a trained model.
    Warning:
        Make sure that you have a trained model first!
    Args:
        text (str, optional): Input text to predict tags for.
                              Defaults to "Transfer learning with BERT for self-supervised learning".
        run_id (str, optional): ID of the run to load model artifacts from.
                                Defaults to model with lowest `best_val_loss` from the `best` experiment.
    Returns:
        Predicted tags for input text.
    """
    # Get best run
    if not run_id:
        experiment_id = mlflow.get_experiment_by_name("best").experiment_id
        all_runs = mlflow.search_runs(
            experiment_ids=experiment_id,
            order_by=["metrics.best_val_loss ASC"],
        )
        run_id = all_runs.iloc[0].run_id

    # Predict
    artifacts = predict.load_artifacts(run_id=run_id)
    prediction = predict.predict(texts=[text], artifacts=artifacts)
    logger.info(json.dumps(prediction, indent=2))

    return prediction
Пример #2
0
def health_check():
    response = requests.get(
        f"http://0.0.0.0:{os.environ.get('PORT', 5000)}/",
        headers=headers,
    )
    results = json.loads(response.text)
    logger.info(json.dumps(results, indent=4))
Пример #3
0
def set_artifact_metadata():
    """Set the artifact URI for all experiments and runs.
    Used when transferring experiments from other locations (ex. Colab).
    Note:
        check out the [optimize.ipynb](https://colab.research.google.com/github/GokuMohandas/applied-ml/blob/main/notebooks/optimize.ipynb){:target="_blank"} notebook for how to train on Google Colab and transfer to local.
    """
    def set_artifact_location(var, fp):
        """Set variable's yaml value on file at fp."""
        with open(fp) as f:
            metadata = yaml.load(f)

        # Set new value
        experiment_id = metadata[var].split("/")[-1]
        artifact_location = Path("file://", config.EXPERIMENTS_DIR,
                                 experiment_id)
        metadata[var] = str(artifact_location)

        with open(fp, "w") as f:
            yaml.dump(metadata, f)

    def set_artifact_uri(var, fp):
        """Set variable's yaml value on file at fp."""
        with open(fp) as f:
            metadata = yaml.load(f)

        # Set new value
        experiment_id = metadata[var].split("/")[-3]
        run_id = metadata[var].split("/")[-2]
        artifact_uri = Path(
            "file://",
            config.EXPERIMENTS_DIR,
            experiment_id,
            run_id,
            "artifacts",
        )
        metadata[var] = str(artifact_uri)

        with open(fp, "w") as f:
            yaml.dump(metadata, f)

    # Get artifact location
    experiment_meta_yamls = list(
        Path(config.EXPERIMENTS_DIR).glob("*/meta.yaml"))
    for meta_yaml in experiment_meta_yamls:
        set_artifact_location(var="artifact_location", fp=meta_yaml)
        logger.info(f"Set artfifact location for {meta_yaml}")

    # Change artifact URI
    run_meta_yamls = list(Path(config.EXPERIMENTS_DIR).glob("*/*/meta.yaml"))
    for meta_yaml in run_meta_yamls:
        set_artifact_uri(var="artifact_uri", fp=meta_yaml)
        logger.info(f"Set artfifact URI for {meta_yaml}")
Пример #4
0
def download_data():
    """Download data from online to local drive.
    Note:
        We could've just copied files from `datasets` but
        we'll use this later on with other data sources.
    """
    # Download data
    projects_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/projects.json"
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json"
    projects = utils.load_json_from_url(url=projects_url)
    tags = utils.load_json_from_url(url=tags_url)

    # Save data
    projects_fp = Path(config.DATA_DIR, "projects.json")
    tags_fp = Path(config.DATA_DIR, "tags.json")
    utils.save_dict(d=projects, filepath=projects_fp)
    utils.save_dict(d=tags, filepath=tags_fp)
    logger.info("✅ Data downloaded!")
Пример #5
0
def clean_experiments(experiments_to_keep: str = "best"):
    """Removes all experiments besides the
    ones specified in `experiments_to_keep`.
    Args:
        experiments_to_keep (str): comma separated string of experiments to keep.
    """
    # Get experiments to keep
    experiments_to_keep = list(
        set([exp.strip() for exp in experiments_to_keep.split(",")]))
    if not len(experiments_to_keep):
        raise ValueError("You must keep at least one experiment.")

    # Filter and delete
    client = mlflow.tracking.MlflowClient()
    for experiment in client.list_experiments():
        if experiment.name not in experiments_to_keep:
            logger.info(f"Deleting Experiment {experiment.name}")
            client.delete_experiment(experiment_id=experiment.experiment_id)

    # Delete MLFlow trash
    shutil.rmtree(Path(config.EXPERIMENTS_DIR, ".trash"))
    logger.info(f"Cleared experiments besides {experiments_to_keep}")
Пример #6
0
def predict():
    data = {
        "run_id":
        "",
        "texts": [
            {
                "text":
                "Transfer learning with transformers for self-supervised learning."
            },
            {
                "text":
                "Generative adversarial networks in both PyTorch and TensorFlow."
            },
        ],
    }
    response = requests.post(
        f"http://0.0.0.0:{os.environ.get('PORT', 5000)}/predict",
        headers=headers,
        data=json.dumps(data),
    )
    results = json.loads(response.text)
    logger.info(json.dumps(results, indent=4))
Пример #7
0
def train_model(args_fp: Path = Path(config.CONFIG_DIR, "args.json")) -> None:
    """Train a model using the specified parameters.
    Args:
        args_fp (Path, optional): Location of arguments to use for training.
                                  Defaults to `config/args.json`.
    """
    # Set experiment and start run
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Start run
    mlflow.set_experiment(experiment_name="best")
    with mlflow.start_run(
            run_name="cnn") as run:  # NOQA: F841 (assigned to but never used)
        # Train
        artifacts = train.run(args=args)

        # Log key metrics
        performance = artifacts["performance"]
        loss = artifacts["loss"]
        mlflow.log_metrics({"precision": performance["overall"]["precision"]})
        mlflow.log_metrics({"recall": performance["overall"]["recall"]})
        mlflow.log_metrics({"f1": performance["overall"]["f1"]})
        mlflow.log_metrics({"best_val_loss": loss})

        # Log artifacts
        args = artifacts["args"]
        model = artifacts["model"]
        label_encoder = artifacts["label_encoder"]
        tokenizer = artifacts["tokenizer"]
        with tempfile.TemporaryDirectory() as fp:
            label_encoder.save(Path(fp, "label_encoder.json"))
            tokenizer.save(Path(fp, "tokenizer.json"))
            torch.save(model.state_dict(), Path(fp, "model.pt"))
            utils.save_dict(performance, Path(fp, "performance.json"))
            mlflow.log_artifacts(fp)
        mlflow.log_params(vars(args))
    logger.info(json.dumps(performance["overall"], indent=2))
Пример #8
0
def optimize(num_trials: int = 100) -> None:
    """Optimize a subset of hyperparameters towards an objective.
    This saves the best trial's arguments into `config/args.json`.
    Args:
        num_trials (int, optional): Number of trials to run. Defaults to 100.
    """
    # Starting arguments (not actually used but needed for set up)
    args_fp = Path(config.CONFIG_DIR, "args.json")
    args = Namespace(**utils.load_dict(filepath=args_fp))

    # Optimize
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
    study = optuna.create_study(study_name="optimization",
                                direction="maximize",
                                pruner=pruner)
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(),
                                     metric_name="f1")
    study.optimize(
        lambda trial: train.objective(args, trial),
        n_trials=num_trials,
        callbacks=[mlflow_callback],
    )

    # All trials
    trials_df = study.trials_dataframe()
    trials_df = trials_df.sort_values(["value"],
                                      ascending=False)  # sort by metric
    trials_df.to_csv(Path(config.EXPERIMENTS_DIR, "trials.csv"),
                     index=False)  # save

    # Best trial
    logger.info(f"Best value (f1): {study.best_trial.value}")
    params = {**args.__dict__, **study.best_trial.params}
    params["threshold"] = study.best_trial.user_attrs["threshold"]
    with open(Path(config.CONFIG_DIR, "args.json"), "w") as fp:
        json.dump(params, fp=fp, indent=2, cls=NumpyEncoder)
    logger.info(json.dumps(params, indent=2, cls=NumpyEncoder))