Example #1
    def _do_run(self, run: ModelClassificationRun,
                run_output_dir: Path) -> str:
        ds = self.dataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_train_valid_preprocessed = preprocess_func(X_train_valid)
        X_test_preprocessed = preprocess_func(X_test)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        stdout_catcher = StdoutCatcher()
        with stdout_catcher:
            results = run_benchmark_experiment(
                f"{self.name}_{run.key}",
                X_train_valid_preprocessed,
                y_train_valid,
                model_cls,
                run.param_grid,
                test_dataset=(X_test_preprocessed, y_test),
                worker_log_level=logging.INFO,
                run_kwargs=run.run_kwargs,
            )
            # Sleep a few seconds to let logs from the worker catch up
            time.sleep(3)

        # Sample the test set observations if there are more than 1,000, since the chart
        # has to be saved to PNG and rendering very large charts can cause Selenium timeouts
        sample_size = 1000
        chart = results.plot(sample_size=sample_size).properties(
            title=f"Predicted Probability (Sampled Test Set Observations, n={sample_size})"
        )
        plot_path = run_output_dir / "plot.png"
        # Longer driver timeout needed since these images can be very big
        chart.save(str(plot_path), driver_timeout=600)

        md = f"# Results: {run.key}\n"
        md += f"```\n{stdout_catcher.get_logs()}\n```\n"
        md += tabulate(pd.DataFrame(results.training_results),
                       tablefmt="pipe",
                       headers="keys")
        md += f"\n```\n{results.metrics_report()}\n```\n"
        md += f"\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
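Example #1 above captures the worker's logs with StdoutCatcher and embeds them in the Markdown report. As a point of reference, here is a minimal sketch of how a stdout-capturing context manager with that role could be written in plain Python; it is an illustration only, not the actual gobbli benchmark utility, whose implementation may differ.

import contextlib
import io


class SimpleStdoutCatcher:
    """Hypothetical stand-in for a StdoutCatcher-style helper: collects
    everything written to stdout while the `with` block is active."""

    def __init__(self):
        self._buffer = io.StringIO()
        self._redirect = contextlib.redirect_stdout(self._buffer)

    def __enter__(self):
        self._redirect.__enter__()
        return self

    def __exit__(self, exc_type, exc, tb):
        return self._redirect.__exit__(exc_type, exc, tb)

    def get_logs(self) -> str:
        return self._buffer.getvalue()


# Usage sketch:
catcher = SimpleStdoutCatcher()
with catcher:
    print("training...")
assert catcher.get_logs() == "training...\n"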
Example #2
    def _validate_params(self):
        assert_param_required("percent_multipliers", self.params)
        percent_multipliers = self.params["percent_multipliers"]
        assert_type("percent_multipliers", percent_multipliers, list)
        for (p, m) in percent_multipliers:
            assert_type("percent", p, float)
            assert_proportion("percent", p)
            assert_type("multiplier", m, (int, float))

        assert_type("param_grid", self.params.get("param_grid", {}), dict)

        assert_param_required("model_name", self.params)
        assert_type("model_name", self.params["model_name"], str)
        assert_valid_model(self.params["model_name"])

        assert_param_required("augment_probability", self.params)
        assert_type("augment_probability", p, float)
        assert_proportion("augment_probability", p)

        assert_param_required("preprocess_func", self.params)
        assert_in("preprocess_func", self.params["preprocess_func"],
                  PREPROCESS_FUNCS)
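The parameter checks in Example #2 rely on small assertion helpers (assert_param_required, assert_type, assert_proportion, assert_in). As a rough sketch of what helpers with those semantics might look like, assuming they simply raise on invalid input (the real gobbli benchmark utilities may behave differently):

from typing import Any, Container


def assert_param_required_sketch(name: str, params: dict) -> None:
    # Hypothetical equivalent of assert_param_required: the key must be present.
    if name not in params:
        raise ValueError(f"Missing required parameter: '{name}'")


def assert_type_sketch(name: str, value: Any, expected_type) -> None:
    # Hypothetical equivalent of assert_type: the value must be an instance
    # of the given type (or tuple of types).
    if not isinstance(value, expected_type):
        raise TypeError(
            f"Parameter '{name}' must be of type {expected_type}, got {type(value)}")


def assert_proportion_sketch(name: str, value: float) -> None:
    # Hypothetical equivalent of assert_proportion: the value must lie in [0, 1].
    if not 0 <= value <= 1:
        raise ValueError(f"Parameter '{name}' must be between 0 and 1, got {value}")


def assert_in_sketch(name: str, value: Any, allowed: Container) -> None:
    # Hypothetical equivalent of assert_in: the value must be one of the allowed keys.
    if value not in allowed:
        raise ValueError(f"Invalid value for '{name}': {value}")


# Usage sketch:
assert_proportion_sketch("augment_probability", 0.2)  # passes silently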
Example #3
    def _do_run(self, run: ModelClassificationRun,
                run_output_dir: Path) -> str:
        ds = IMDBDataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_train_valid_preprocessed = preprocess_func(X_train_valid)
        X_test_preprocessed = preprocess_func(X_test)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        all_results = []

        for window_len, pooling in self.params["window_len_poolings"]:

            if window_len is not None and pooling is not None:
                with tempfile.TemporaryDirectory() as tmpdir:
                    tokenizer_path = Path(tmpdir) / "tokenizer"

                    X_windowed, _, y_windowed = make_document_windows(
                        X_train_valid_preprocessed,
                        window_len=window_len,
                        y=y_train_valid,
                        tokenize_method=TokenizeMethod.SENTENCEPIECE,
                        vocab_size=self.params["vocab_size"],
                        model_path=tokenizer_path,
                    )
                    (
                        X_test_windowed,
                        X_test_windowed_indices,
                        y_test_windowed,
                    ) = make_document_windows(
                        X_test_preprocessed,
                        window_len=window_len,
                        y=y_test,
                        tokenize_method=TokenizeMethod.SENTENCEPIECE,
                        vocab_size=self.params["vocab_size"],
                        model_path=tokenizer_path,
                    )
            else:
                X_windowed, y_windowed = X_train_valid_preprocessed, y_train_valid
                X_test_windowed, y_test_windowed = X_test_preprocessed, y_test

            print(
                f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
                f"Evaluating window: Length {window_len}, pooling {pooling} ({len(X_windowed)} obs)"
            )
            results = run_benchmark_experiment(
                f"{self.name}_{run.key}",
                X_windowed,
                y_windowed,
                model_cls,
                run.param_grid,
                test_dataset=(X_test_windowed, y_test_windowed),
                run_kwargs=run.run_kwargs,
            )

            if window_len is not None:
                pooled_output = PredictOutput(
                    y_pred_proba=results.y_pred_proba.copy())

                pool_document_windows(
                    pooled_output,
                    X_test_windowed_indices,
                    pooling=WindowPooling(pooling),
                )

            all_results.append(results.metrics())

        all_metrics = pd.DataFrame([{
            "Window Config": f"Length {window_len}, pooling {pooling}",
            **r
        } for (window_len, pooling), r in zip(self.params["window_len_poolings"],
                                              all_results)])

        fig = plt.figure(figsize=(10, 10))

        acc_ax = fig.add_subplot()
        all_metrics.plot(x="Window Config",
                         y="Accuracy",
                         ax=acc_ax,
                         kind="bar")

        plt.xlabel("Document Windowing")
        plt.title(
            f"Model Performance by Document Windowing - {model_cls.__name__}")
        plt.ylim(0, 1)

        plot_path = run_output_dir / "plot.png"
        fig.savefig(plot_path)

        md = f"# Results: {run.key}\n"
        md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
        md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
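Example #3 evaluates document windowing: long documents are split into fixed-length token windows for training, and window-level predicted probabilities are later pooled back to the document level. The sketch below illustrates the general idea using naive whitespace tokenization and mean/max pooling; it is not gobbli's make_document_windows / pool_document_windows implementation (which tokenizes with SentencePiece), only a simplified picture of the technique.

from typing import List, Tuple

import numpy as np


def make_windows_sketch(docs: List[str],
                        window_len: int) -> Tuple[List[str], List[int]]:
    """Split each document into whitespace-token windows of at most window_len
    tokens, returning the windows and each window's source document index."""
    windows, doc_indices = [], []
    for doc_idx, doc in enumerate(docs):
        tokens = doc.split()
        for start in range(0, max(len(tokens), 1), window_len):
            windows.append(" ".join(tokens[start:start + window_len]))
            doc_indices.append(doc_idx)
    return windows, doc_indices


def pool_window_probs_sketch(window_probs: np.ndarray,
                             doc_indices: List[int],
                             pooling: str = "mean") -> np.ndarray:
    """Pool per-window predicted probabilities back to one row per source document."""
    n_docs = max(doc_indices) + 1
    pooled = np.zeros((n_docs, window_probs.shape[1]))
    for doc_idx in range(n_docs):
        rows = window_probs[[i for i, d in enumerate(doc_indices) if d == doc_idx]]
        pooled[doc_idx] = rows.mean(axis=0) if pooling == "mean" else rows.max(axis=0)
    return pooled


# Usage sketch:
windows, idx = make_windows_sketch(["a b c d e", "f g"], window_len=2)
# windows == ['a b', 'c d', 'e', 'f g'], idx == [0, 0, 0, 1]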
Example #4
    def _do_run(self, run: ModelClassificationRun,
                run_output_dir: Path) -> str:
        ds = IMDBDataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_train_valid_preprocessed = preprocess_func(X_train_valid)
        X_test_preprocessed = preprocess_func(X_test)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        all_results = []

        for proportion in self.params["data_proportions"]:
            X_sampled, _, y_sampled, _ = train_test_split(
                X_train_valid_preprocessed,
                y_train_valid,
                train_size=proportion,
                random_state=1,
            )
            LOGGER.info(
                f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
                f"Evaluating proportion {round(proportion, 3)} ({len(X_sampled)} obs)"
            )
            results = run_benchmark_experiment(
                f"{self.name}_{run.key}",
                X_sampled,
                y_sampled,
                model_cls,
                run.param_grid,
                test_dataset=(X_test_preprocessed, y_test),
                run_kwargs=run.run_kwargs,
            )
            all_results.append(results)

        all_metrics = pd.DataFrame([{
            "data_proportion": p,
            "num_documents": int(p * len(X_train_valid)),
            **r.metrics(),
        } for p, r in zip(self.params["data_proportions"], all_results)])

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot()
        all_metrics.plot(x="num_documents", y="Weighted F1 Score", ax=ax)
        all_metrics.plot(x="num_documents", y="Accuracy", ax=ax)

        plt.xlabel("Number of Documents Used for Training/Validation")
        plt.title(
            f"Model Performance by Number of Documents Used for Training/Validation - {model_cls.__name__}"
        )
        plt.xlim(0, int(all_metrics["num_documents"].max() * 1.1))
        plt.ylim(0, 1)
        plot_path = run_output_dir / "plot.png"
        fig.savefig(plot_path)

        md = f"# Results: {run.key}\n"
        md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
        md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
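Example #4 builds a learning curve by training on progressively larger fractions of the training data, using train_test_split with train_size=proportion as a convenient subsampler. As written, the subsample is not stratified; a variant that keeps the label distribution constant at every proportion would pass the labels via stratify, as in the sketch below (with toy data standing in for the preprocessed IMDB texts).

from sklearn.model_selection import train_test_split

# Toy stand-ins for the preprocessed texts and labels used in the scenario above.
X_toy = [f"document {i}" for i in range(100)]
y_toy = ["pos" if i % 2 == 0 else "neg" for i in range(100)]

proportion = 0.25
X_sampled, _, y_sampled, _ = train_test_split(
    X_toy,
    y_toy,
    train_size=proportion,
    random_state=1,
    stratify=y_toy,  # keep the class balance identical across proportions
)
print(len(X_sampled))  # -> 25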
Example #5
    def _do_run(self, run: ModelClassificationRun,
                run_output_dir: Path) -> str:
        ds = IMDBDataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_train_valid_preprocessed = preprocess_func(X_train_valid)
        X_test_preprocessed = preprocess_func(X_test)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        all_results = []

        majority, minority = ClassImbalanceScenario.find_majority_minority_classes(
            y_test)
        majority_df, minority_df = ClassImbalanceScenario.split_dataset(
            X_train_valid_preprocessed, y_train_valid, majority, minority)

        for proportion in self.params["imbalance_proportions"]:
            # Downsample the minority class so it makes up the desired proportion of
            # the final training dataset.  If the minority class should be proportion p
            # of the total, we need p / (1 - p) minority observations per majority
            # observation; since the IMDB training split is balanced, that's also the
            # fraction of minority rows to keep (a numeric check follows this example).
            downsample_frac = proportion / (1 - proportion)
            minority_sample = minority_df.sample(
                frac=downsample_frac).reset_index()
            sampled_df = pd.concat([majority_df, minority_sample])

            X = sampled_df["X"].tolist()
            y = sampled_df["y"].tolist()

            LOGGER.info(
                f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
                f"Evaluating proportion {round(proportion, 3)} ({len(X)} obs)")

            results = run_benchmark_experiment(
                f"{self.name}_{run.key}",
                X,
                y,
                model_cls,
                run.param_grid,
                test_dataset=(X_test_preprocessed, y_test),
                run_kwargs=run.run_kwargs,
            )
            all_results.append(results)

        minority_f1_scores = []
        majority_f1_scores = []
        for result in all_results:
            majority_f1, minority_f1 = f1_score(
                result.y_true,
                pred_prob_to_pred_label(result.y_pred_proba),
                average=None,
                labels=[majority, minority],
            )
            minority_f1_scores.append(minority_f1)
            majority_f1_scores.append(majority_f1)

        all_metrics = pd.DataFrame([{
            "imbalance_proportion": p,
            **r.metrics()
        } for p, r in zip(self.params["imbalance_proportions"], all_results)])

        all_metrics["Minority Class F1 Score"] = minority_f1_scores
        all_metrics["Majority Class F1 Score"] = majority_f1_scores

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot()
        all_metrics.plot(x="imbalance_proportion",
                         y="Minority Class F1 Score",
                         ax=ax)
        all_metrics.plot(x="imbalance_proportion",
                         y="Majority Class F1 Score",
                         ax=ax)

        plt.xlabel("Prevalence of Minority Class")
        plt.title(
            f"Model Performance by Prevalence of Minority Class - {model_cls.__name__}"
        )
        plt.xlim(0, 0.5)
        plt.ylim(0, 1)

        plot_path = run_output_dir / "plot.png"
        fig.savefig(plot_path)

        md = f"# Results: {run.key}\n"
        md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
        md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
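The downsampling fraction in Example #5 comes from solving m / (M + m) = p for the number of minority observations m to keep, given M majority observations: m = p * M / (1 - p). Because the IMDB training split is balanced (the minority class starts with as many rows as the majority), the fraction of minority rows to keep is simply p / (1 - p). A quick numeric check of that identity, with illustrative counts rather than the real dataset sizes:

# Sketch: keeping a fraction p / (1 - p) of an equally sized minority class
# yields a dataset in which the minority makes up proportion p of the total.
n_majority = 1000
n_minority = 1000  # balanced to start with, as in the IMDB training split

for p in (0.05, 0.1, 0.25, 0.5):
    keep_frac = p / (1 - p)
    n_kept = n_minority * keep_frac
    achieved = n_kept / (n_majority + n_kept)
    print(f"target={p:.2f} achieved={achieved:.4f}")  # achieved matches the target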
Example #6
    def _do_run(self, run: ModelEmbeddingRun, run_output_dir: Path) -> str:
        ds = self.dataset.load()
        X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
            ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(),
            self.dataset_limit)
        X_embed = X_train_valid + X_test
        labels = y_train_valid + y_test

        assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
        preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
        X_embed_preprocessed = preprocess_func(X_embed)

        assert_valid_model(run.model_name)
        model_cls = getattr(gobbli.model, run.model_name)

        stdout_catcher = StdoutCatcher()
        with stdout_catcher:
            # Construct the dict of kwargs up-front so each run can override the "use_gpu"
            # option if necessary via its model params -- e.g. for models like spaCy,
            # which have trouble controlling memory usage on the GPU and don't gain
            # much benefit from it (a standalone sketch of this merge follows the example)
            model_kwargs = {**get_model_run_params(), **run.model_params}
            model = model_cls(**model_kwargs)
            model.build()

            embed_input = EmbedInput(X=X_embed_preprocessed,
                                     embed_batch_size=run.batch_size)
            embed_output = model.embed(embed_input)

        X_embedded = pd.DataFrame(embed_output.X_embedded)
        umap = UMAP(random_state=1)
        umap_data = umap.fit_transform(X_embedded)
        umap_df = pd.DataFrame(
            umap_data, columns=["UMAP Component 1", "UMAP Component 2"])
        umap_df["Label"] = labels
        groups = umap_df.groupby("Label")

        fig = plt.figure(figsize=(15, 15))
        ax = fig.add_subplot()
        cmap = plt.cm.get_cmap("tab20")

        for (name, group), c in zip(groups, cmap.colors):
            ax.plot(
                group["UMAP Component 1"],
                group["UMAP Component 2"],
                marker="o",
                linestyle="",
                ms=6,
                label=name,
                color=c,
                alpha=0.5,
            )
        ax.legend()
        ax.axis("off")
        ax.set_title(f"Embeddings by Label - {run.key}")
        plot_path = run_output_dir / "plot.png"
        fig.savefig(plot_path)

        md = f"# Results: {run.key}\n"
        md += f"```\n{stdout_catcher.get_logs()}\n```\n"
        md += f"\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

        return md
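Example #6 assembles the model kwargs as {**get_model_run_params(), **run.model_params}, relying on the fact that later entries in a dict merge override earlier ones, so per-run model params can override defaults such as use_gpu. A tiny standalone illustration of that merge semantics, with made-up default values:

# Sketch: later dicts win when merging, so run-specific params override the defaults.
default_kwargs = {"use_gpu": True, "log_level": "INFO"}  # illustrative defaults only
run_model_params = {"use_gpu": False}  # e.g. a spaCy run that opts out of the GPU

model_kwargs = {**default_kwargs, **run_model_params}
print(model_kwargs)  # -> {'use_gpu': False, 'log_level': 'INFO'}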