def _do_run(self, run: ModelClassificationRun, run_output_dir: Path) -> str:
    ds = self.dataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(), self.dataset_limit
    )

    assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
    preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
    X_train_valid_preprocessed = preprocess_func(X_train_valid)
    X_test_preprocessed = preprocess_func(X_test)

    assert_valid_model(run.model_name)
    model_cls = getattr(gobbli.model, run.model_name)

    stdout_catcher = StdoutCatcher()
    with stdout_catcher:
        results = run_benchmark_experiment(
            f"{self.name}_{run.key}",
            X_train_valid_preprocessed,
            y_train_valid,
            model_cls,
            run.param_grid,
            test_dataset=(X_test_preprocessed, y_test),
            worker_log_level=logging.INFO,
            run_kwargs=run.run_kwargs,
        )

    # Sleep a few seconds to let logs from the worker catch up
    time.sleep(3)

    # Sample the observations if there are more than 1,000 in the test set, since we
    # need to save the chart, and trying to save large charts can cause Selenium timeouts
    # when they're rendered to PNG
    sample_size = 1000
    chart = results.plot(sample_size=sample_size).properties(
        title=f"Predicted Probability (Sampled Test Set Observations, n={sample_size})"
    )
    plot_path = run_output_dir / "plot.png"

    # Longer driver timeout needed since these images can be very big
    chart.save(str(plot_path), driver_timeout=600)

    md = f"# Results: {run.key}\n"
    md += f"```\n{stdout_catcher.get_logs()}\n```\n"
    md += tabulate(
        pd.DataFrame(results.training_results), tablefmt="pipe", headers="keys"
    )
    md += f"\n```\n{results.metrics_report()}\n```\n"
    md += f"\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

    return md
def _do_run(self, run: ModelClassificationRun, run_output_dir: Path) -> str:
    ds = IMDBDataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(), self.dataset_limit
    )

    assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
    preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
    X_train_valid_preprocessed = preprocess_func(X_train_valid)
    X_test_preprocessed = preprocess_func(X_test)

    assert_valid_model(run.model_name)
    model_cls = getattr(gobbli.model, run.model_name)

    all_results = []
    for window_len, pooling in self.params["window_len_poolings"]:
        if window_len is not None and pooling is not None:
            with tempfile.TemporaryDirectory() as tmpdir:
                tokenizer_path = Path(tmpdir) / "tokenizer"

                X_windowed, _, y_windowed = make_document_windows(
                    X_train_valid_preprocessed,
                    window_len=window_len,
                    y=y_train_valid,
                    tokenize_method=TokenizeMethod.SENTENCEPIECE,
                    vocab_size=self.params["vocab_size"],
                    model_path=tokenizer_path,
                )
                (
                    X_test_windowed,
                    X_test_windowed_indices,
                    y_test_windowed,
                ) = make_document_windows(
                    X_test_preprocessed,
                    window_len=window_len,
                    y=y_test,
                    tokenize_method=TokenizeMethod.SENTENCEPIECE,
                    vocab_size=self.params["vocab_size"],
                    model_path=tokenizer_path,
                )
        else:
            X_windowed, y_windowed = X_train_valid_preprocessed, y_train_valid
            X_test_windowed, y_test_windowed = X_test_preprocessed, y_test

        print(
            f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
            f"Evaluating window: Length {window_len}, pooling {pooling} ({len(X_windowed)} obs)"
        )
        results = run_benchmark_experiment(
            f"{self.name}_{run.key}",
            X_windowed,
            y_windowed,
            model_cls,
            run.param_grid,
            test_dataset=(X_test_windowed, y_test_windowed),
            run_kwargs=run.run_kwargs,
        )

        if window_len is not None:
            # Pool window-level predictions back to one row per original test document
            pooled_output = PredictOutput(y_pred_proba=results.y_pred_proba.copy())
            pool_document_windows(
                pooled_output,
                X_test_windowed_indices,
                pooling=WindowPooling(pooling),
            )

        all_results.append(results.metrics())

    # Unpack each (window_len, pooling) pair alongside its results so every row
    # is labeled with its own config, not the last values from the loop above
    all_metrics = pd.DataFrame(
        [
            {"Window Config": f"Length {window_len}, pooling {pooling}", **r}
            for (window_len, pooling), r in zip(
                self.params["window_len_poolings"], all_results
            )
        ]
    )

    fig = plt.figure(figsize=(10, 10))
    acc_ax = fig.add_subplot()
    all_metrics.plot(x="Window Config", y="Accuracy", ax=acc_ax, kind="bar")

    plt.xlabel("Document Windowing")
    plt.title(f"Model Performance by Document Windowing - {model_cls.__name__}")
    plt.ylim(0, 1)

    plot_path = run_output_dir / "plot.png"
    fig.savefig(plot_path)

    md = f"# Results: {run.key}\n"
    md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
    md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

    return md
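# Rough usage sketch (not part of the benchmark; documents, labels, window
# length, and the MEAN pooling strategy are assumptions) of the windowing
# round trip used above: make_document_windows splits long documents into
# fixed-length windows that inherit their parent's label and returns each
# window's parent index; pool_document_windows then collapses window-level
# predicted probabilities back to one row per original document.
def _windowing_sketch():
    X = ["first long document ...", "second long document ..."]
    y = ["pos", "neg"]
    X_windowed, indices, y_windowed = make_document_windows(
        X, window_len=64, y=y, tokenize_method=TokenizeMethod.SPLIT
    )
    # Stand-in for model predictions: one probability row per window
    window_probs = pd.DataFrame(
        {"pos": [0.9] * len(X_windowed), "neg": [0.1] * len(X_windowed)}
    )
    pooled = PredictOutput(y_pred_proba=window_probs)
    pool_document_windows(pooled, indices, pooling=WindowPooling.MEAN)
    # pooled.y_pred_proba should now have len(X) rows, one per original document
    return pooled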
def _do_run(self, run: AugmentRun, run_output_dir: Path) -> str:
    ds = IMDBDataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(), self.dataset_limit
    )

    preprocess_func = PREPROCESS_FUNCS[self.params["preprocess_func"]]
    X_test_preprocessed = preprocess_func(X_test)
    model_cls = getattr(gobbli.model, self.params["model_name"])

    assert_valid_augment(run.augment_name)
    augment_cls = getattr(gobbli.augment, run.augment_name)

    model_run_params: Dict[str, Any] = {}
    if issubclass(augment_cls, BaseModel):
        # If the augment method is also a gobbli model (and will be mounting files
        # back and forth with Docker), we need to make sure it has the proper params
        # applied, e.g. to store data in the correct place and use GPU(s)
        model_run_params = get_model_run_params()

    augment_obj = augment_cls(**run.params, **model_run_params)

    # Some augmentation methods are also models, which need to be built beforehand
    if isinstance(augment_obj, BaseModel):
        augment_obj.build()

    all_results = []
    for percent, multiplier in self.params["percent_multipliers"]:
        X_sampled, _, y_sampled, _ = train_test_split(
            X_train_valid, y_train_valid, train_size=percent, random_state=1
        )
        if multiplier == 0:
            X_augmented = X_sampled
            y_augmented = y_sampled
        else:
            X_augmented = X_sampled + augment_obj.augment(
                X_sampled, times=multiplier, p=self.params["augment_probability"]
            )
            y_augmented = y_sampled + (y_sampled * multiplier)

        print(
            f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
            f"Evaluating multiplier x{multiplier}, percent {percent} ({len(X_augmented)} obs)"
        )
        results = run_benchmark_experiment(
            f"{self.name}_{run.key}",
            preprocess_func(X_augmented),
            y_augmented,
            model_cls,
            self.params["param_grid"],
            test_dataset=(X_test_preprocessed, y_test),
        )
        all_results.append(results.metrics())

    all_metrics = pd.DataFrame(
        [
            {"percent": p, "multiplier": m, **r}
            for (p, m), r in zip(self.params["percent_multipliers"], all_results)
        ]
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    for key, grp in all_metrics.groupby("multiplier"):
        grp.plot(
            x="percent",
            y="Weighted F1 Score",
            kind="line",
            label=f"{key}x augmentation",
            ax=ax,
        )

    plt.xlabel("Proportion of Data Used")
    plt.ylabel("Weighted F1 Score")
    plt.title(
        f"Model Performance by Proportion of Data Used - {model_cls.__name__}"
    )
    plt.xlim(0, 1)
    plt.ylim(0, 1)

    plot_path = run_output_dir / "plot.png"
    fig.savefig(plot_path)

    md = f"# Results: {run.key}\n"
    md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
    md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

    return md
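# Small illustrative helper (standalone; the default p is invented) showing why
# the label handling above stays aligned: augment(X, times=multiplier, ...) is
# expected to return multiplier * len(X) new documents in the same order as the
# originals, so repeating the labels multiplier times keeps X and y in sync.
def _augment_length_check(augment_obj, X_sampled, y_sampled, multiplier, p=0.1):
    X_new = augment_obj.augment(X_sampled, times=multiplier, p=p)
    X_augmented = X_sampled + X_new
    y_augmented = y_sampled + (y_sampled * multiplier)
    assert len(X_new) == multiplier * len(X_sampled)
    assert len(X_augmented) == len(y_augmented)
    return X_augmented, y_augmented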
def _do_run(self, run: ModelClassificationRun, run_output_dir: Path) -> str:
    ds = IMDBDataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(), self.dataset_limit
    )

    assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
    preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
    X_train_valid_preprocessed = preprocess_func(X_train_valid)
    X_test_preprocessed = preprocess_func(X_test)

    assert_valid_model(run.model_name)
    model_cls = getattr(gobbli.model, run.model_name)

    all_results = []
    for proportion in self.params["data_proportions"]:
        X_sampled, _, y_sampled, _ = train_test_split(
            X_train_valid_preprocessed,
            y_train_valid,
            train_size=proportion,
            random_state=1,
        )
        LOGGER.info(
            f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
            f"Evaluating proportion {round(proportion, 3)} ({len(X_sampled)} obs)"
        )
        results = run_benchmark_experiment(
            f"{self.name}_{run.key}",
            X_sampled,
            y_sampled,
            model_cls,
            run.param_grid,
            test_dataset=(X_test_preprocessed, y_test),
            run_kwargs=run.run_kwargs,
        )
        all_results.append(results)

    all_metrics = pd.DataFrame(
        [
            {
                "data_proportion": p,
                "num_documents": int(p * len(X_train_valid)),
                **r.metrics(),
            }
            for p, r in zip(self.params["data_proportions"], all_results)
        ]
    )

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot()
    # Plot both metrics on the same axes; adding a second full-size subplot
    # would overlap the first and hide its line
    all_metrics.plot(x="num_documents", y="Weighted F1 Score", ax=ax)
    all_metrics.plot(x="num_documents", y="Accuracy", ax=ax)

    plt.xlabel("Number of Documents Used for Training/Validation")
    plt.title(
        f"Model Performance by Number of Documents Used for Training/Validation"
        f" - {model_cls.__name__}"
    )
    plt.xlim(0, int(all_metrics["num_documents"].max() * 1.1))
    plt.ylim(0, 1)

    plot_path = run_output_dir / "plot.png"
    fig.savefig(plot_path)

    md = f"# Results: {run.key}\n"
    md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
    md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

    return md
def _do_run(self, run: ModelClassificationRun, run_output_dir: Path) -> str:
    ds = IMDBDataset.load()
    X_train_valid, y_train_valid, X_test, y_test = maybe_limit(
        ds.X_train(), ds.y_train(), ds.X_test(), ds.y_test(), self.dataset_limit
    )

    assert_in("preprocess_func", run.preprocess_func, PREPROCESS_FUNCS)
    preprocess_func = PREPROCESS_FUNCS[run.preprocess_func]
    X_train_valid_preprocessed = preprocess_func(X_train_valid)
    X_test_preprocessed = preprocess_func(X_test)

    assert_valid_model(run.model_name)
    model_cls = getattr(gobbli.model, run.model_name)

    all_results = []
    majority, minority = ClassImbalanceScenario.find_majority_minority_classes(y_test)
    majority_df, minority_df = ClassImbalanceScenario.split_dataset(
        X_train_valid_preprocessed, y_train_valid, majority, minority
    )

    for proportion in self.params["imbalance_proportions"]:
        # Downsample the minority class so the final dataset contains the desired
        # proportion of the minority (the expression below simplifies to
        # proportion / (1 - proportion) of the minority rows)
        orig_len = majority_df.shape[0]
        downsample_proportion = -orig_len / (orig_len - orig_len / proportion)
        minority_sample = minority_df.sample(frac=downsample_proportion).reset_index()
        sampled_df = pd.concat([majority_df, minority_sample])
        X = sampled_df["X"].tolist()
        y = sampled_df["y"].tolist()

        LOGGER.info(
            f"{dt.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')} "
            f"Evaluating proportion {round(proportion, 3)} ({len(X)} obs)"
        )
        results = run_benchmark_experiment(
            f"{self.name}_{run.key}",
            X,
            y,
            model_cls,
            run.param_grid,
            test_dataset=(X_test_preprocessed, y_test),
            run_kwargs=run.run_kwargs,
        )
        all_results.append(results)

    minority_f1_scores = []
    majority_f1_scores = []
    for result in all_results:
        majority_f1, minority_f1 = f1_score(
            result.y_true,
            pred_prob_to_pred_label(result.y_pred_proba),
            average=None,
            labels=[majority, minority],
        )
        minority_f1_scores.append(minority_f1)
        majority_f1_scores.append(majority_f1)

    all_metrics = pd.DataFrame(
        [
            {"imbalance_proportion": p, **r.metrics()}
            for p, r in zip(self.params["imbalance_proportions"], all_results)
        ]
    )
    all_metrics["Minority Class F1 Score"] = minority_f1_scores
    all_metrics["Majority Class F1 Score"] = majority_f1_scores

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot()
    # Plot both series on the same axes; a second full-size subplot would
    # overlap the first and hide its line
    all_metrics.plot(x="imbalance_proportion", y="Minority Class F1 Score", ax=ax)
    all_metrics.plot(x="imbalance_proportion", y="Majority Class F1 Score", ax=ax)

    plt.xlabel("Prevalence of Minority Class")
    plt.title(
        f"Model Performance by Prevalence of Minority Class - {model_cls.__name__}"
    )
    plt.xlim(0, 0.5)
    plt.ylim(0, 1)

    plot_path = run_output_dir / "plot.png"
    fig.savefig(plot_path)

    md = f"# Results: {run.key}\n"
    md += tabulate(all_metrics, tablefmt="pipe", headers="keys")
    md += f"\n\n![Results]({self.get_markdown_relative_path(plot_path)})\n---"

    return md
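# Quick numeric check (standalone; the sizes are invented) of the downsampling
# fraction used above. Assuming the majority and minority DataFrames start at
# the same size, -orig_len / (orig_len - orig_len / proportion) equals
# proportion / (1 - proportion), which yields a combined dataset whose
# minority share is exactly `proportion`.
def _downsample_fraction_check(orig_len=10_000, proportion=0.2):
    frac = -orig_len / (orig_len - orig_len / proportion)
    minority_len = orig_len * frac
    assert abs(frac - proportion / (1 - proportion)) < 1e-9
    assert abs(minority_len / (minority_len + orig_len) - proportion) < 1e-9
    return frac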