def load_california_housing(data_dir: str) -> Dict[str, pd.DataFrame]:
    """Loads the California housing data from data_dir."""
    data_path = _get_full_data_path(data_dir, "california_housing", "housing.csv")
    logger.info(f"Loading dataframe from {data_path}")
    return {"housing": pd.read_csv(data_path)}
def load_or_train_model(model_path, pipeline_config_path) -> BaseModelWrapper:
    """Loads or trains a model, returns a model wrapper."""
    if not (model_path or pipeline_config_path):
        raise ValueError(
            "At least one of model_path and pipeline_config_path must be not None."
        )
    if not pipeline_config_path:
        pipeline_config_path = ExperimentManger.get_config_path_from_model_path(
            model_path
        )
    if not model_path:
        try:
            logger.info(
                f"Searching for the last run dir with {pipeline_config_path} config."
            )
            run_dir = ExperimentManger(pipeline_config_path).get_most_recent_run_dir()
            # TODO: create a function/method in experiment_manager to find model_path
            # in run_dir.
            model_path = str(Path(run_dir) / "model_0")
        except IOError:
            import tabml.pipelines

            pipeline = tabml.pipelines.BasePipeline(pipeline_config_path)
            pipeline.run()
            return pipeline.model_wrapper
    config = parse_pipeline_config(pipeline_config_path)
    return initialize_model_wrapper(config.model_wrapper, model_path)
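# Usage sketch for load_or_train_model (the config path below is hypothetical; it
# assumes the corresponding pipeline has been run at least once so a recent run dir
# with a saved model exists, otherwise a fresh pipeline is trained as in the code
# above):
#
#     model_wrapper = load_or_train_model(
#         model_path=None,
#         pipeline_config_path="configs/titanic_config.yaml",
#     )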
def _analyze_on_feature(
    self, df_with_pred: pd.DataFrame, feature_name: str, dataset_name: str
):
    """Analyzes the predictions based on one feature."""
    # Get a list of metric score dicts, one for each value of feature_name.
    list_of_group_dicts = [
        self._get_metric_dict(feature_name, feature_value, group)
        for feature_value, group in df_with_pred.groupby(feature_name)
    ]
    # Add the overall score.
    overall_scores = self._get_metric_dict(feature_name, "OVERALL", df_with_pred)
    list_of_group_dicts.append(overall_scores)
    if self._show_overall_flag:
        self._show_and_log_overall_scores(overall_scores, dataset_name)
        self._show_overall_flag = False
    df_group = pd.DataFrame(list_of_group_dicts).sort_values(
        by=self.metrics[0].score_names[0],
        ascending=self.metrics[0].is_higher_better,
    )
    saved_path = self._get_df_feature_metric_csv_path(dataset_name, feature_name)
    df_group.to_csv(saved_path, index=False)
    logger.info(
        f"Saved model analysis slicing against {feature_name} to {saved_path}"
    )
def load_titanic(data_dir: str) -> Dict[str, pd.DataFrame]:
    """Loads data from data_dir folder if they exist, downloads otherwise."""
    train_path = _get_full_data_path(data_dir, "titanic", "train.csv")
    test_path = _get_full_data_path(data_dir, "titanic", "test.csv")
    gender_submission_path = _get_full_data_path(
        data_dir, "titanic", "gender_submission.csv"
    )
    logger.info(
        f"Loading dataframes from {train_path}, {test_path}, and "
        f"{gender_submission_path}"
    )
    return {
        "train": pd.read_csv(train_path),
        "test": pd.read_csv(test_path),
        "gender_submission": pd.read_csv(gender_submission_path),
    }
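# Usage sketch for load_titanic ("./data" is a hypothetical local data directory;
# per the docstring, missing files are downloaded on first use):
#
#     dataframes = load_titanic("./data")
#     train_df, test_df = dataframes["train"], dataframes["test"]
#     print(train_df.shape, test_df.shape)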
def save_as_pickle(an_object: Any, path: str, filename: str) -> None:
    """Saves an object as a pickle file.

    Args:
        an_object: A python object. Can be a list, dict, etc.
        path: The directory where the file is saved.
        filename: The filename.
    """
    Path(path).mkdir(parents=True, exist_ok=True)
    file_path = Path(path) / filename
    with open(file_path, "wb") as handle:
        pickle.dump(an_object, handle)
    logger.info(f"File is saved successfully to {file_path}.")
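# Usage sketch for save_as_pickle (the directory, filename, and object being saved
# are made up for illustration):
#
#     feature_config = {"used_columns": ["Age", "Fare"], "label": "Survived"}
#     save_as_pickle(feature_config, "outputs/run_0", "feature_config.pkl")
#     # -> writes outputs/run_0/feature_config.pkl, creating the directory if needed.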
def analyze_model(self) -> None:
    """Analyzes the model on the validation dataset.

    The trained model is evaluated based on metrics for predictions sliced by each
    categorical feature specified by features_to_analyze.
    """
    logger.info("Model Analysis")
    assert len(self.config.model_analysis.metrics) > 0, (
        "At least one metric in model_analysis must be specified. "
        "Add the metrics in model_analysis in the pipeline config."
    )
    assert len(self.config.model_analysis.by_features) > 0, (
        "At least one by_features in model_analysis must be specified. "
        "Add the by_features in model_analysis in the pipeline config."
    )
    ModelAnalysis(
        data_loader=self.data_loader,
        model_wrapper=self.model_wrapper,
        params=self.config.model_analysis,
        output_dir=self.exp_manager.get_model_analysis_dir(),
    ).analyze()
def show_feature_importance(data: typing.Dict[str, float]) -> None:
    """Shows feature importance in the terminal.

    Importances are shown in descending order. Note that termgraph (a tool for
    visualizing graphs in the terminal) only shows meaningful graphs with positive
    values. Fortunately, the XGB model only outputs positive feature importances.
    If LGBM or Keras models, if any, have negative feature importances, a quick
    modification is to visualize the absolute values. In fact, the magnitudes of
    feature importances are more _important_ than their actual values.

    Args:
        data: a dictionary of {feature: importance}

    Raises:
        ValueError if any importance is negative.
    """
    assert data, "input dictionary cannot be None or empty"
    tmp_file = tempfile.NamedTemporaryFile()
    # Sort features by descending importance.
    feature_importance_tuples = sorted(data.items(), key=lambda x: -x[1])
    if feature_importance_tuples[-1][-1] < 0:
        raise ValueError(
            f"All feature importances need to be non-negative, got data = {data}"
        )
    # Write to file.
    # TODO: find a way to visualize data directly rather than saving it in an
    # intermediate file.
    with open(tmp_file.name, "w") as fout:
        for feature, importance in feature_importance_tuples:
            fout.write(f"{feature}, {importance}\n")
    logger.info("Feature importance:")
    logger.info(subprocess.getoutput(f"termgraph {tmp_file.name}"))
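# Usage sketch for show_feature_importance (requires the termgraph CLI on PATH; the
# feature names and scores are made up for illustration):
#
#     show_feature_importance({"Fare": 120.0, "Age": 75.5, "Sex": 40.0})
#     # Logs a horizontal bar chart of the importances, largest first.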
def _show_and_log_overall_scores(
    self, overall_scores: Dict[str, Any], dataset_name: str
) -> None:
    logger.info("=" * 20 + f" OVERALL SCORES on {dataset_name} dataset " + "=" * 20)
    logger.info("{:<20}: {}".format("Num samples", overall_scores["sample_count"]))
    for key, val in overall_scores.items():
        # Skip the slicing-feature column (whose value is "OVERALL") and the sample
        # count, which was already logged above.
        if val == "OVERALL" or key == "sample_count":
            continue
        logger.info("{:<20}: {}".format(key, val))
        mlflow.log_metrics({key: val})
def __init__(self, path_to_config: str, custom_run_dir: str = ""):
    logger.info("=" * 80)
    logger.info(f"Running pipeline with config {path_to_config}")
    logger.info("=" * 80)
    self.exp_manager = experiment_manager.ExperimentManger(
        path_to_config, custom_run_dir=Path(custom_run_dir)
    )
    self.config = parse_pipeline_config(path_to_config)
    self.data_loader = self._get_data_loader()
    assert self.data_loader.label_col is not None, "label_col must be specified"
    self.model_wrapper = model_wrappers.initialize_model_wrapper(
        self.config.model_wrapper
    )
    logger.add(self.exp_manager.get_log_path())
def train(self):
    """Trains the model on data from the data loader."""
    model_dir = self.exp_manager.run_dir
    logger.info("Start training the model.")
    self.model_wrapper.fit(self.data_loader, model_dir)
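# Usage sketch for the pipeline (hypothetical config path; assuming these methods
# belong to the tabml.pipelines.BasePipeline class referenced in load_or_train_model
# above, where pipeline.run() leaves a trained model_wrapper behind):
#
#     pipeline = BasePipeline("configs/titanic_config.yaml")
#     pipeline.run()
#     trained_wrapper = pipeline.model_wrapper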