def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    encode_cols: dict = {},
    sample: int = -1,
    test_size: float = 0.30,
    train_val_split: float = 0.70,
    test_set_key: str = "test_set",
    model_evaluator=None,
    models_dest: str = "",
    plots_dest: str = "plots",
    file_ext: str = "parquet",
    model_pkg_file: str = "",
    random_state: int = 1,
) -> None:
    """train a classifier

    An optional custom model evaluator can be supplied; it should have the
    signature `my_custom_evaluator(context, xvalid, yvalid, model)` and return
    a dictionary of scalar "results", a "plots" key with a list of
    PlotArtifacts, and a "tables" key containing a list of TableArtifacts.

    :param context:         the function context
    :param model_pkg_class: the model to train, e.g.
                            "sklearn.neural_network.MLPClassifier", or a json
                            model config
    :param dataset:         ("data") name of raw data file
    :param label_column:    ground-truth (y) labels
    :param encode_cols:     dictionary of names and prefixes for columns that
                            are to be one-hot encoded.
    :param sample:          selects the first n rows, or a sample starting from
                            the first; if negative (<-1), selects a random sample
    :param test_size:       (0.30) test set size
    :param train_val_split: (0.70) once the test set has been removed, the
                            training set gets this proportion
    :param test_set_key:    key of held out data in artifact store
    :param model_evaluator: (None) a custom model evaluator can be specified
    :param models_dest:     ("") models subfolder on artifact path
    :param plots_dest:      plot subfolder on artifact path
    :param file_ext:        ("parquet") format for test_set_key hold out data
    :param model_pkg_file:  ("") optional path to a json model config file
    :param random_state:    (1) sklearn rng seed
    """
    models_dest = models_dest or "model"

    raw, labels, header = get_sample(dataset, sample, label_column)

    if encode_cols:
        raw = pd.get_dummies(
            raw,
            columns=list(encode_cols.keys()),
            prefix=list(encode_cols.values()),
            drop_first=True,
        )

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, 1 - train_val_split, random_state
    )

    context.log_dataset(
        test_set_key,
        df=pd.concat([xtest, ytest.to_frame()], axis=1),
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath("data"),
    )

    model_config = gen_sklearn_model(model_pkg_class, context.parameters.items())
    model_config["FIT"].update({"X": xtrain, "y": ytrain.values})

    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])
    model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)
    if model_evaluator:
        eval_metrics = model_evaluator(
            context, xvalid, yvalid, model, plots_artifact_path=plots_path
        )
    else:
        eval_metrics = eval_model_v2(
            context, xvalid, yvalid, model, plots_artifact_path=plots_path
        )

    context.set_label("class", model_pkg_class)
    context.log_model(
        "model",
        body=dumps(model),
        artifact_path=artifact_path,
        extra_data=eval_metrics,
        model_file="model.pkl",
        metrics=context.results,
        labels={"class": model_pkg_class},
    )
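# --- Illustrative sketch (not part of the original source) ------------------
# A minimal custom `model_evaluator` matching the signature documented above.
# The metric and logged result are assumptions; the hub's default behaviour
# lives in `eval_model_v2`. The returned dict is forwarded by `train_model`
# to `context.log_model(..., extra_data=...)`.
def example_custom_evaluator(context, xvalid, yvalid, model, plots_artifact_path=""):
    from sklearn.metrics import accuracy_score

    ypred = model.predict(xvalid)
    context.log_result("validation-accuracy", float(accuracy_score(yvalid, ypred)))
    # PlotArtifact / TableArtifact entries could be added to this dict as well.
    return {}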
def train_model(
    context: MLClientCtx,
    dataset: DataItem,
    model_pkg_class: str,
    label_column: str = "label",
    train_validation_size: float = 0.75,
    sample: float = 1.0,
    models_dest: str = "models",
    test_set_key: str = "test_set",
    plots_dest: str = "plots",
    dask_key: str = "dask_key",
    dask_persist: bool = False,
    scheduler_key: str = "",
    file_ext: str = "parquet",
    random_state: int = 42,
) -> None:
    """
    Train a sklearn classifier with Dask

    :param context:               Function context.
    :param dataset:               Raw data file.
    :param model_pkg_class:       Model to train, e.g.
                                  "sklearn.ensemble.RandomForestClassifier",
                                  or json model config.
    :param label_column:          (label) Ground-truth y labels.
    :param train_validation_size: (0.75) Train validation set proportion out
                                  of the full dataset.
    :param sample:                (1.0) Select sample from dataset (n-rows/%
                                  of total), randomize rows as default.
    :param models_dest:           (models) Models subfolder on artifact path.
    :param test_set_key:          (test_set) Mlrun db key of held out data in
                                  artifact store.
    :param plots_dest:            (plots) Plot subfolder on artifact path.
    :param dask_key:              (dask_key) Key of dataframe in dask client
                                  "datasets" attribute.
    :param dask_persist:          (False) Should the data be persisted
                                  (through `client.persist`).
    :param scheduler_key:         ('') Dask scheduler configuration, json also
                                  logged as an artifact.
    :param file_ext:              (parquet) Format for test_set_key hold out
                                  data.
    :param random_state:          (42) sklearn seed.
    """
    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute():
        raise Exception("NA values found")

    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state
    )

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class, context.parameters.items())
    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])
    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):
        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):
        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed, y_train)  # Fit the training data to the visualizer
        viz.score(X_test_transformed, y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(
            PlotArtifact(report_name, body=viz.fig, title=report_name), db_key=False
        )
        extra_data_dict[report_name] = plot

        if report_name == "ROCAUC":
            context.log_results(
                {"micro": viz.roc_auc.get("micro"), "macro": viz.roc_auc.get("macro")}
            )
        elif report_name == "ClassificationReport":
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:
                    context.log_results(
                        {f"{score_name}-{score_class}": viz.scores_[score_name].get(score_class)}
                    )

    viz = FeatureImportances(
        model,
        classes=classes,
        per_class=True,
        is_fitted=True,
        labels=df_header.delete(df_header.get_loc(label_column)),
    )
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test.compute())

    plot = context.log_artifact(
        PlotArtifact("FeatureImportances", body=viz.fig, title="FeatureImportances"),
        db_key=False,
    )
    extra_data_dict["FeatureImportances"] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label("class", model_pkg_class)

    context.log_model(
        "model",
        body=dumps(model),
        artifact_path=artifact_path,
        model_file="model.pkl",
        extra_data=extra_data_dict,
        metrics=context.results,
        labels={"class": model_pkg_class},
    )

    context.log_artifact(
        "standard_scaler",
        body=dumps(scaler),
        artifact_path=artifact_path,
        model_file="scaler.gz",
        label="standard_scaler",
    )

    context.log_artifact(
        "label_encoder",
        body=dumps(encoder),
        artifact_path=artifact_path,
        model_file="encoder.gz",
        label="label_encoder",
    )

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save, columns=df_header),  # improve log dataset ability
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath("data"),
    )

    context.logger.info("Done!")
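# --- Illustrative sketch (not part of the original source) ------------------
# Both train_model variants accept `model_pkg_class` as either a class path or
# a json model config. Judging from how the config is consumed above
# (`model_config["META"]["class"]`, `ClassifierClass(**model_config["CLASS"])`,
# `model.fit(**model_config["FIT"])`), a config of roughly this shape is
# assumed; the concrete values under CLASS/FIT are placeholders:
EXAMPLE_MODEL_CONFIG = {
    "META": {"class": "sklearn.ensemble.RandomForestClassifier"},
    "CLASS": {"n_estimators": 100, "max_depth": 4},  # constructor kwargs
    "FIT": {},  # fit kwargs; X and y are injected by train_model at runtime
}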
def feature_selection(
    context,
    df_artifact,
    k=2,
    min_votes=0.5,
    label_column: str = 'Y',
    stat_filters=['f_classif', 'mutual_info_classif', 'chi2', 'f_regression'],
    model_filters={
        'LinearSVC': 'LinearSVC',
        'LogisticRegression': 'LogisticRegression',
        'ExtraTreesClassifier': 'ExtraTreesClassifier',
    },
    max_scaled_scores=True,
):
    """Applies selected feature selection statistical functions or models on
    our 'df_artifact'.

    Each statistical function or model will vote for its best K selected
    features. If a feature has >= 'min_votes' votes, it will be selected.

    :param context:           the function context
    :param df_artifact:       the input dataset (csv or parquet)
    :param k:                 number of top features to select from each
                              statistical function or model
    :param min_votes:         minimal number of votes (from a model or by
                              statistical function) needed for a feature to be
                              selected. Can be specified by percentage of votes
                              or absolute number of votes
    :param label_column:      ground-truth (y) labels
    :param stat_filters:      statistical functions to apply to the features
                              (from sklearn.feature_selection)
    :param model_filters:     models to use for feature evaluation, can be
                              specified by model name (ex. LinearSVC),
                              formalized json (contains 'CLASS', 'FIT', 'META')
                              or a path to such json file.
    :param max_scaled_scores: produce feature scores table scaled with
                              max_scaler
    """
    # Read input DF
    df_path = str(df_artifact)
    context.logger.info(f'input dataset {df_path}')
    if df_path.endswith('csv'):
        df = pd.read_csv(df_path)
    elif df_path.endswith('parquet') or df_path.endswith('pq'):
        df = pd.read_parquet(df_path)

    # Set feature vector and labels
    y = df.pop(label_column)
    X = df

    # Create selected statistical estimators
    stat_functions_list = {
        stat_name: SelectKBest(
            create_class(f'sklearn.feature_selection.{stat_name}'), k
        )
        for stat_name in stat_filters
    }
    requires_abs = ['chi2']

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns)
    for stat_name, stat_func in stat_functions_list.items():
        try:
            # Compute statistics (chi2 requires non-negative feature values)
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(
                index=X.columns, columns=[stat_name], data=stat.scores_
            )
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
        except Exception as e:
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if '.json' in model:
            with open(model, 'r') as f:
                current_model = json.load(f)
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model_name]()
        else:
            try:
                current_model = json.loads(model) if isinstance(model, str) else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
            except Exception:
                context.logger.info(f'unable to load {model}')

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance (coef_ is 2D per class, take the
        # first row; feature_importances_ is already 1D)
        if hasattr(select_from_model.estimator_, 'coef_'):
            scores = select_from_model.estimator_.coef_[0]
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            scores = select_from_model.estimator_.feature_importances_

        stat_df = pd.DataFrame(index=X.columns, columns=[model_name], data=scores)
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(
        key='feature_scores',
        df=result_matrix_df,
        local_path='feature_scores.parquet',
        format='parquet',
    )

    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(
            data=normalized_df,
            columns=result_matrix_df.columns,
            index=result_matrix_df.index,
        )
        context.log_dataset(
            key='max_scaled_scores_feature_scores',
            df=normalized_df,
            local_path='max_scaled_scores_feature_scores.parquet',
            format='parquet',
        )

    # Create feature count DataFrame: each score column is overwritten with a
    # 0/1 vote indicator (selected_features_agg keys match the column names),
    # so the row sum is the number of votes per feature
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [
            1 if x in selected_features_agg[test_name] else 0 for x in X.columns
        ]
    result_matrix_df.loc[:, 'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(
        key='selected_features_count',
        df=result_matrix_df,
        local_path='selected_features_count.parquet',
        format='parquet',
    )

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = result_matrix_df[
        result_matrix_df.num_votes >= votes_needed
    ].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(
        key='selected_features',
        df=final_df,
        local_path='selected_features.parquet',
        format='parquet',
    )
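# Worked example (illustrative) of the `min_votes` threshold above: with the
# default 4 stat_filters and 3 model_filters there are 7 voters, so a
# fractional min_votes is converted to an absolute vote count:
#
#     votes_needed = int(np.floor(7 * max(min(0.5, 1), 0)))  # -> 3
#
# while an integer min_votes (e.g. 3) is used as the vote count directly.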
def feature_selection(
    context,
    df_artifact,
    k: int = 5,
    min_votes: float = 0.5,
    label_column: str = None,
    stat_filters: list = ['f_classif', 'mutual_info_classif', 'chi2', 'f_regression'],
    model_filters: dict = {
        'LinearSVC': 'LinearSVC',
        'LogisticRegression': 'LogisticRegression',
        'ExtraTreesClassifier': 'ExtraTreesClassifier',
    },
    max_scaled_scores: bool = True,
    sample_ratio: float = None,
    output_vector_name: str = None,
    ignore_type_errors: bool = False,
    is_feature_vector: bool = False,
):
    """Applies selected feature selection statistical functions or models on
    our 'df_artifact'.

    Each statistical function or model will vote for its best K selected
    features. If a feature has >= 'min_votes' votes, it will be selected.

    :param context:            the function context.
    :param df_artifact:        the input dataset or feature vector to select
                               features from.
    :param k:                  number of top features to select from each
                               statistical function or model.
    :param min_votes:          minimal number of votes (from a model or by
                               statistical function) needed for a feature to
                               be selected. Can be specified by percentage of
                               votes or absolute number of votes.
    :param label_column:       ground-truth (y) labels.
    :param stat_filters:       statistical functions to apply to the features
                               (from sklearn.feature_selection).
    :param model_filters:      models to use for feature evaluation, can be
                               specified by model name (ex. LinearSVC),
                               formalized json (contains 'CLASS', 'FIT',
                               'META') or a path to such json file.
    :param max_scaled_scores:  produce feature scores table scaled with
                               max_scaler.
    :param sample_ratio:       percentage of the dataset the user wishes to
                               compute the feature selection process on.
    :param output_vector_name: creates a new feature vector containing only
                               the identified features.
    :param ignore_type_errors: skips datatypes that are neither float nor int
                               within the feature vector.
    :param is_feature_vector:  bool stating if the data is passed as a
                               feature vector.
    """
    # Check if df.meta is valid; if it is, look for a feature vector
    if df_artifact.meta:
        if df_artifact.meta.kind == mlrun.api.schemas.ObjectKind.feature_vector:
            is_feature_vector = True

    # Look inside meta.spec.label_feature to identify the label_column if the
    # user did not specify it
    if label_column is None:
        if is_feature_vector:
            label_column = df_artifact.meta.spec.label_feature.split('.')[1]
        else:
            raise ValueError('No label_column was given, please add a label_column.')

    # Use the feature vector as dataframe
    df = df_artifact.as_df()

    # Ensure k is not bigger than the total number of features
    if k > df.shape[1]:
        raise ValueError(
            f'K cannot be bigger than the total number of features '
            f'({df.shape[1]}). Please choose a smaller K.'
        )
    elif k < 1:
        raise ValueError('K cannot be smaller than 1. Please choose a bigger K.')

    # Create a sample dataframe of the original feature vector
    if sample_ratio:
        df = (
            df.groupby(label_column)
            .apply(lambda x: x.sample(frac=sample_ratio))
            .reset_index(drop=True)
        )
    df = df.dropna()

    # Set feature vector and labels
    y = df.pop(label_column)
    X = df

    # the builtin `object` matches the pandas object dtype (np.object was
    # removed from NumPy)
    if object in list(X.dtypes) and ignore_type_errors is False:
        raise ValueError(
            f"{df.select_dtypes(include=['object']).columns.tolist()} "
            f"are neither float nor int."
        )

    # Create selected statistical estimators
    stat_functions_list = {
        stat_name: SelectKBest(
            create_class(f'sklearn.feature_selection.{stat_name}'), k
        )
        for stat_name in stat_filters
    }
    requires_abs = ['chi2']

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns).dropna()
    for stat_name, stat_func in stat_functions_list.items():
        try:
            # chi2 requires non-negative feature values
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(
                index=X.columns, columns=[stat_name], data=stat.scores_
            )
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
        except Exception as e:
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if '.json' in model:
            with open(model, 'r') as f:
                current_model = json.load(f)
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model_name]()
        else:
            try:
                current_model = json.loads(model) if isinstance(model, str) else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
            except Exception:
                context.logger.info(f'unable to load {model}')

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        if model_name == 'LogisticRegression':
            model.set_params(solver='liblinear')

        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance (coef_ is 2D per class, take the
        # first row; feature_importances_ is already 1D)
        if hasattr(select_from_model.estimator_, 'coef_'):
            scores = select_from_model.estimator_.coef_[0]
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            scores = select_from_model.estimator_.feature_importances_

        stat_df = pd.DataFrame(index=X.columns, columns=[model_name], data=scores)
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(
        key='feature_scores',
        df=result_matrix_df,
        local_path='feature_scores.parquet',
        format='parquet',
    )

    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(
            data=normalized_df,
            columns=result_matrix_df.columns,
            index=result_matrix_df.index,
        )
        context.log_dataset(
            key='max_scaled_scores_feature_scores',
            df=normalized_df,
            local_path='max_scaled_scores_feature_scores.parquet',
            format='parquet',
        )

    # Create feature count DataFrame: each score column is overwritten with a
    # 0/1 vote indicator (selected_features_agg keys match the column names),
    # so the row sum is the number of votes per feature
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [
            1 if x in selected_features_agg[test_name] else 0 for x in X.columns
        ]
    result_matrix_df.loc[:, 'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(
        key='selected_features_count',
        df=result_matrix_df,
        local_path='selected_features_count.parquet',
        format='parquet',
    )

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = result_matrix_df[
        result_matrix_df.num_votes >= votes_needed
    ].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(
        key='selected_features',
        df=final_df,
        local_path='selected_features.parquet',
        format='parquet',
    )

    # Create a new feature vector containing only the identified top features
    if is_feature_vector and df_artifact.meta.spec.features and output_vector_name:
        # Select the top K features by vote count
        selected_features = (
            result_matrix_df.sort_values(by='num_votes', ascending=False).head(k).index
        )

        # Match the selected feature names to the feature-store feature annotations
        matched_selections = [
            feature
            for feature in list(df_artifact.meta.spec.features)
            for selected in list(selected_features)
            if feature.endswith(selected)
        ]

        # Define the new feature vector
        top_features_fv = fs.FeatureVector(
            output_vector_name,
            matched_selections,
            label_feature="labels.label",
            description='feature vector composed strictly of our top features',
        )

        # Save and materialize the new vector
        top_features_fv.save()
        fs.get_offline_features(top_features_fv, target=ParquetTarget())

        # Log the new feature vector URI
        context.log_result('top_features_vector', top_features_fv.uri)
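# --- Illustrative usage (not part of the original source) -------------------
# A hypothetical handler invocation; the input key "data" and the vector name
# are placeholders, and `context.get_input` is assumed to resolve the DataItem:
#
#     feature_selection(
#         context,
#         df_artifact=context.get_input("data"),
#         k=5,
#         label_column="label",
#         output_vector_name="top-5-features",  # triggers feature vector creation
#     )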
def train_model(
    context: MLClientCtx,
    model_type: str,
    dataset: Union[DataItem, pd.core.frame.DataFrame],
    label_column: str = "labels",
    encode_cols: dict = {},
    sample: int = -1,
    imbal_vec=[],
    test_size: float = 0.25,
    valid_size: float = 0.75,
    random_state: int = 1,
    models_dest: str = "models",
    plots_dest: str = "plots",
    eval_metrics: list = ["error", "auc"],
    file_ext: str = "parquet",
    test_set: str = "test_set",
) -> None:
    """train an xgboost model.

    Note on imbalanced data: the `imbal_vec` parameter represents the measured
    class representations in the sample and can be used as a first step in
    tuning an XGBoost model. This isn't a hyperparameter, merely an estimate
    that should be held constant throughout the tuning process.

    :param context:      the function context
    :param model_type:   the model type to train, "classifier", "regressor"...
    :param dataset:      ("data") name of raw data file
    :param label_column: ground-truth (y) labels
    :param encode_cols:  dictionary of names and prefixes for columns that are
                         to be one-hot encoded.
    :param sample:       selects the first n rows, or a sample starting from
                         the first; if negative (<-1), selects a random sample
    :param imbal_vec:    ([]) vector of class weights seen in sample
    :param test_size:    (0.25) test set size
    :param valid_size:   (0.75) once the test set has been removed, the
                         training set gets this proportion
    :param random_state: (1) sklearn rng seed
    :param models_dest:  destination subfolder for model artifacts
    :param plots_dest:   destination subfolder for plot artifacts
    :param eval_metrics: (["error", "auc"]) learning curve metrics
    :param file_ext:     format for test_set hold out data
    :param test_set:     (test_set) key of held out data in artifact store
    """
    models_dest = models_dest or "models"
    plots_dest = plots_dest or f"plots/{context.name}"

    raw, labels, header = get_sample(dataset, sample, label_column)

    if encode_cols:
        raw = pd.get_dummies(
            raw,
            columns=list(encode_cols.keys()),
            prefix=list(encode_cols.values()),
            drop_first=True,
        )

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, valid_size, random_state
    )

    context.log_dataset(
        test_set,
        df=pd.concat([xtest, ytest], axis=1),
        format=file_ext,
        index=False,
    )

    model_config = _gen_xgb_model(model_type, context.parameters.items())

    XGBBoostClass = create_class(model_config["META"]["class"])
    model = XGBBoostClass(**model_config["CLASS"])

    model_config["FIT"].update(
        {
            "X": xtrain,
            "y": ytrain.values,
            "eval_set": [(xtrain, ytrain), (xvalid, yvalid)],
            "eval_metric": eval_metrics,
        }
    )

    model.fit(**model_config["FIT"])

    eval_metrics = eval_model_v2(context, xvalid, yvalid, model)

    model_bin = dumps(model)
    context.log_model(
        "model",
        body=model_bin,
        artifact_path=os.path.join(context.artifact_path, models_dest),
        model_file="model.pkl",
    )
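# --- Illustrative sketch (not part of the original source) ------------------
# One plausible way to estimate `imbal_vec` before tuning: use the observed
# class proportions of the sample, which stay constant across tuning runs.
# For a binary label split 90/10 this yields [0.9, 0.1]:
#
#     imbal_vec = labels.value_counts(normalize=True).sort_index().tolist()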