Example #1
def train_model(
    context: MLClientCtx,
    model_pkg_class: str,
    dataset: DataItem,
    label_column: str = "labels",
    encode_cols: dict = {},
    sample: int = -1,
    test_size: float = 0.30,
    train_val_split: float = 0.70,
    test_set_key: str = "test_set",
    model_evaluator = None,
    models_dest: str = "",
    plots_dest: str = "plots",
    file_ext: str = "parquet",
    model_pkg_file: str = "",
    random_state: int = 1,
) -> None:
    """train a classifier
    
    An optional custom model evaluator can be supplied; it should have the signature
    `my_custom_evaluator(context, xvalid, yvalid, model)` and return a dictionary with
    scalar "results", a "plots" key with a list of PlotArtifacts, and a "tables" key
    containing a list of TableArtifacts (a minimal evaluator sketch follows this example).
    
    :param context:           the function context
    :param model_pkg_class:   the model to train, e.g., "sklearn.neural_network.MLPClassifier",
                              or a json model config
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       dictionary of names and prefixes for columns that are
                              to be one-hot encoded.
    :param sample:            selects the first n rows; if negative, a random sample
                              is taken instead
    :param test_size:         (0.30) test set size
    :param train_val_split:   (0.70) Once the test set has been removed the
                              training set gets this proportion.
    :param test_set_key:      key of held out data in artifact store
    :param model_evaluator:   (None) a custom model evaluator can be specified
    :param models_dest:       ("") models subfolder on artifact path
    :param plots_dest:        plot subfolder on artifact path
    :param file_ext:          ("parquet") format for test_set_key hold out data
    :param random_state:      (1) sklearn rng seed

    """
    models_dest = models_dest or "model"
    
    raw, labels, header = get_sample(dataset, sample, label_column)
    
    if encode_cols:
        raw = pd.get_dummies(raw, 
                             columns=list(encode_cols.keys()), 
                             prefix=list(encode_cols.values()), 
                             drop_first=True)
    
    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, 1 - train_val_split, random_state)
    
    context.log_dataset(test_set_key, 
                        df=pd.concat([xtest, ytest.to_frame()], axis=1),
                        format=file_ext, index=False, 
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": xtrain,
                                "y": ytrain.values})
    
    ClassifierClass = create_class(model_config["META"]["class"])
    
    model = ClassifierClass(**model_config["CLASS"])
    
    model.fit(**model_config["FIT"])
    
    artifact_path = context.artifact_subpath(models_dest)
    plots_path = context.artifact_subpath(models_dest, plots_dest)
    if model_evaluator:
        eval_metrics = model_evaluator(context, xvalid, yvalid, model,
                                       plots_artifact_path=plots_path)
    else:
        eval_metrics = eval_model_v2(context, xvalid, yvalid, model,
                                     plots_artifact_path=plots_path)
        
    context.set_label('class', model_pkg_class)
    context.log_model("model", body=dumps(model),
                      artifact_path=artifact_path,
                      extra_data=eval_metrics, 
                      model_file="model.pkl",
                      metrics=context.results,
                      labels={"class": model_pkg_class})
Example #2
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask
    
    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g., "sklearn.ensemble.RandomForestClassifier",
                                    or json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train validation set proportion out of the full dataset.
    :param sample:                  (1.0) Fraction of the dataset to sample; rows are randomized by default.
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_key:                (dask key) Key of dataframe in dask client "datasets" attribute.
    :param dask_persist:            (False) Should the data be persisted (through the `client.persist`)
    :param scheduler_key:           (scheduler) Dask scheduler configuration, json also logged as an artifact.
    :param file_ext:                (parquet) format for test_set_key hold out data
    :param random_state:            (42) sklearn seed
    """

    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    if df.isna().any().any().compute():
        raise Exception('NA values found')

    df_header = df.columns

    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    with joblib.parallel_backend("dask"):

        model = model.fit(**model_config["FIT"])

    artifact_path = context.artifact_subpath(models_dest)

    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):

        report_name = str(report.__name__)
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed,
                y_train)  # Fit the training data to the visualizer
        viz.score(X_test_transformed,
                  y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        extra_data_dict[report_name] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })

        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:

                    context.log_results({
                        score_name + "-" + score_class:
                        viz.scores_[score_name].get(score_class)
                    })

    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    artifact_path = context.artifact_subpath(models_dest)

    plots_path = context.artifact_subpath(models_dest, plots_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save,
                        columns=df_header),  # improve log dataset ability
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
Example #3
def feature_selection(
        context,
        df_artifact,
        k=2,
        min_votes=0.5,
        label_column: str = 'Y',
        stat_filters=[
            'f_classif', 'mutual_info_classif', 'chi2', 'f_regression'
        ],
        model_filters={
            'LinearSVC': 'LinearSVC',
            'LogisticRegression': 'LogisticRegression',
            'ExtraTreesClassifier': 'ExtraTreesClassifier'
        },
        max_scaled_scores=True):
    """Applies selected feature selection statistical functions
    or models on our 'df_artifact'.

    Each statistical function or model will vote for its best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    :param context:           the function context
    :param df_artifact:       the input dataset (csv or parquet file) to run
                              feature selection on
    :param k:                 number of top features to select from each statistical
                              function or model
    :param min_votes:         minimal number of votes (from a model or by statistical
                              function) needed for a feature to be selected.
                              Can be specified by percentage of votes or absolute
                              number of votes
    :param label_column:      ground-truth (y) labels
    :param stat_filters:      statistical functions to apply to the features
                              (from sklearn.feature_selection)
    :param model_filters:     models to use for feature evaluation, can be specified by
                              model name (ex. LinearSVC), formalized json (contains 'CLASS',
                              'FIT', 'META') or a path to such json file.
    :param max_scaled_scores: produce feature scores table scaled with max_scaler
    """

    # Read input DF
    df_path = str(df_artifact)
    context.logger.info(f'input dataset {df_path}')
    if df_path.endswith('csv'):
        df = pd.read_csv(df_path)
    elif df_path.endswith('parquet') or df_path.endswith('pq'):
        df = pd.read_parquet(df_path)

    # Set feature vector and labels
    y = df.pop(label_column)
    X = df

    # Create selected statistical estimators
    stat_functions_list = {
        stat_name:
        SelectKBest(create_class(f'sklearn.feature_selection.{stat_name}'), k)
        for stat_name in stat_filters
    }
    requires_abs = ['chi2']

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns)
    for stat_name, stat_func in stat_functions_list.items():
        try:
            # Compute statistics
            # chi2 requires non-negative values, so use abs(X) for filters in requires_abs
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(index=X.columns,
                                   columns=[stat_name],
                                   data=stat.scores_)
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
        except Exception as e:
            context.logger.info(
                f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(
        all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if '.json' in model:
            current_model = json.load(open(model, 'r'))
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(
                **current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model_name]()
        else:
            try:
                current_model = json.loads(model) if isinstance(
                    model, str) else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(
                    **current_model["CLASS"])
            except Exception:
                context.logger.info(f'unable to load {model}')

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance
        if hasattr(select_from_model.estimator_, 'coef_'):
            stat_df = select_from_model.estimator_.coef_
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            stat_df = select_from_model.estimator_.feature_importances_
        stat_df = pd.DataFrame(index=X.columns,
                               columns=[model_name],
                               data=stat_df[0])
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(key='feature_scores',
                        df=result_matrix_df,
                        local_path='feature_scores.parquet',
                        format='parquet')
    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf],
                                                 np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(data=normalized_df,
                                     columns=result_matrix_df.columns,
                                     index=result_matrix_df.index)
        context.log_dataset(
            key='max_scaled_scores_feature_scores',
            df=normalized_df,
            local_path='max_scaled_scores_feature_scores.parquet',
            format='parquet')

    # Create feature count DataFrame
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [
            1 if x in selected_features_agg[test_name] else 0
            for x in X.columns
        ]
    result_matrix_df.loc[:, 'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(key='selected_features_count',
                        df=result_matrix_df,
                        local_path='selected_features_count.parquet',
                        format='parquet')

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = result_matrix_df[
        result_matrix_df.num_votes >= votes_needed].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(key='selected_features',
                        df=final_df,
                        local_path='selected_features.parquet',
                        format='parquet')
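
To make the vote threshold concrete: with the default four statistical filters and three model filters, a fractional min_votes of 0.5 works out to floor(7 * 0.5) = 3 votes, while an integer min_votes is used as-is.

import numpy as np

num_filters = 4 + 3  # len(stat_filters) + len(model_filters) with the defaults above
min_votes = 0.5
votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
assert votes_needed == 3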
Example #4
def feature_selection(context,
                      df_artifact,
                      k: int=5,
                      min_votes: float=0.5,
                      label_column: str=None,
                      stat_filters: list=['f_classif', 'mutual_info_classif', 'chi2', 'f_regression'],
                      model_filters: dict={'LinearSVC': 'LinearSVC',
                                     'LogisticRegression': 'LogisticRegression',
                                     'ExtraTreesClassifier': 'ExtraTreesClassifier'},
                      max_scaled_scores: bool=True,
                      sample_ratio: float=None,
                      output_vector_name: str=None,
                      ignore_type_errors: bool=False,
                      is_feature_vector: bool=False):
    
    """Applies selected feature selection statistical functions
    or models on our 'df_artifact'.

    Each statistical function or model will vote for its best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    :param context:           the function context.
    
    :param df_artifact:       the input dataset or feature vector to run feature selection on.
    
    :param k:                 number of top features to select from each statistical
                              function or model.
                              
    :param min_votes:         minimal number of votes (from a model or by statistical
                              function) needed for a feature to be selected.
                              Can be specified by percentage of votes or absolute
                              number of votes.
                              
    :param label_column:      ground-truth (y) labels.
    
    :param stat_filters:      statistical functions to apply to the features
                              (from sklearn.feature_selection).
                              
    :param model_filters:     models to use for feature evaluation, can be specified by
                              model name (ex. LinearSVC), formalized json (contains 'CLASS',
                              'FIT', 'META') or a path to such json file.
                              
    :param max_scaled_scores: produce feature scores table scaled with max_scaler.

    :param sample_ratio: fraction of the dataset on which the user wishes to run the feature selection process.
    
    :param output_vector_name: creates a new feature vector containing only the identified features.
    
    :param ignore_type_errors: skips datatypes that are neither float nor int within the feature vector.
    
    :param is_feature_vector: bool stating whether the data is passed as a feature vector.
    """
        
    # Check if df.meta is valid, if it is, look for a feature vector
    if df_artifact.meta:
        if df_artifact.meta.kind == mlrun.api.schemas.ObjectKind.feature_vector:
            is_feature_vector = True
    
    # Look inside meta.spec.label_feature to identify the label_column if the user did not specify it
    if label_column is None:
        if is_feature_vector:
            label_column = df_artifact.meta.spec.label_feature.split('.')[1]
        else:
            raise ValueError('No label_column was given, please add a label_column.')
    
    # Use the feature vector as dataframe
    df = df_artifact.as_df()
    
    # Ensure k is not bigger than the total number of features
    if k > df.shape[1]:
        raise ValueError(f'K cannot be bigger than the total number of features ({df.shape[1]}). Please choose a smaller K.')
    elif k < 1:
        raise ValueError('K cannot be smaller than 1. Please choose a bigger K.')
        
    # Create a sample dataframe of the original feature vector
    if sample_ratio:
        df = df.groupby(label_column).apply(lambda x: x.sample(frac=sample_ratio)).reset_index(drop=True)
        df = df.dropna()
        
    # Set feature vector and labels
    y = df.pop(label_column)
    X = df
    
    if object in list(X.dtypes) and ignore_type_errors is False:
        raise ValueError(f"{df.select_dtypes(include=['object']).columns.tolist()} are neither float nor int.")
        
    # Create selected statistical estimators
    stat_functions_list = {stat_name: SelectKBest(create_class(f'sklearn.feature_selection.{stat_name}'), k)
                           for stat_name in stat_filters}
    requires_abs = ['chi2']

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns).dropna()
                
    for stat_name, stat_func in stat_functions_list.items():
        try:
            # chi2 requires non-negative values, so use abs(X) for filters in requires_abs
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(index=X.columns,
                                   columns=[stat_name],
                                   data=stat.scores_)
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
            
        except Exception as e:
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if '.json' in model:
            current_model = json.load(open(model, 'r'))
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model_name]()
            
        else:
            try:
                current_model = json.loads(model) if isinstance(model, str) else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
            except Exception:
                context.logger.info(f'unable to load {model}')

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        if model_name == 'LogisticRegression':
            model.set_params(solver='liblinear')
            
        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance
        if hasattr(select_from_model.estimator_, 'coef_'):
            stat_df = select_from_model.estimator_.coef_
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            stat_df = select_from_model.estimator_.feature_importances_

        stat_df = pd.DataFrame(index=X.columns,
                               columns=[model_name],
                               data=stat_df[0])
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(key='feature_scores',
                        df=result_matrix_df,
                        local_path='feature_scores.parquet',
                        format='parquet')
    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(data=normalized_df,
                                     columns=result_matrix_df.columns,
                                     index=result_matrix_df.index)
        context.log_dataset(key='max_scaled_scores_feature_scores',
                            df=normalized_df,
                            local_path='max_scaled_scores_feature_scores.parquet',
                            format='parquet')

    # Create feature count DataFrame
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [1 if x in selected_features_agg[test_name] else 0 for x in X.columns]
    result_matrix_df.loc[:, 'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(key='selected_features_count',
                        df=result_matrix_df,
                        local_path='selected_features_count.parquet',
                        format='parquet')

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = result_matrix_df[result_matrix_df.num_votes >= votes_needed].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(key='selected_features',
                        df=final_df,
                        local_path='selected_features.parquet',
                        format='parquet')
    
    # Creating a new feature vector containing only the identified top features
    if is_feature_vector and df_artifact.meta.spec.features and output_vector_name:

        # Selecting the top K features (by number of votes) from our feature matrix
        selected_features = result_matrix_df.nlargest(k, 'num_votes').index

        # Match the selected feature names to the FS Feature annotations
        matched_selections = [
            feature
            for feature in list(df_artifact.meta.spec.features)
            for selected in list(selected_features)
            if feature.endswith(selected)
        ]

        # Defining our new feature vector
        top_features_fv = fs.FeatureVector(output_vector_name,
                                           matched_selections,
                                           label_feature="labels.label",
                                           description='feature vector composed strictly of our top features')

        # Saving
        top_features_fv.save()
        fs.get_offline_features(top_features_fv, target=ParquetTarget())

        # Logging our new feature vector URI
        context.log_result('top_features_vector', top_features_fv.uri)
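
The model_filters docstring notes that a filter can also be given as formalized json (CLASS/FIT/META) or as a path to such a file. Based on the loading loop above (json.loads followed by instantiating META.class with the CLASS kwargs), a hedged sketch of such an entry could look like this; the LinearSVC parameters are chosen purely for illustration:

import json

model_filters = {
    # formalized json entry, handled by the json.loads branch of the loading loop
    "LinearSVC": json.dumps({
        "META": {"class": "sklearn.svm.LinearSVC"},
        "CLASS": {"C": 0.5, "max_iter": 5000},
        "FIT": {},
    }),
    # plain estimator name, resolved through sklearn's all_estimators()
    "ExtraTreesClassifier": "ExtraTreesClassifier",
}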
Example #5
def train_model(
    context: MLClientCtx,
    model_type: str,
    dataset: Union[DataItem, pd.core.frame.DataFrame],
    label_column: str = "labels",
    encode_cols: dict = {},
    sample: int = -1,
    imbal_vec=[],
    test_size: float = 0.25,
    valid_size: float = 0.75,
    random_state: int = 1,
    models_dest: str = "models",
    plots_dest: str = "plots",
    eval_metrics: list = ["error", "auc"],
    file_ext: str = "parquet",
    test_set: str = "test_set",
) -> None:
    """train an xgboost model.

    Note on imbalanced data:  the `imbal_vec` parameter represents the measured
    class representations in the sample and can be used as a first step in tuning
    an XGBoost model.  This isn't a hyperparameter, merely an estimate that should
    be held constant throughout the tuning process.

    :param context:           the function context
    :param model_type:        the model type to train, "classifier", "regressor"...
    :param dataset:           ("data") name of raw data file
    :param label_column:      ground-truth (y) labels
    :param encode_cols:       dictionary of names and prefixes for columns that are
                              to be one-hot encoded.
    :param sample:            selects the first n rows; if negative, a random sample
                              is taken instead
    :param imbal_vec:         ([]) vector of class weights seen in sample
    :param test_size:         (0.25) test set size
    :param valid_size:        (0.75) Once the test set has been removed the
                              training set gets this proportion.
    :param random_state:      (1) sklearn rng seed
    :param models_dest:       destination subfolder for model artifacts
    :param plots_dest:        destination subfolder for plot artifacts
    :param eval_metrics:      (["error", "auc"]) learning curve metrics
    :param file_ext:          format for test_set_key hold out data
    :param test_set:          (test_set) key of held out data in artifact store
    """
    models_dest = models_dest or "models"
    plots_dest = plots_dest or f"plots/{context.name}"

    raw, labels, header = get_sample(dataset, sample, label_column)

    if encode_cols:
        raw = pd.get_dummies(
            raw,
            columns=list(encode_cols.keys()),
            prefix=list(encode_cols.values()),
            drop_first=True,
        )

    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = get_splits(
        raw, labels, 3, test_size, valid_size, random_state)

    context.log_dataset(test_set,
                        df=pd.concat([xtest, ytest], axis=1),
                        format=file_ext,
                        index=False)

    model_config = _gen_xgb_model(model_type, context.parameters.items())

    XGBBoostClass = create_class(model_config["META"]["class"])
    model = XGBBoostClass(**model_config["CLASS"])

    model_config["FIT"].update({
        "X": xtrain,
        "y": ytrain.values,
        "eval_set": [(xtrain, ytrain), (xvalid, yvalid)],
        "eval_metric": eval_metrics,
    })

    model.fit(**model_config["FIT"])

    eval_metrics = eval_model_v2(context, xvalid, yvalid, model)

    model_bin = dumps(model)
    context.log_model(
        "model",
        body=model_bin,
        artifact_path=os.path.join(context.artifact_path, models_dest),
        model_file="model.pkl",
    )
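
Since the dataset parameter is typed as either a DataItem or a pandas DataFrame, a minimal local-call sketch might look like the following; the iris frame, the context name, and the metric choices are illustrative assumptions rather than part of the original example.

import mlrun
from sklearn.datasets import load_iris

# toy frame with the label column renamed to the function's default
df = load_iris(as_frame=True).frame.rename(columns={"target": "labels"})

ctx = mlrun.get_or_create_ctx("train-xgb-demo")
train_model(ctx,
            model_type="classifier",
            dataset=df,
            label_column="labels",
            eval_metrics=["merror", "mlogloss"])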