Example #1
    def permutation_importance(self,
                               test_sample,
                               test_target,
                               node_params=None,
                               **kwargs):
        """
        Calculates permutation importance of features.
        If node_params is not None, then plots graph weighted by permutation importance.

        :param test_sample: test feature subsample
        :param test_target: vector of targets for test sample
        :param node_params: mapping that describes which nodes should be highlighted by target or source type.
            ``node_params`` should be represented in the following form
            ```{
                    'lost': 'bad_target',
                    'passed': 'nice_target',
                    'onboarding_welcome_screen': 'source',
                }```
            If the mapping is not given, it will be constructed from the config
        :return: Nothing
        """
        self.show_quality_metrics(test_sample, test_target)
        if hasattr(self.mod, 'coef_'):
            self._plot_perm_imp(__LogRegWrapper__(self.mod.coef_[0]),
                                test_sample, node_params, **kwargs)
            return
        perm = PermutationImportance(self.mod, random_state=0).fit(
            test_sample, test_target)
        eli5.show_weights(perm,
                          feature_names=[
                              ' '.join(i) if type(i) == tuple else i
                              for i in test_sample.columns
                          ])
        self._plot_perm_imp(perm, test_sample, node_params, **kwargs)
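The docstring above shows the expected shape of node_params. A hypothetical call, assuming `model` is a fitted instance of the owning class and `test_sample`/`test_target` are already prepared, might look like this:

    # hypothetical usage sketch; `model`, `test_sample` and `test_target` are assumed to exist
    node_params = {
        'lost': 'bad_target',
        'passed': 'nice_target',
        'onboarding_welcome_screen': 'source',
    }
    model.permutation_importance(test_sample, test_target, node_params=node_params)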
Example #2
 def feature_importance_permutation(self, random_state=123):
     '''
     calculate and display feature importance using the permutation method
     must have eli5 installed
     '''
     # pass random_state by keyword; the second positional parameter of PermutationImportance is `scoring`
     perm = PermutationImportance(self.model, random_state=random_state).fit(X_val, y_val)
     return eli5.show_weights(perm, feature_names=X_val.columns.tolist())
Example #3
def feat_rank(clf_df):
    # display() is needed so the eli5 HTML report actually renders inside the loop
    from IPython.display import display

    for i, clf in clf_df.iterrows():
        classifier = clf['Classifier']
        p = classifier.fit(X_train, y_train)
        perm = PermutationImportance(p).fit(X_test, y_test)
        display(eli5.show_weights(perm))
Example #4
    def permutation_importance(self,
                               test_sample,
                               test_target,
                               node_params=None,
                               **kwargs):
        """
        Calculates permutation importance of features.
        If ``node_params`` is not `None`, then plots graph weighted by permutation importance.

        Parameters
        ----------
        test_sample: pd.DataFrame
            Test feature subsample.
        test_target: np.array
            Vector of targets for test sample.
        node_params: dict, optional
            Event mapping describing which nodes or edges should be highlighted by different colors for better visualisation. Dictionary keys are ``event_col`` values, and the values can be one of the following:
                - ``bad_target``: highlights node and all incoming edges with red color;
                - ``nice_target``: highlights node and all incoming edges with green color;
                - ``bad_node``: highlights node with red color;
                - ``nice_node``: highlights node with green color;
                - ``source``: highlights node and all outgoing edges with yellow color.
            Example ``node_params`` is shown below:
            ```
            {
                'lost': 'bad_target',
                'purchased': 'nice_target',
                'onboarding_welcome_screen': 'source',
                'choose_login_type': 'nice_node',
                'accept_privacy_policy': 'bad_node',
            }
            ```
            If ``node_params=None``, it will be constructed from ``retention_config`` variable, so that:
            ```
            {
                'positive_target_event': 'nice_target',
                'negative_target_event': 'bad_target',
                'source_event': 'source',
            }
            ```
            Default: ``None``
        """
        self.show_quality_metrics(test_sample, test_target)
        if hasattr(self.mod, 'coef_'):
            self._plot_perm_imp(__LogRegWrapper__(self.mod.coef_[0]),
                                test_sample, node_params, **kwargs)
            return
        perm = PermutationImportance(self.mod, random_state=0).fit(
            test_sample, test_target)
        eli5.show_weights(perm,
                          feature_names=[
                              ' '.join(i) if type(i) == tuple else i
                              for i in test_sample.columns
                          ])
        self._plot_perm_imp(perm, test_sample, node_params, **kwargs)
Example #5
def show_permutation_importance(model, val_X, val_y):
    '''
        Takes the model and dataframes for validation set (X and y)
        then calculates and shows the permutation importance.
        
        For more on permutation importance, check https://www.kaggle.com/dansbecker/permutation-importance
    '''
    import eli5
    from eli5.sklearn import PermutationImportance

    perm = PermutationImportance(model, random_state=1).fit(val_X, val_y)
    eli5.show_weights(perm, feature_names=val_X.columns.tolist())
Example #6
def p_importances(permuter, feature_names):
    permutation_importances = eli5.show_weights(
        permuter,
        top=None,
        feature_names=feature_names
    )
    return permutation_importances
Example #7
 def apply(self, data: 'Chapter') -> 'Chapter':
     base_score, score_decreases = get_score_importances(score_func, X, y)
     feature_importances = np.mean(score_decreases, axis=0)
     self.permutation_weights = show_weights(
         self.permutation_importances,
         feature_names=recipe.dataset.columns.keys())
     return data
Example #8
def plot_score(clf, X_test, y_test, feat_to_show=30, is_normalize=False, cut_off=0.5):
    # compute the binarized predictions once instead of rebuilding them for every metric
    y_pred = pd.Series(clf.predict_proba(X_test)[:, 1]).apply(lambda x: 1 if x > cut_off else 0)

    # sklearn metrics take (y_true, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print('ROC_AUC:  ', roc_auc_score(y_test, y_pred))
    print('Gini:     ', 2 * roc_auc_score(y_test, y_pred) - 1)
    print('F1_score: ', f1_score(y_test, y_pred))
    print('Log_loss: ', log_loss(y_test, clf.predict_proba(X_test)))

    print('\n')
    print('Classification_report: \n', classification_report(y_test, y_pred))
    skplt.metrics.plot_confusion_matrix(y_test, y_pred, title="Confusion Matrix",
                                        normalize=is_normalize, figsize=(8, 8), text_fontsize='large')

    imp = pd.DataFrame(list(zip(X_test.columns, clf.feature_importances_)))
    imp = imp.reindex(imp[1].abs().sort_values().index).set_index(0)
    imp = imp[-feat_to_show:]
    # feature importance plot
    ax = imp.plot.barh(width=.6, legend="", figsize=(12, 10))
    ax.set_title("Feature Importances", y=1.03, fontsize=16.)
    _ = ax.set(frame_on=False, xlabel="", xticklabels="", ylabel="")
    for i, labl in enumerate(list(imp.index)):
        score = imp.loc[labl][1]
        ax.annotate('%.2f' % score, (score + (-.12 if score < 0 else .02), i - .2), fontsize=10.5)
    try:
        display(eli5.show_weights(clf, top=20, feature_names=list(X_test.columns)))
    except Exception:
        pass
Example #9
def randFor(rData, lData):
    randClass = RandomForestClassifier(n_estimators = 100)
    
    respTrain, respTest, labTrain, labTest = train_test_split(rData, lData, random_state=1)    
    
    vect = TfidfVectorizer(min_df=1, max_df=1.0, stop_words='english')        
    respTrainVec = vect.fit_transform(respTrain)    

    # To be commented for Pickle Building of Vectorizer
    respTestVec = vect.transform(respTest)
    
    randClass.fit(respTrainVec, labTrain)        
    # To be commented for Pickle Building of Rand Class Model
    labPredClass = randClass.predict(respTestVec)                                 
    
    #display(HTML(eli5.show_weights(randClass, top=5)))
    #print type(eli5.explain_prediction(randClass, respTest[0], vec=vect, target_names=targetList))
    #tDF = eli5.explain_prediction_df(randClass)
    #tDF1 = eli5.show_weights(randClass, vec=vect, target_names=targetList)
    #print type(eli5.show_prediction(randClass, respTest[0], vec=vect, target_names=targetList))     
    
    # Explain the Weights of this Estimator ----------------------------------
    # print(eli5.explain_weights(randClass))
    print(eli5.format_as_dataframes(eli5.explain_weights(randClass)))
    print(respTest[0])
    #prediction = eli5.explain_prediction (randClass, respTest[0], vec=vect, target_names=targetList, top=5)
    #weigths = eli5.explain_prediction (randClass, respTest[0], vec=vect, target_names=targetList, top=5)
    #print ( eli5.format_as_dataframes( weigths ) )
    
    # Modify to return specific class types
    return (metrics.accuracy_score(labTest, labPredClass))
Example #10
 def publish(self, recipe):
     base_score, score_decreases = get_score_importances(score_func, X, y)
     feature_importances = np.mean(score_decreases, axis=0)
     from eli5 import show_weights
     self.permutation_weights = show_weights(
             self.permutation_importances,
             feature_names = recipe.dataset.columns.keys())
     return self
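Examples #7 and #10 call get_score_importances with score_func, X, and y that are defined elsewhere in their classes. A self-contained sketch of the same call, using a hypothetical dataset and score function, could look like this:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from eli5.permutation_importance import get_score_importances

# hypothetical data and model, only to make the call above concrete
X = np.random.rand(200, 4)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

def score_func(X, y):
    # accuracy of the already-fitted model on (possibly shuffled) data
    return model.score(X, y)

base_score, score_decreases = get_score_importances(score_func, X, y)
feature_importances = np.mean(score_decreases, axis=0)
print(base_score, feature_importances)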
Example #11
def permutation_importance(X_train,
                           y_train,
                           X_test,
                           y_test,
                           model,
                           rand_state=1):

    # What this function does is apply a technique for estimating feature importance
    # that works as follows: within the validation set (with an already trained model),
    # the features are shuffled one at a time (like a deck of cards), predictions are made
    # again, and the drop in performance is measured. The larger the drop, the more
    # important that feature was.

    perm = PermutationImportance(model, random_state=rand_state).fit(
        X_test, y_test
    )  # Create a PermutationImportance instance (applied only to the test set)
    return eli5.show_weights(perm, feature_names=X_test.columns.tolist()
                             )  # Return the feature importances for display
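The comment above describes the idea behind permutation importance: shuffle one column at a time and measure the drop in score. A minimal sketch of the same computation without eli5, assuming a fitted scikit-learn estimator and a pandas DataFrame of test features, might look like this:

import numpy as np

def manual_permutation_importance(model, X_test, y_test, n_iter=5, random_state=1):
    # shuffle each column in turn and measure how much the score drops on average
    rng = np.random.RandomState(random_state)
    base_score = model.score(X_test, y_test)
    importances = {}
    for col in X_test.columns:
        drops = []
        for _ in range(n_iter):
            shuffled = X_test.copy()
            shuffled[col] = rng.permutation(shuffled[col].values)
            drops.append(base_score - model.score(shuffled, y_test))
        importances[col] = float(np.mean(drops))
    return importances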
Example #12
def permutation_importance(my_model, val_X, val_y, ret_df=False):
    import eli5
    from eli5.sklearn import PermutationImportance
    perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
    if ret_df:
        return eli5.explain_weights_df(perm, feature_names=val_X.columns.tolist())
    else:
        return eli5.show_weights(perm, feature_names=val_X.columns.tolist())
Example #13
def feature_importance(create_model):
    t_model = KerasClassifier(build_fn=create_model,
                              epochs=EPOCHS,
                              batch_size=5,
                              verbose=0)
    t_model.fit(X_train, Y_train)
    perm = PermutationImportance(t_model, random_state=1).fit(X_train, Y_train)
    display(eli5.show_weights(perm, feature_names=featureNames))
Example #14
def load_and_feature_analysis():

    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
    import xgboost

    ################### eli5
    import eli5  # pip install eli5
    from eli5.sklearn import PermutationImportance

    #featrues_filename = 'features_3_sec.csv'       # Test Accuracy: 0.90224
    #featrues_filename = 'data_adv_3_sec_no_var_hccho.csv'  # mfcc_var was accidentally left out of this file. Test Accuracy: 0.96663
    featrues_filename = 'data_adv_3_sec_hccho.csv'  # Test Accuracy: 0.95762

    data = pd.read_csv(f'{general_path}/{featrues_filename}')
    data = data.iloc[0:, 1:]  # the first column is the file name, so drop it
    print(data.shape, data.head(5))

    y = data['label']  # genre variable.
    X = data.loc[:, data.columns !=
                 'label']  #select all columns but not the labels

    #### NORMALIZE X ####

    # Normalize so everything is on the same scale.

    cols = X.columns
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)  # return numpy array (9990,58)

    # new data frame with the new scaled data.
    X = pd.DataFrame(np_scaled, columns=cols)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)  # Frame in Frame out

    ########## model load ##############
    xgb_classifier = pickle.load(open("my_xgb_model.pkl", "rb"))
    preds = xgb_classifier.predict(
        X_test)  # array(['hiphop', 'jazz', 'blues', ....],dtype=object)

    print('Accuracy', ':', round(accuracy_score(y_test, preds), 5), '\n')

    # feature F score. Feature names are shown because training used a pandas DataFrame;
    # if the data had been passed as a numpy array, no feature names would appear.
    xgboost.plot_importance(xgb_classifier)
    plt.show()

    #######   eli5 PermutationImportance  #################
    perm = PermutationImportance(estimator=xgb_classifier, random_state=1)
    perm.fit(X_test, y_test)

    # the returned values are changes in accuracy; since the shuffling is repeated, there is a +/- spread
    weights = eli5.show_weights(estimator=perm,
                                feature_names=X_test.columns.tolist()
                                )  # weights.data is a string containing HTML
    with open('Permutation_Importance.htm', 'wb') as f:
        f.write(weights.data.encode("UTF-8"))
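If a tabular result is preferred over the HTML report, eli5 also offers explain_weights_df (used in Example #12). A minimal follow-up sketch, reusing `perm` and `X_test` from the example above:

    # reuses `perm` and `X_test` defined in the example above
    importance_df = eli5.explain_weights_df(perm, feature_names=X_test.columns.tolist())
    importance_df.to_csv('Permutation_Importance.csv', index=False)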
Example #15
def assess_best_features(cfg,
                         granularity,
                         label_col,
                         algorithm_code=ALGO_CODE_EXTRA_TREE):

    logging.debug(
        "Assess best features for %s, on %s level, using algorithm_code %s" %
        (granularity, label_col, algorithm_code))

    data_cfg = cfg.get_data_sample_config(label_col=label_col,
                                          max_items_per_class=None)
    # classifier_cfg = cfg.get_classifier_algorithm_config(algorithm_code)

    pipe = make_classifier_training_pipeline(
        cfg, get_classifier_algorithm_config(algorithm_code))

    #
    # prepare the data
    X = data_cfg.df
    y = data_cfg.df[label_col]

    #
    # fit the pipeline
    pipe.fit(X, y)

    # get the classifier instance
    forest_clf = pipe.named_steps["classifier"].named_steps[algorithm_code]

    # get feature union (that includes transformers)
    fe = pipe.named_steps["data_preprocessor"].named_steps["feature_union"]

    feature_names = _get_feature_names(fe)
    feature_importances = forest_clf.feature_importances_
    indices = np.argsort(feature_importances)[::-1]

    eli5_output = eli5.show_weights(forest_clf,
                                    feature_names=feature_names,
                                    top=100)
    # eli5_output = eli5.show_prediction(forest_clf, X.iloc[5], feature_names=feature_names, top=10, show_feature_values=True)
    from IPython.core.display import display
    display(eli5_output)

    # compute standard deviation
    std = np.std(
        [tree.feature_importances_ for tree in forest_clf.estimators_], axis=0)

    # Print the feature ranking
    print("Feature ranking:")
    for i in range(100):
        print("%d. feature %d, %s, (%f, std:%f)" %
              (i + 1, indices[i], feature_names[indices[i]],
               feature_importances[indices[i]], std[indices[i]]))

    print("\nAverage standard deviation: %s" % np.average(std))

    return indices, feature_names, feature_importances, std
Example #16
    def show_weights(self, **kwargs):
        """
        Call :func:`eli5.show_weights` for the locally-fit
        classification pipeline. Keyword arguments are passed
        to :func:`eli5.show_weights`.

        :func:`fit` must be called before using this method.
        """
        self._fix_target_names(kwargs)
        return eli5.show_weights(self.clf_, vec=self.vec_, **kwargs)
Example #17
def test_show_weights():
    clf = LogisticRegression()
    X = [[0, 0], [1, 1], [0, 1]]
    y = ['a', 'b', 'a']
    clf.fit(X, y)

    html = eli5.show_weights(clf)
    # write_html(clf, html.data, '')
    assert isinstance(html, HTML)
    assert 'y=b' in html.data
    assert 'Explained as' not in html.data

    # explain_weights arguments are supported
    html = eli5.show_weights(clf, target_names=['A', 'B'])
    assert 'y=B' in html.data

    # format_as_html arguments are supported
    html = eli5.show_weights(clf, show=['method'])
    assert 'y=b' not in html.data
    assert 'Explained as' in html.data
Example #18
 def explain_model(model, X_train):
     return show_weights(
         model,
         feature_names=X_train.columns.tolist(),
         show=(
             "method",
             "description",
             "transition_features",
             "targets",
             "feature_importances",
         ),
     )
Example #19
def print_permutation_importance(fitted_model, data_model: tuple, random_state=1):

	if len(data_model) == 4:
		X_train, X_test, y_train, y_test = data_model
	elif len(data_model) == 2:
		X_test, y_test = data_model
	else:
		# raising a plain string is invalid in Python 3; raise a proper exception instead
		raise ValueError("the length of data_model is neither 4 nor 2")

	feature_names = X_test.columns.tolist()
	perm = PermutationImportance(fitted_model, random_state=random_state).fit(X_test, y_test)
	display(eli5.show_weights(perm, feature_names=feature_names))
Example #20
    def describe(self, model_name, model, data_dict):
        feature_names = list(data_dict['x_train'].columns)
        test_observation = data_dict['x_test'].iloc[0]
        explained_weights = eli5.show_weights(
            model,
            feature_names=feature_names,
            show=['targets', 'transition_features', 'feature_importances'])
        explained_prediction = eli5.show_prediction(model, test_observation)

        return {
            "Weights explanation": explained_weights,
            "Predictions explanation": explained_prediction
        }
Example #21
def show_model_permutations(X_test=None, y_test=None, lin_model=None):
    """Appends permutation importance for the labels on one model to a list.
    Prints when called with no model.
    """
    if lin_model is None:
        for display_weights in weights:
            display(display_weights)
        return

    perm_importance = calc_model_perm_importance(X_test, y_test, lin_model)
    weights.append(
        eli5.show_weights(perm_importance,
                          feature_names=X_test.columns.tolist()))
Example #22
def data_dictionary():
    st.title('What data do I need for risk estimation?')
    st.write(
        "Here is the data dictionary table. You may refer to it to understand what each variable means."
    )
    st.image(
        "https://github.com/yaliu0703/yaliu0703.github.io/blob/master/images/data%20dictionary.png?raw=true",
        use_column_width=True)
    html_object1 = eli5.show_weights(model,
                                     feature_names=list(X_train.columns),
                                     top=100)
    raw_html1 = html_object1._repr_html_()
    components.v1.html(raw_html1, height=500, scrolling=True)
Example #23
def trainAndTestCRF():
	data = pd.read_csv(constants.NER_DATASET_DIR).fillna(method='ffill')
	getter = sg.SentenceGetter(data)
	sentences = getter.sentences
	X = [ft.sent2features(s) for s in sentences]
	y = [ft.sent2labels(s) for s in sentences]
	labels = ['O', 'PATH', 'DIS', 'SYMP']
	print(labels)
	crf = sklearn_crfsuite.CRF(
		algorithm='lbfgs',
		max_iterations=100,
		all_possible_transitions=False
	)
	params_space = {
		'c1': scipy.stats.expon(scale=0.5),
		'c2': scipy.stats.expon(scale=0.05),
	}
	# use the same metric for evaluation
	f1_scorer = make_scorer(metrics.flat_f1_score,
							average='weighted', labels=labels)
	# search
	rs = RandomizedSearchCV(crf, params_space,
							cv=3,
							verbose=1,
							n_jobs=-1,
							n_iter=50,
							scoring=f1_scorer)
	rs.fit(X, y)
	print('best params:', rs.best_params_)
	print('best CV score:', rs.best_score_)
	print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
	crf = rs.best_estimator_
	y_pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
	print(metrics.flat_classification_report(
		y, y_pred, labels=sorted(labels), digits=3
	))

	if not os.path.exists(constants.CRF_DIR):
		os.makedirs(constants.CRF_DIR)
	with open(os.path.join(constants.CRF_DIR, 'weights_optimized.html'), 'w+', encoding='utf-8') as f:
		f.write(eli5.show_weights(crf, top=30).data)

	testdata = pd.read_csv(constants.TESTDATA_DIR)
	getter = sg.SentenceGetter(testdata)
	sentences = getter.sentences
	X = [ft.sent2features(s) for s in sentences]
	y = [ft.sent2labels(s) for s in sentences]
	y_pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
	print(metrics.flat_classification_report(
		y, y_pred, labels=sorted(labels), digits=3
	))
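As a follow-up, the CRF explanation can be restricted to particular report sections with the same `show` argument that appears in Examples #18 and #20. A minimal sketch, reusing `crf`, `os` and `constants.CRF_DIR` from the example above, that saves only the label transition weights:

	# reuses `crf`, `os` and `constants.CRF_DIR` from the example above
	transitions_html = eli5.show_weights(crf, show=['transition_features'])
	with open(os.path.join(constants.CRF_DIR, 'transitions_only.html'), 'w+', encoding='utf-8') as f:
		f.write(transitions_html.data)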
Example #24
    def predict(algo, zscoreX, zscoreY):
        # print algo, accuracy, r-squared
        print(algo)
        print('with zScore on X: ' + str(zscoreX) + ' and Y: ' + str(zscoreY))

        # different ZScore options
        if zscoreX:
            x = np.copy(scaled_x)
        else:
            x = np.copy(original_x)

        if zscoreY:
            y = np.copy(scaled_y)
        else:
            y = np.copy(original_y)

        model = algo()
        if algo == RandomForestRegressor:
            model = algo(n_estimators=150)
        model.fit(x[:dividing_line], y[:dividing_line])

        y_predict = model.predict(x[dividing_line:])

        # Alibi Explainable AI, only has Classifier support
        # we are going to use ELI5
        # ELI5 is typically used in a notebook, but we can export it as HTML

        swdoc = open('swdoc_' + grade + '.html', 'w')
        swdoc.write(show_weights(model).data)
        swdoc.close()
        spdoc = open('spdoc_' + grade + '.html', 'w')

        # we are remembering our prediction for every school, while it is in the test data
        myi = 0
        for school in y_predict:
            schools_to_predict[order_of_schools[myi]] = int(1000 *
                                                            float(school))
            myi += 1

            if myi % 60 == 0:
                # write an explanation for roughly one in every 60 predictions
                spdoc.write(
                    show_prediction(model,
                                    x[dividing_line + myi],
                                    show_feature_values=True).data)
        spdoc.close()

        # evaluation of test data
        print(explained_variance_score(y[dividing_line:], y_predict))
        print(r2_score(y[dividing_line:], y_predict))
Example #25
 def eli5_perm_importance(self, **kwargs):
     """
     Returns
     -------
     permutation importance implemented in ELI5
     """
     try:
         perm = PermutationImportance(self.model).fit(
             self.x_train, self.y_train)
         return eli5.show_weights(
             perm, feature_names=self.x_train.columns.tolist(), **kwargs)
     except AttributeError as err:
         err_logging(err)
         raise AttributeError(err)
Example #26
    def permutation_importance(self, df_inputs, df_outputs, **kwargs):
        xx = self.interpret_preproc.transform(
            df_inputs[self.categorical_inputs + self.numeric_inputs])
        yy = df_outputs

        perm = eli5.sklearn.PermutationImportance(self.full_model,
                                                  **kwargs).fit(xx, yy)

        display(
            eli5.show_weights(perm,
                              feature_names=self.categorical_inputs +
                              self.numeric_inputs))

        return perm
Example #27
def eli5_features(test, pipeline):

    clf = pipeline.named_steps["clf"]
    vec = pipeline.named_steps["vec"]
    transformer = Pipeline(pipeline.steps[:-1])

    with open("eli5_weights.html", "w") as f:
        f.write(eli5.show_weights(clf, vec=vec, top=50).data)

    with open("eli5_prediction.html", "w") as f:
        f.write(
            eli5.show_prediction(clf,
                                 transformer.transform(test),
                                 feature_names=vec.get_feature_names()).data)
Example #28
def calculate_feature_importance(model, x_test, y_test):
    """
    :param model: a fitted model whose permutation feature importance will be computed
    :param x_test: a dataframe with the test data features
    :param y_test: the test data target feature
    :return: an eli5 HTML object ranking the features by permutation importance
    """
    start_time = datetime.now()

    model_name = type(model).__name__

    perm = PermutationImportance(model, random_state=2019).fit(x_test, y_test)
    time_elapsed = datetime.now() - start_time

    print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))

    return eli5.show_weights(perm, feature_names=x_test.columns.tolist())
Example #29
    def get_permutation_importance(self):
        # calculating permutation importance
        perm = PermutationImportance(self.model,
                                     random_state=1).fit(self.x, self.y)

        # saving Permutation Importance table
        html_str = eli5.show_weights(perm,
                                     feature_names=list(self.x.columns)).data
        html_file = open(self.out + "/permutation_importance.html", "w")
        html_file.write(html_str)
        html_file.close()

        # creating data frame for weights and features
        PI = pd.DataFrame(perm.feature_importances_, columns=["Weights"])
        PI["Features"] = list(self.x.columns)
        PI = PI.sort_values("Weights", ascending=False)
        return PI
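PermutationImportance also exposes the spread of scores across shuffles as `feature_importances_std_`. A hypothetical variation of the table construction inside the method above, reusing `perm` and `self.x` (this is an illustrative sketch, not part of the original code):

        # hypothetical variation of the table built above, adding the std across shuffles
        PI = pd.DataFrame({
            "Features": list(self.x.columns),
            "Weights": perm.feature_importances_,
            "Std": perm.feature_importances_std_,
        }).sort_values("Weights", ascending=False)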
Example #30
def train_and_predict(model,
                      X_train,
                      y_train,
                      X_test,
                      site_feature_names=vectorizer.get_feature_names(),
                      new_feature_names=None,
                      cv=time_split,
                      scoring='roc_auc',
                      top_n_features_to_show=30,
                      submission_file_name='submission.csv'):

    cv_scores = cross_val_score(model,
                                X_train,
                                y_train,
                                cv=cv,
                                scoring=scoring,
                                n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)

    if new_feature_names:
        all_feature_names = site_feature_names + new_feature_names
    else:
        all_feature_names = site_feature_names

    display_html(
        eli5.show_weights(estimator=model,
                          feature_names=all_feature_names,
                          top=top_n_features_to_show))

    if new_feature_names:
        print('New feature weights:')

        print(
            pd.DataFrame({
                'feature':
                new_feature_names,
                'coef':
                model.coef_.flatten()[-len(new_feature_names):]
            }))

    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name)

    return cv_scores