Exemplo n.º 1
0
class MyCalibrator:
    """Recalibrate a fitted probabilistic classifier through a 1-D score.

    ``base_estimator`` must expose ``classes_`` and ``predict_proba``.  Its
    class probabilities are collapsed into one score per sample (the
    probability-weighted class position ``0 .. K-1``); a ``LogisticAT``
    model with ``alpha=0`` then maps that score back to class probabilities.
    """

    def __init__(self, base_estimator):
        self.base_estimator = base_estimator

    def fit(self, X, y):
        """Fit the recalibration mapper on the scores of ``X``; return self."""
        scores = self.predict(X)
        mapper = LogisticAT(alpha=0)
        self.recalibration_mapper = mapper.fit(scores.reshape(-1, 1), y)
        return self

    def predict(self, X, z=None):
        """Return the probability-weighted class position for each row of ``X``.

        ``z`` is forwarded to ``base_estimator.predict_proba`` only when it
        is not None, so estimators without a ``z`` argument keep working.
        """
        n_classes = len(self.base_estimator.classes_)
        extra = {} if z is None else {'z': z}
        proba = self.base_estimator.predict_proba(X, **extra)
        return np.sum(proba * np.arange(n_classes), axis=1)

    def predict_proba(self, X, z=None):
        """Return the recalibrated class probabilities for ``X``."""
        scores = self.predict(X, z=z)
        return self.recalibration_mapper.predict_proba(scores.reshape(-1, 1))
def general_explanation_using_skater(all_roles_scores, labels_training_set,
                                     labels_test_set, df_train_set,
                                     df_test_set, alpha):
    '''
    Compute global feature importances for a LogisticAT model.

    The labels are integer-encoded, a LogisticAT model is fitted on the
    training frame, and the feature importances are computed through the
    Interpretation / InMemoryModel objects.

    ----------------------------------------------------------------
    Params:
        all_roles_scores = every mark present in the train and test sets
                           (used to fit the label encoder)
        labels_training_set = marks of the training rows
        labels_test_set = marks of the test rows
        df_train_set = training features (DataFrame)
        df_test_set = test features
        alpha = regularisation strength passed to LogisticAT

    Returns:
        whatever feature_importance() returns, computed with ascending=True
    '''
    encoder = preprocessing.LabelEncoder()
    encoder.fit(all_roles_scores)
    encoded_train = encoder.transform(labels_training_set)
    # The encoded test labels are not used below; the call is kept so the
    # function behaves exactly as before.
    encoder.transform(labels_test_set)

    ordinal_model = LogisticAT(alpha=alpha)
    ordinal_model.fit(df_train_set.values, encoded_train)
    # Same remark: the test predictions are computed but not used.
    ordinal_model.predict(df_test_set)

    feature_names = list(df_train_set.columns)
    interpreter = Interpretation(df_train_set, feature_names=feature_names)

    # Only the first ten training rows are handed over as examples.
    wrapped_model = InMemoryModel(ordinal_model.predict_proba,
                                  examples=df_train_set[:10])

    return interpreter.feature_importance.feature_importance(wrapped_model,
                                                             ascending=True)
Exemplo n.º 3
0
def model_train(features, target, model='mordcat'):
    """Grid-search an estimator with 5-fold cross-validation and return it.

    Parameters
    ----------
    features : training features, passed straight to ``GridSearchCV.fit``.
    target : training labels.
    model : ``'mordcat'`` for a ``LogisticAT`` searched over ``alpha``, or
        ``'catboost'`` for a ``CatBoostClassifier`` searched over
        ``l2_leaf_reg``.

    Returns
    -------
    The fitted ``GridSearchCV`` object.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.  Previously this
        case failed later with an unbound ``param_grid``.
    """
    if model == 'mordcat':
        estimator = LogisticAT()
        param_grid = {
            'alpha': [0, 0.01, 0.05, 0.1, 0.3, 0.8, 1, 1.3],
            'max_iter': [100]
        }
    elif model == 'catboost':
        estimator = CatBoostClassifier(loss_function='MultiClass',
                                       eval_metric='TotalF1')
        param_grid = {'l2_leaf_reg': [0.5, 1.0, 2.0, 3.0, 4.5, 5.0]}
    else:
        raise ValueError(
            "unknown model {!r}; expected 'mordcat' or 'catboost'".format(
                model))

    model_ordinal = GridSearchCV(estimator,
                                 cv=5,
                                 param_grid=param_grid,
                                 n_jobs=4)
    model_ordinal.fit(features, target)
    print("Model trained")
    print(model_ordinal.best_params_)

    return model_ordinal
Exemplo n.º 4
0
                # Standardise both splits with statistics taken from the
                # training split only.
                # NOTE(review): a constant training column gives Xstd == 0 and
                # the division then yields inf/NaN - confirm such columns are
                # removed upstream.
                Xmean = Xtr.mean(axis=0)
                Xstd = Xtr.std(axis=0)
                Xtr = (Xtr - Xmean) / Xstd
                Xte = (Xte - Xmean) / Xstd

                # Choose the estimator, its hyper-parameter grid and the
                # selection metric for this outcome family.
                if family == 'binomial':
                    model = LogisticRegression(penalty='l2',
                                               class_weight='balanced',
                                               random_state=random_state + 1,
                                               max_iter=1000)
                    model_params = {'C': [0.1, 1, 10, 100]}
                    metric = 'balanced_accuracy'
                elif family == 'ordinal':
                    model = LogisticAT(max_iter=1000)
                    model_params = {'alpha': [0.01, 0.1, 1, 10]}
                    metric = 'balanced_accuracy'
                else:
                    raise NotImplementedError(family)

                # Run the grid search only when no parameters are cached yet
                # for this (X_type, y_type, cvi) key, then cache the winners.
                if params.get((X_type, y_type, cvi)) is None:
                    model = GridSearchCV(model,
                                         model_params,
                                         scoring=metric,
                                         n_jobs=n_jobs,
                                         refit=True,
                                         cv=Ncv)
                    model.fit(Xtr, ytr)
                    params[(X_type, y_type, cvi)] = model.best_params_
def locals_explanation_using_shap(mode, all_score, labels_training_set,
                                  labels_test_set, a, train_set, test_set,
                                  position, integral_test_set):
    '''
    Fit a LogisticAT model and explain each test prediction with SHAP values.

    :param mode: 'save' computes the SHAP values and pickles them; any other
                 value loads the previously pickled values instead
    :param all_score: all the scores from train set and test set (used to fit
                      the label encoder)
    :param labels_training_set: labels of the training rows
    :param labels_test_set: labels of the test rows
    :param a: alpha parameter for mord ordinal regression
    :param train_set: training features (DataFrame)
    :param test_set: test features
    :param position: string inserted into the name of the pickle file
    :param integral_test_set: test set without robust scaler application
    :return:
            shap explainer
            list of shap values
            list of predictions from test set (encoded)
            list of real prediction from test set (decoded labels)
            list of explanations, one dict per prediction mapping a column
            name to that row's value in integral_test_set
    '''
    encoder = preprocessing.LabelEncoder()
    encoder.fit(all_score)
    encoded_train = encoder.transform(labels_training_set)
    # The encoded test labels are not used below; the call is kept so the
    # function behaves exactly as before.
    encoder.transform(labels_test_set)

    ordinal_model = LogisticAT(alpha=a)
    ordinal_model.fit(train_set.values, encoded_train)
    predictions = ordinal_model.predict(test_set)
    real_predictions = encoder.inverse_transform(predictions)

    # The explainer is built in both modes because it is part of the result.
    explainer = shap.KernelExplainer(ordinal_model.predict_proba, train_set)

    if mode == 'save':
        shap_values = explainer.shap_values(test_set)
        with open("mord_shap_values_" + position + "without_ratings.txt",
                  "wb") as fp:
            pickle.dump(shap_values, fp)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load files this function wrote itself.
        with open("mord_shap_values_" + position + "without_ratings.txt",
                  "rb") as fp:
            shap_values = pickle.load(fp)

    list_of_explanation = []
    for row, predicted_class in enumerate(predictions):
        # SHAP contributions towards the class that was actually predicted.
        contributions = shap_values[predicted_class][row, :]

        # Keep the positive contributions, strongest first.
        positive = [(col, weight)
                    for col, weight in enumerate(contributions) if weight > 0]
        ranked = sorted(positive, key=lambda item: item[1], reverse=True)

        # Report the unscaled feature value of every contribution >= 0.01.
        explanation = {}
        for col, weight in ranked:
            if weight >= 0.01:
                explanation[train_set.columns[col]] = \
                    integral_test_set.iloc[row, col]
        list_of_explanation.append(explanation)

    return explainer, shap_values, predictions, real_predictions, list_of_explanation
def trainordinalregressor(df, listMarks, listRoles, path):
    """
    Train a mord ordinal regression model for every (newspaper, role) pair.

    For each pair the matching rows of ``df`` are label-encoded, stripped of
    the columns listed in the module-level ``toRemove`` and
    ``toRemoveRatings``, robust-scaled, and used to grid-search ``alpha`` for
    a ``LogisticAT`` model with a 2-fold stratified split.  The model with the
    best ``alpha`` is evaluated with cross-validated predictions, fitted on
    all rows of the pair and dumped to ``path``.

    Parameters
    ----------
    df : the dataset of player features.
    listMarks: the name of the field of the marks we want to train the different model
    listRoles: list of roles name ['A', 'C', 'D', 'P']
    path: where to store the models (used as a plain string prefix)

    Returns
    -------
    results dictionaries:
        distributionPerNewspaper: a dictionary that for each newspaper has the true values and predicted values
        resultPerRole: a dictionary that summarises for each newspaper and role some prediction metrics
    """
    resultPerRole = {}
    distributionPerNewspaper = {}
    # for each newspaper
    for newspaper in listMarks:
        distributionPerNewspaper[newspaper] = {'true': [], 'pred': []}
        # for each role
        for role in listRoles:
            if newspaper != 'fantacalcio_score':
                role_column = 'player_role_newspaper'
            else:
                role_column = 'player_role_fantacalcio'
            # Copy the selection: the columns are overwritten below and must
            # not be written through a slice of the caller's frame.
            subDF = df[df[role_column] == role].copy()

            # extract and transform categorical values
            # NOTE(review): the team encoder is fitted on the opponent names
            # only, so transform() raises for a club that never appears as an
            # opponent in this subset - confirm that cannot happen.
            le_teams = preprocessing.LabelEncoder()
            subDF['contextual_against_club_name'] = le_teams.fit_transform(
                subDF['contextual_against_club_name'])
            subDF['contextual_club_name'] = le_teams.transform(
                subDF['contextual_club_name'])
            le_country = preprocessing.LabelEncoder()
            subDF['country'] = le_country.fit_transform(subDF['country'])

            # Drop the rows carrying these specific corriere marks.
            if newspaper == 'corriere_score':
                subDF = subDF[subDF['corriere_score'] != 10]
                if role == 'D':
                    subDF = subDF[subDF['corriere_score'] != 8]
                    subDF = subDF[subDF['corriere_score'] != 3.5]
                elif role == 'P':
                    subDF = subDF[subDF['corriere_score'] != 9]

            # extract and encode labels
            le = preprocessing.LabelEncoder()
            target = le.fit_transform(subDF[newspaper])

            for el in toRemove:
                del subDF[el]

            # uncomment to train without contextual variables
            # for el in toRemoveWithoutContextual:
            # del subDF[el]

            # comment out to keep the ratings variables
            for el in toRemoveRatings:
                del subDF[el]

            # uncomment for only contextual variables
            # subDF = subDF[toRemoveWithoutContextual]

            stringMatch = newspaper + '_' + role
            resultPerRole[stringMatch] = {}
            print(stringMatch)

            # rescale with the robust scaler
            features = preprocessing.RobustScaler().fit_transform(subDF)

            # kfold definition
            kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=17)

            # The grid search maximises the Pearson r score.
            rscore = make_scorer(pearsonr_fun, greater_is_better=True)
            svr = GridSearchCV(
                LogisticAT(),
                scoring=rscore,
                cv=kfold,
                param_grid={
                    'alpha':
                    [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
                },
                verbose=1)

            svr.fit(features, target)

            print("Best Score: {}".format(svr.best_score_))
            print("Best params: {}".format(svr.best_params_))

            resultPerRole[stringMatch]['r'] = svr.best_score_

            model_ordinal = LogisticAT(alpha=svr.best_params_['alpha'])

            y_pred = cross_val_predict(model_ordinal,
                                       features,
                                       target,
                                       cv=kfold)

            resultPerRole[stringMatch]['RSME'] = math.sqrt(
                mean_squared_error(le.inverse_transform(y_pred),
                                   le.inverse_transform(target)))
            resultPerRole[stringMatch]['Accuracy'] = acc_fun(target, y_pred)
            resultPerRole[stringMatch]['KS'] = ks_fun(target, y_pred, le)
            resultPerRole[stringMatch]['r2'] = r2_fun(target, y_pred)

            le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
            print(le_name_mapping)

            print(Counter(target).keys())
            print(Counter(target).values())

            # cross_val_predict only fits clones of the estimator, so fit the
            # estimator itself on all rows; otherwise an unfitted model would
            # be written to disk.
            model_ordinal.fit(features, target)

            # NOTE(review): the file name contains the role only, so each
            # newspaper overwrites the models of the previous one - confirm
            # this is intended before changing the name.
            path_inserted = path + 'mord_' + role + '.joblib'
            dump(model_ordinal, path_inserted)

            distributionPerNewspaper[newspaper]['true'].append(
                le.inverse_transform(target))
            distributionPerNewspaper[newspaper]['pred'].append(
                le.inverse_transform(y_pred))
    return distributionPerNewspaper, resultPerRole
Exemplo n.º 7
0
        # Split the row labels of df_gender into train/test parts, stratified
        # by y_age, and store both index lists as CSV files.
        idx_train, idx_test = train_test_split(
            df_gender.reset_index()['index'],
            test_size=test_size,
            stratify=y_age,
            random_state=seed)

        pd.DataFrame(idx_train).to_csv(
            os.path.join('Results_' + str(seed), 'train_' + sex + '.csv'))

        pd.DataFrame(idx_test).to_csv(
            os.path.join('Results_' + str(seed), 'test_' + sex + '.csv'))

        # Ordinal logistic regression of grading on age, without
        # regularisation (alpha=0), fitted on the training rows only.

        model_ordinal = LogisticAT(alpha=0)

        df_gender_train = df_gender.loc[idx_train]

        model_ordinal.fit(df_gender_train[['age']].astype(int),
                          df_gender_train['grading'].astype(int))

        # Predict for every row of df_gender (train and test alike).
        # NOTE(review): the model is fitted on age cast to int but predicts
        # on the un-cast column - confirm age is already integral.
        df_overall.loc[df_gender.index,
                       'ordered_LR_prediction'] = model_ordinal.predict(
                           df_gender[['age']])

    # Delta grading: observed grading minus the ordinal-regression prediction.

    df_overall['delta_grading_olr'] = df_overall['grading'] - df_overall[
        'ordered_LR_prediction']
Exemplo n.º 8
0
# Shift the labels down by one.
# NOTE(review): presumably so that the classes start at 0 - confirm.
Y = Y - 1

# Hold out the first tenth of the rows as the test part.
n_test = int(len(df) / 10)
Y_train = Y[n_test:]
Y_test = Y[:n_test]
# Candidate feature columns.
X = df[[
    'LineFitGeoSplit1Params.n_hits', 'SplineMPEDirectHitsICB.n_early_strings',
    'SplineMPEDirectHitsICB.n_late_doms', 'SPEFitSingleTimeSplit1.azimuth',
    'ProjectedQ.max_grad_radius_circ_F', 'ProjectedQ.ratio',
    'BestTrackCramerRaoParams.cramer_rao_theta',
    'BestTrackCramerRaoParams.variance_theta',
    'BestTrackCramerRaoParams.variance_x',
    'BestTrackCramerRaoParams.variance_y',
    'BestTrackCramerRaoParams.covariance_theta_y',
    'SplineMPETruncatedEnergy_SPICEMie_DOMS_Muon.energy',
    'SplineMPETruncatedEnergy_SPICEMie_BINS_Muon.energy',
    'SPEFit2TimeSplit1BayesianFitParams.nmini',
    'LineFitTimeSplit2Params.n_hits', 'BestTrackDirectHitsICB.n_dir_pulses',
    'HitStatisticsValues.min_pulse_time', 'SplineMPEDirectHitsICE.n_dir_doms',
    'SplineMPEDirectHitsICE.n_late_strings', 'MPEFit_HVFitParams.nmini'
]]
#'SplineMPECharacteristicsIC.avg_dom_dist_q_tot_dom',
#'MPEFitHighNoiseFitParams.nmini']]
# Yeo-Johnson power transform of all rows.
# NOTE(review): the transform is fitted on train and test rows together.
X_box = power_transform(X, method='yeo-johnson')
X_btrain = X_box[n_test:]  # same split as the labels
X_btest = X_box[:n_test]
# Recursive feature elimination down to 5 features, dropping one per step.
# NOTE(review): the selector is fitted on ALL rows (X_box, Y), not on the
# training part defined above - confirm this is intended.
estimator = LogisticAT()
selector = RFE(estimator, n_features_to_select=5, step=1)
selector.fit(X_box, Y)
print(selector.ranking_)
Exemplo n.º 9
0
    unscore = []
    score = []
    for i in jokes:
        if i['score']:
            score.append(i)
        else:
            unscore.append(i)
    return unscore, score


# Candidate models: plain linear regression, one-vs-rest and multinomial
# logistic regression (both with balanced class weights), and a LogisticAT
# ordinal model with alpha=0.
model_linear = LinearRegression()
model_1vR = LogisticRegression(multi_class='ovr', class_weight='balanced')
model_multi = LogisticRegression(multi_class='multinomial',
                                 solver='lbfgs',
                                 class_weight='balanced')
model_ordinal = LogisticAT(alpha=0)

# Keep the second element returned by organize().
# NOTE(review): presumably the jokes that have a score - confirm against
# organize().
jokes = organize(data)[1]
target = [i['score'] for i in jokes]
jokes = [i['joke'] for i in jokes]

# Build the feature matrix from the tokenised jokes.
tokenizer = TreebankWordTokenizer()
feats, fea_to_idx = pl.get_features(jokes, tokenizer)
mtrx = pl.create_mtrx(jokes, feats, fea_to_idx, tokenizer)

# Mean absolute error as the cross-validation score, over 5 folds.
MAE = make_scorer(mean_absolute_error)
folds = 5

# print(mtrx)
print('Mean absolute error: ')
MAE_linear = cross_val_score(model_linear, mtrx, target, cv=folds, scoring=MAE)
Exemplo n.º 10
0
    # Bin Tail Output Values and Shift
    # Targets below lower_bound are set to lower_bound and targets above
    # upper_bound are set to upper_bound.

    lower_bound = 3
    upper_bound = 24

    y_train_replaced_1 = y_train.where(y_train >= lower_bound, lower_bound)
    y_train_replaced_2 = y_train_replaced_1.where(y_train <= upper_bound,
                                                  upper_bound)

    # Shift so that the smallest binned value becomes 1.
    y_train_shifted = y_train_replaced_2 - (lower_bound - 1)

    #%%

    ### Model Estimation ###

    for clf in [LogisticAT(), LogisticIT()]:

        # The first ten characters of the estimator's repr serve as its label.
        model_str = str(clf)[:10]

        if model_str not in Models:
            Models += [model_str]

        # Fit on the shifted targets, then undo the shift on the predictions.
        y_predict_shifted = clf.fit(X_train, y_train_shifted).predict(X_test)
        y_predict = y_predict_shifted + (lower_bound - 1)

        Results, Measures = performance_summary(y_predict,
                                                y_test,
                                                conf=True,
                                                conf_label='Output/' +
                                                model_str)
        Summary += Results
# -*- coding: utf-8 -*-
"""
Created on Sun May 10 20:25:17 2020

@author: HO18971
"""

from mord import LogisticAT
from utilities import load_task, plot_olr
import pandas as pd

df_task = load_task('phenotype.csv')  # CHANGE THE NAME OF YOUR PHENOTYPE FILE

# Rows with gender == 0: ordinal logistic regression of grading on age with
# alpha=0, then in-sample predictions written back to df_task.
# NOTE(review): the model is fitted on age cast to int but predicts on the
# un-cast column - confirm age is already integral.
model_ordinal_m = LogisticAT(alpha=0)
df_task_original_m = df_task[df_task['gender'] == 0]
model_ordinal_m.fit(df_task_original_m[['age']].astype(int),
                    df_task_original_m['grading'].astype(int))
y_pred_m = model_ordinal_m.predict(df_task_original_m[['age']])
df_task.loc[df_task_original_m.index, 'ordered_LR_prediction'] = y_pred_m

# Same procedure for the rows with gender == 1.
model_ordinal_f = LogisticAT(alpha=0)
df_task_original_f = df_task[df_task['gender'] == 1]
model_ordinal_f.fit(df_task_original_f[['age']].astype(int),
                    df_task_original_f['grading'].astype(int))
y_pred_f = model_ordinal_f.predict(df_task_original_f[['age']])
df_task.loc[df_task_original_f.index, 'ordered_LR_prediction'] = y_pred_f

# Fitted thresholds divided by the single age coefficient.
# NOTE(review): presumably this expresses the class boundaries in units of
# age - confirm against how df_threshold / plot_olr use them.
thresholds_m = model_ordinal_m.theta_ / model_ordinal_m.coef_
thresholds_f = model_ordinal_f.theta_ / model_ordinal_f.coef_

df_threshold = pd.DataFrame(
Exemplo n.º 12
0
# Support vector classifier (default parameters)
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)
clf_pre_svm = clf.predict(X_test)

# Support vector regressor (default parameters)
from sklearn import svm
clf = svm.SVR()
clf.fit(X_train, y_train)
clf_pre_svr = clf.predict(X_test)

# Threshold model: all-threshold ordinal logistic regression
from mord import LogisticAT
logit = LogisticAT()
logit.fit(X_train, y_train)
clf_pre_LogisticAT = logit.predict(X_test)

# Threshold model: immediate-threshold ordinal logistic regression
from mord import LogisticIT
logit = LogisticIT()
logit.fit(X_train, y_train)
clf_pre_LogisticIT = logit.predict(X_test)

# Regression-based ordinal model (ordinal ridge)
from mord import OrdinalRidge
clf = OrdinalRidge()
clf.fit(X_train, y_train)
clf_pre_OrdinalRidge = clf.predict(X_test)
Exemplo n.º 13
0
    # One model per outcome, each using the single VECAMS column as feature;
    # family[i] selects the model type for outcomes[i].
    outcomes = ['Deceased3month', 'DeceasedDisch', 'GOSDisch']
    family = ['binomial', 'binomial', 'ordinal']
    models = {}
    for i, outcome in enumerate(outcomes):
        if family[i] == 'binomial':
            # Unpenalised logistic regression, then sigmoid calibration of
            # the already fitted model (cv='prefit') on the same data.
            model_ = LogisticRegression(penalty='none',
                                        class_weight='balanced',
                                        random_state=random_state,
                                        max_iter=1000)
            model_.fit(df.VECAMS.values.reshape(-1, 1), df[outcome].values)
            model_ = CalibratedClassifierCV(base_estimator=model_,
                                            method='sigmoid',
                                            cv='prefit')
            model_.fit(df.VECAMS.values.reshape(-1, 1), df[outcome].values)
        elif family[i] == 'ordinal':
            # LogisticAT with alpha=0, wrapped in MyCalibrator and fitted
            # again on the same data.
            model_ = LogisticAT(alpha=0)
            model_.fit(df.VECAMS.values.reshape(-1, 1), df[outcome].values)
            model_ = MyCalibrator(model_)
            model_.fit(df.VECAMS.values.reshape(-1, 1), df[outcome].values)
        models[outcome] = model_

    # NOTE(review): `model` is not assigned anywhere in the visible code
    # (the loop above uses `model_` and `models`) - presumably it is defined
    # earlier in the enclosing function; confirm.
    #intercept = model.base_estimator.estimator.intercept_[0]
    scores = model.base_estimator.estimator.coef_[0].astype(int)
    # Collect every sum obtainable from a subset of the positive integer
    # coefficients; the empty subset contributes 0.
    unique_scores = set()
    scores = scores[scores > 0]
    for k in range(0, len(scores) + 1):
        for score_comb in combinations(scores, k):
            unique_scores.add(sum(score_comb))
    unique_scores = sorted(unique_scores)
    print(f'unique_scores = {unique_scores}')
Exemplo n.º 14
0
def _c_grid():
    """Return a fresh grid of ``C`` values shared by the LAD and SVC searches."""
    return {"C": [0.001, 0.01, 1, 10, 100, 1000]}


def _make_lad():
    """Build the LAD estimator used by the 'LAD' and 'COMB' models."""
    from mord import LAD
    return LAD(epsilon=0.0,
               tol=0.0001,
               loss='epsilon_insensitive',
               fit_intercept=True,
               intercept_scaling=1.0,
               dual=True,
               verbose=0,
               random_state=None,
               max_iter=10000)


def _grid_search(estimator, param_grid, scoring=None):
    """Wrap ``estimator`` in the 5-fold grid search used by every model."""
    return GridSearchCV(estimator,
                        param_grid=param_grid,
                        cv=5,
                        scoring=scoring,
                        verbose=0)


def _build_estimator(m):
    """Return ``(estimator, param_grid, scoring, label)`` for model name ``m``.

    ``param_grid`` is None for models that are fitted directly, without a
    grid search.  Imports stay local so that optional packages are needed
    only for the model that uses them.  Raises ValueError for unknown names.
    """
    mae = 'neg_mean_absolute_error'
    if m == 'LAD':
        return _make_lad(), _c_grid(), mae, 'LAD'
    if m == 'MCLog':
        from sklearn.linear_model import LogisticRegression
        mcl = LogisticRegression(multi_class='multinomial',
                                 max_iter=10000,
                                 solver='newton-cg',
                                 fit_intercept=True)
        return mcl, _c_grid(), mae, 'MCLog'
    if m == 'LogAT':  # takes quite some time
        from mord import LogisticAT
        return LogisticAT(), {"alpha": np.linspace(0, 1, 5)}, mae, 'LogAT'
    if m == 'LinearSVC':
        from sklearn.svm import LinearSVC
        return LinearSVC(), _c_grid(), None, 'LinearSVC'
    if m == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        grid = {"n_estimators": [10, 100, 500, 1000]}
        return RandomForestClassifier(), grid, None, 'RFC'
    if m == 'Lasso':
        from sklearn.linear_model import Lasso
        return Lasso(), {"alpha": [10]}, None, 'Lasso'
    if m == 'RFR':
        from sklearn.ensemble import RandomForestRegressor
        # The default criterion is the squared error under either of its
        # names ('mse' / 'squared_error'), so it is not spelled out.
        return RandomForestRegressor(), {"n_estimators": [500]}, None, 'RFR'
    if m == 'RR':
        from sklearn.linear_model import Ridge
        grid = {
            'alpha':
            [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        }
        return Ridge(), grid, None, 'Ridge Regression'
    if m == 'PLSR':
        from sklearn.cross_decomposition import PLSRegression
        grid = {'n_components': list(range(1, 21))}
        return PLSRegression(), grid, None, 'PLS Regression'
    if m == 'RVM':
        from skrvm import RVR
        print('in RVM')
        return RVR(kernel='linear'), None, None, 'RVM'
    if m == 'DTR':
        from sklearn.tree import DecisionTreeRegressor
        return DecisionTreeRegressor(), None, None, 'DTR'
    raise ValueError('unknown model: {!r}'.format(m))


def _train_comb(m, x_train, y_train, x_test, y_test):
    """Train the 'COMB' stack: one broad LAD, one LAD per age group, and a
    random forest fitted on the output of ``make_predictions``."""
    from sklearn.ensemble import RandomForestRegressor
    from group_pred import create_age_groups
    print('IN COMB')
    group_lad = dict()

    print('shapes', x_train.shape, y_train.shape)

    mae = 'neg_mean_absolute_error'
    broad_lad = _grid_search(_make_lad(), _c_grid(), scoring=mae)

    # The LAD models are fitted on targets rounded to integers.
    y_train_r = y_train.astype(float).round().astype(int)
    broad_lad.fit(x_train, y_train_r)

    age_group_all = create_age_groups(y_train_r, 10, 5)

    for ages in age_group_all:
        # Positions of the training samples whose rounded target is one of
        # the ages in this group.
        idx_grp = [
            idx for item in ages for idx, val in enumerate(y_train_r)
            if val == item
        ]

        print('group info', ages, len(idx_grp))
        # Groups with five samples or fewer get no model of their own.
        if len(idx_grp) > 5:
            key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
            specific_lad = _grid_search(_make_lad(), _c_grid(), scoring=mae)
            specific_lad.fit(x_train[idx_grp], y_train_r[idx_grp])
            group_lad[key_age_grp] = specific_lad

    print('len_groups', len(group_lad))
    pred_all = make_predictions(x_train, broad_lad, group_lad)

    # Second stage: random forest on the first-stage predictions, fitted
    # against the original (unrounded) targets.
    model_2 = _grid_search(RandomForestRegressor(), {"n_estimators": [500]})
    model_2.fit(pred_all, y_train)

    train_var = predict(m, model_2, pred_all, y_train)
    print("[INFO] RFR grid search best parameters: {}".format(
        model_2.best_params_))

    pred_all_test = make_predictions(x_test, broad_lad, group_lad)
    pred_var = predict(m, model_2, pred_all_test, y_test)
    model = [broad_lad, group_lad, model_2]
    return model, model_2.best_score_, model_2.best_params_, pred_var, train_var


def train(m, x_train, y_train, x_test, y_test):
    """Train the model named ``m`` and evaluate it on both data splits.

    Parameters
    ----------
    m : model name, one of 'LAD', 'MCLog', 'LogAT', 'LinearSVC', 'RFC',
        'Lasso', 'RFR', 'RR', 'PLSR', 'RVM', 'DTR' or 'COMB'.
    x_train, y_train : training features and targets.
    x_test, y_test : test features and targets.

    Returns
    -------
    tuple ``(model, best_score, best_params, pred_var, train_var)``
        ``model`` is the fitted estimator or grid search; for 'COMB' it is
        the list ``[broad_lad, group_lad, model_2]``.  ``best_score`` and
        ``best_params`` come from the grid search and are ``0`` for 'RVM'
        and 'DTR', which are fitted without one.  ``pred_var`` and
        ``train_var`` are the results of ``predict`` on the test and
        training data.

    Raises
    ------
    ValueError
        If ``m`` is not a known model name.

    Notes
    -----
    The final dispatch used to read ``if m == 'RVM' or 'DTR'``, which is
    always true, so every model returned ``0, 0`` instead of its grid-search
    score and parameters.  That is fixed here.
    """
    print('training', m)

    if m == 'COMB':
        return _train_comb(m, x_train, y_train, x_test, y_test)

    estimator, param_grid, scoring, label = _build_estimator(m)

    if m == 'LAD':
        # LAD is fitted and scored on targets rounded to integers.
        y_train = y_train.astype(float).round().astype(int)

    if param_grid is None:
        model = estimator
    else:
        model = _grid_search(estimator, param_grid, scoring=scoring)

    model.fit(x_train, y_train)
    train_var = predict(m, model, x_train, y_train)
    pred_var = predict(m, model, x_test, y_test)

    if param_grid is None:
        return model, 0, 0, pred_var, train_var

    print("[INFO] {} grid search best parameters: {}".format(
        label, model.best_params_))
    return model, model.best_score_, model.best_params_, pred_var, train_var
from sklearn.metrics import accuracy_score

# Load the survey data from the local course folder.
wvs = pd.read_csv(
    "C:/Datasets_BA/360DigiTMG/DS_India/360DigiTMG DS India Module wise PPTs/Module 10b Ordinal Logistic Regression/wvs.csv"
)
wvs.head()

# Quick exploratory look (these expressions only show output in an
# interactive session).
wvs.describe()
wvs.columns

# Replace each listed column with integer codes. The single encoder is refit
# for every column, so afterwards it only remembers the last one ("gender").
lb = LabelEncoder()
for column_name in ("poverty", "religion", "degree", "country", "gender"):
    wvs[column_name] = lb.fit_transform(wvs[column_name])

from mord import LogisticAT

# The first column is used as the response, every other column as a predictor.
wvs_features = wvs.iloc[:, 1:]
wvs_target = wvs.iloc[:, 0]

# alpha parameter set to zero to perform no regularisation.
model = LogisticAT(alpha=0).fit(wvs_features, wvs_target)
model.coef_
model.classes_

# Predictions on the same rows the model was trained on.
predict = model.predict(wvs_features)

# Accuracy of the training predictions.
accuracy_score(wvs_target, predict)
Exemplo n.º 16
0
        return df.drop(depVar, axis=1).iloc[:, columns]
    return df.drop(depVar, axis=1)

from sklearn.linear_model import LinearRegression, LogisticRegression
from mord import LogisticAT, LogisticIT
from sklearn import preprocessing

# Build every candidate estimator once; the names below are used individually
# as well as through the `models` / `models_str` lists.
model_linear = LinearRegression()
model_1vR = LogisticRegression(
    multi_class='ovr',
    class_weight='balanced',
)
model_multi = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    class_weight='balanced',
    max_iter=760,
)
model_ordinal_IT = LogisticIT()
# alpha parameter set to zero to perform no regularisation
model_ordinal = LogisticAT(alpha=0)

# Keep each display label next to its estimator so the two lists cannot drift
# out of step.
_labelled_models = (
    ("Linear Regression", model_linear),
    ("Logistic Regression (one vs. rest)", model_1vR),
    ("Logistic Regression (multinomial)", model_multi),
    ("Ordered Logistic Regression AT", model_ordinal),
    ("Ordered Logistic Regression IT", model_ordinal_IT),
)
models = [estimator for _, estimator in _labelled_models]
models_str = [label for label, _ in _labelled_models]

# Preprocessing tools.
scaler = preprocessing.StandardScaler()

def trim_correlated(df_in, threshold, dependent_var):
    """
    Drop columns that are strongly correlated with an earlier column.

    A column is removed when the absolute Pearson correlation between it and
    any column that comes before it in ``df_in`` is greater than ``threshold``.
    The names of the surviving columns are printed.

    Params:
        df_in = DataFrame holding the independent variables
        threshold = absolute correlation above which a column is dropped
        dependent_var = object joined onto the trimmed frame with DataFrame.join

    Returns:
        the trimmed DataFrame joined with dependent_var
    """
    corr = df_in.corr(method='pearson', min_periods=1)

    # Hide the diagonal and everything below it, so each pair is looked at
    # once and only from the later column's point of view.
    size = len(corr)
    diagonal_and_below = np.tril(np.ones((size, size), dtype=bool))
    pairwise_strength = corr.mask(diagonal_and_below).abs()

    # Per column: does any earlier column exceed the threshold?
    exceeds_threshold = (pairwise_strength > threshold).any()
    keep_flags = ~exceeds_threshold
    kept_columns = keep_flags[keep_flags].index

    df_out = df_in[kept_columns]
    print("Uncorrelated independent variables:")
    print(*df_out.columns, sep=', ')
    return df_out.join(dependent_var)
Exemplo n.º 17
0
def grid_search(evaluation, features, labels, penalty_weights, algorithm,
                num_jobs, **options):
    """
    Run a stratified 5-fold grid search for the chosen algorithm.

    Expects the features to be scaled for svm and knn.

    Params:
        evaluation = passed to utils.create_scorer to build the scoring object
        features = training features; features.shape[0] sizes the SGD
            iteration count
        labels = training labels
        penalty_weights = forwarded as class_weight to the models that take it
        algorithm = one of 'kernel-svm', 'linear-svm', 'logistic',
            'sgd-logistic', 'sgd-svm', 'random-forest', 'knn',
            'ridgeclassifier', 'logisticse', 'logisticit', 'logisticat',
            'ordinalridge', 'lad'
        num_jobs = n_jobs handed to GridSearchCV
        options = accepted for call compatibility; not used here

    Returns:
        the fitted GridSearchCV object

    Terminates through sys.exit when the algorithm name is not recognised.
    """
    # Cost grid shared by the SVM and LAD searches.
    cost_grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

    # Set the parameters for grid search and model based on algorithm choice
    if algorithm == 'kernel-svm':
        gamma_grid = [0.1, 0.01, 0.001, 0.0001]
        shape_grid = ['ovo', 'ovr']
        tuned_parameters = [{
            'kernel': ['rbf'],
            'gamma': gamma_grid,
            'decision_function_shape': shape_grid,
            'C': cost_grid
        }, {
            'kernel': ['sigmoid'],
            'gamma': gamma_grid,
            'decision_function_shape': shape_grid,
            'C': cost_grid
        }, {
            'kernel': ['poly'],
            'degree': [2, 3],
            'decision_function_shape': shape_grid,
            'C': cost_grid
        }]
        model = svm.SVC(tol=0.05,
                        cache_size=6000,
                        class_weight=penalty_weights)

    elif algorithm == 'linear-svm':
        tuned_parameters = [{
            'loss': ['hinge', 'squared_hinge'],
            'multi_class': ['ovr'],
            'C': cost_grid
        }]
        model = svm.LinearSVC(tol=0.05,
                              max_iter=5000,
                              class_weight=penalty_weights)

    elif algorithm == 'logistic':
        # newton, lbfgs only support L2
        costs_list = (10.0**numpy.arange(-6, 5)).tolist()
        tuned_parameters = [{
            'multi_class': ['ovr'],
            'solver': ['liblinear'],
            'penalty': ['l1', 'l2'],
            'C': costs_list
        }, {
            'multi_class': ['multinomial'],
            'solver': ['lbfgs'],
            'penalty': ['l2'],
            'C': costs_list
        }]
        model = LogisticRegression(tol=0.005,
                                   max_iter=5000,
                                   class_weight=penalty_weights)

    elif algorithm in ('sgd-logistic', 'sgd-svm'):
        alphas_list = (10.0**numpy.arange(-8, 1)).tolist()
        tuned_parameters = [{
            'penalty': ['l1', 'l2'],
            'alpha': alphas_list
        }, {
            'penalty': ['elasticnet'],
            'alpha': alphas_list,
            'l1_ratio': [0.005, 0.01, 0.05, 0.1, 0.2, 0.4, 0.6]
        }]
        # loss should be log for the logistic classifier and hinge for the
        # linear svm classifier.
        sgd_loss = 'log' if algorithm == 'sgd-logistic' else 'hinge'
        # numpy.ceil returns a float; the iteration count must be an integer.
        n_iter = int(numpy.ceil(5 * (10**6) / features.shape[0]))
        # We don't set n_jobs since grid search will use the cores
        model = SGDClassifier(loss=sgd_loss,
                              class_weight=penalty_weights,
                              n_iter=n_iter,
                              n_jobs=1)

    elif algorithm == 'random-forest':
        # Each min_samples_split value is searched with its own set of
        # min_samples_leaf values; everything else is common to all grids.
        leaf_sizes_by_split = [
            (2, [1]),
            (5, [1, 2]),
            (10, [2, 5]),
            (20, [5, 10]),
            (50, [5, 15, 25]),
        ]
        tuned_parameters = [{
            'n_estimators': [100],
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 0.4, 0.8],
            'min_samples_split': [min_split],
            'min_samples_leaf': leaf_sizes
        } for min_split, leaf_sizes in leaf_sizes_by_split]
        model = RandomForestClassifier(class_weight=penalty_weights)

    elif algorithm == 'knn':
        tuned_parameters = [{
            'n_neighbors':
            [1, 2, 3, 4, 5, 10, 15, 20, 30, 50, 70, 100, 150, 200],
            'metric': ['euclidean', 'manhattan', 'chebyshev'],
            'algorithm': ['ball_tree', 'kd_tree'],
            'weights': ['uniform', 'distance']
        }]
        model = KNeighborsClassifier()

    elif algorithm == 'ridgeclassifier':
        alphas_list = (10.0**numpy.arange(-5, 5)).tolist()
        tuned_parameters = [{'alpha': alphas_list, 'normalize': [True, False]}]
        model = RidgeClassifier(max_iter=10000, class_weight=penalty_weights)

    # The ordinal branches stay separate so that only the selected class name
    # has to be available when the function runs.
    elif algorithm == 'logisticse':
        alphas_list = (10.0**numpy.arange(-5, 5)).tolist()
        tuned_parameters = [{'alpha': alphas_list}]
        model = LogisticSE(max_iter=10000)

    elif algorithm == 'logisticit':
        alphas_list = (10.0**numpy.arange(-5, 5)).tolist()
        tuned_parameters = [{'alpha': alphas_list}]
        model = LogisticIT(max_iter=10000)

    elif algorithm == 'logisticat':
        alphas_list = (10.0**numpy.arange(-5, 5)).tolist()
        tuned_parameters = [{'alpha': alphas_list}]
        model = LogisticAT(max_iter=10000)

    elif algorithm == 'ordinalridge':
        alphas_list = (10.0**numpy.arange(-5, 5)).tolist()
        tuned_parameters = [{'alpha': alphas_list}]
        model = OrdinalRidge(max_iter=10000)

    elif algorithm == 'lad':
        tuned_parameters = [{
            'loss': ['l1', 'l2'],
            'C': cost_grid
        }]
        model = LAD(max_iter=3000)

    else:
        sys.exit('Invalid algorithm: ' + algorithm + ' provided')

    scorer = utils.create_scorer(evaluation)
    skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
    # Don't pre dispatch all jobs at once, only dispatch the ones that are
    # running so memory usage does not blow up
    clf = GridSearchCV(estimator=model,
                       param_grid=tuned_parameters,
                       n_jobs=num_jobs,
                       pre_dispatch="n_jobs",
                       cv=skf,
                       scoring=scorer)

    clf.fit(features, labels)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Best Grid Search Parameters are: " + str(clf.best_params_))
    print("Best Grid Search CV Score: " + str(clf.best_score_))

    return clf
Exemplo n.º 18
0
 def fit(self, X, y):
     """
     Fit the recalibration mapper on this object's own predictions.

     The output of self.predict(X) is reshaped into a single column and used
     as the only feature of a LogisticAT(alpha=0) model trained against y.
     The fitted model is stored on self.recalibration_mapper.

     Returns self.
     """
     scores = self.predict(X)
     score_column = scores.reshape(-1, 1)
     mapper = LogisticAT(alpha=0)
     self.recalibration_mapper = mapper.fit(score_column, y)
     return self
Exemplo n.º 19
0
        # Get mapping from labels to classes
        [
            print('{} is Column: {}'.format(item, num))
            for num, item in enumerate(encoder.classes_)
        ]
        train_data = train_data.drop(train_data.columns[0], axis=1)
        test_data = test_data.drop(test_data.columns[0], axis=1)
        #CHECK SHAPE
        print("shape: ", test_data.shape, train_data.shape)
        #train_y = np.reshape(train_y.values,(-1,4))
        #TRAIN MODELS
        #DNN
        # model =  tflearn.DNN(network, tensorboard_verbose=0)
        #model.fit(train_data.values, train_y,  show_metric = True, batch_size=10)
        #oc
        oc1 = LogisticAT()
        oc2 = LogisticIT(alpha=0.1)
        oc3 = LAD()
        #oc = GradientBoostingClassifier(max_depth=3,n_estimators=350, learning_rate = 0.05,subsample=0.9, max_leaf_nodes=30000)
        oc1.fit(train_data.values, train_y_oc)
        oc2.fit(train_data.values, train_y_oc)
        oc3.fit(train_data.values, train_y_oc)
        #PREDICT
        predictions_oc1 = oc1.predict(test_data.values)
        predictions_oc2 = oc2.predict(test_data.values)
        predictions_oc3 = oc3.predict(test_data.values)

        #predictions_dnn = model.predict(test_data.values)
        #predictions_dnn = [item for sublist in predictions_dnn for item in sublist]
        #avg
        #predictions = np.mean([predictions_oc, predictions_dnn], axis = 0)