class MyCalibrator:
    """Recalibrate an ordinal classifier through its expected class index.

    The wrapped estimator's class probabilities are collapsed into one score
    per sample (the probability-weighted class index). A ``LogisticAT`` model
    fitted on that single score produces the recalibrated probabilities.
    """

    def __init__(self, base_estimator):
        # Must expose ``classes_`` and ``predict_proba`` (both used in predict).
        self.base_estimator = base_estimator

    def fit(self, X, y):
        """Fit the recalibration mapper on the expected class index of ``X``."""
        expected_index = self.predict(X)
        mapper = LogisticAT(alpha=0)
        self.recalibration_mapper = mapper.fit(
            expected_index.reshape(-1, 1), y)
        return self

    def predict(self, X, z=None):
        """Return the probability-weighted class index for each row of ``X``.

        ``z`` is forwarded to the base estimator's ``predict_proba`` only when
        it is not ``None``.
        """
        n_classes = len(self.base_estimator.classes_)
        extra_kwargs = {} if z is None else {'z': z}
        probabilities = self.base_estimator.predict_proba(X, **extra_kwargs)
        class_indices = np.arange(n_classes)
        return np.sum(probabilities * class_indices, axis=1)

    def predict_proba(self, X, z=None):
        """Return recalibrated class probabilities for ``X``."""
        score_column = self.predict(X, z=z).reshape(-1, 1)
        return self.recalibration_mapper.predict_proba(score_column)
def general_explanation_using_skater(all_roles_scores, labels_training_set,
                                     labels_test_set, df_train_set,
                                     df_test_set, alpha):
    '''
    Compute skater feature importances for a mord LogisticAT model
    ----------------------------------------------------------------
    Params:
        all_roles_scores = every mark present in the train and test sets,
                           used to fit the label encoder
        labels_training_set = marks of the training rows
        labels_test_set = marks of the test rows
        df_train_set = training features (DataFrame)
        df_test_set = test features
        alpha = regularisation strength passed to LogisticAT
    Returns:
        the result of skater's feature_importance(), sorted ascending
    '''
    encoder = preprocessing.LabelEncoder()
    encoder.fit(all_roles_scores)
    encoded_train = encoder.transform(labels_training_set)
    # The encoded test labels are not used afterwards; the call is kept
    # because it still runs the encoder over the test labels.
    encoder.transform(labels_test_set)

    ordinal_model = LogisticAT(alpha=alpha)
    ordinal_model.fit(df_train_set.values, encoded_train)
    # The test predictions are not used for the importances either.
    ordinal_model.predict(df_test_set)

    column_names = list(df_train_set.columns)
    interpreter = Interpretation(df_train_set, feature_names=column_names)
    # skater wraps the probability function; the first ten training rows
    # are passed as examples.
    wrapped_model = InMemoryModel(ordinal_model.predict_proba,
                                  examples=df_train_set[:10])
    return interpreter.feature_importance.feature_importance(
        wrapped_model, ascending=True)
def model_train(features, target, model='mordcat'):
    """Grid-search an ordinal or multiclass model and return the fitted search.

    Parameters
    ----------
    features : training feature matrix passed to ``GridSearchCV.fit``.
    target : training labels passed to ``GridSearchCV.fit``.
    model : ``'mordcat'`` for a mord ``LogisticAT`` searched over ``alpha``,
        or ``'catboost'`` for a ``CatBoostClassifier`` searched over
        ``l2_leaf_reg``.

    Returns
    -------
    The fitted ``GridSearchCV`` object (5-fold CV, 4 parallel jobs).

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    if model == 'mordcat':
        estimator = LogisticAT()
        param_grid = {
            'alpha': [0, 0.01, 0.05, 0.1, 0.3, 0.8, 1, 1.3],
            'max_iter': [100]
        }
    elif model == 'catboost':
        estimator = CatBoostClassifier(loss_function='MultiClass',
                                       eval_metric='TotalF1')
        param_grid = {'l2_leaf_reg': [0.5, 1.0, 2.0, 3.0, 4.5, 5.0]}
    else:
        # Previously an unknown name left param_grid unbound and the raw
        # string was handed to GridSearchCV as if it were an estimator.
        raise ValueError(
            "unknown model {!r}; expected 'mordcat' or 'catboost'".format(
                model))

    model_ordinal = GridSearchCV(estimator,
                                 cv=5,
                                 param_grid=param_grid,
                                 n_jobs=4)
    model_ordinal.fit(features, target)
    print("Model trained")
    print(model_ordinal.best_params_)
    return model_ordinal
# normalize Xmean = Xtr.mean(axis=0) Xstd = Xtr.std(axis=0) Xtr = (Xtr - Xmean) / Xstd Xte = (Xte - Xmean) / Xstd # define model if family == 'binomial': model = LogisticRegression(penalty='l2', class_weight='balanced', random_state=random_state + 1, max_iter=1000) model_params = {'C': [0.1, 1, 10, 100]} metric = 'balanced_accuracy' elif family == 'ordinal': model = LogisticAT(max_iter=1000) model_params = {'alpha': [0.01, 0.1, 1, 10]} metric = 'balanced_accuracy' else: raise NotImplementedError(family) # fit model if params.get((X_type, y_type, cvi)) is None: model = GridSearchCV(model, model_params, scoring=metric, n_jobs=n_jobs, refit=True, cv=Ncv) model.fit(Xtr, ytr) params[(X_type, y_type, cvi)] = model.best_params_
def locals_explanation_using_shap(mode, all_score, labels_training_set,
                                  labels_test_set, a, train_set, test_set,
                                  position, integral_test_set):
    '''
    Explain every test-set prediction of a mord LogisticAT model with SHAP.

    :param mode: 'save' computes the SHAP values and pickles them; any other
        value loads the previously pickled values instead
    :param all_score: all the scores from train set and test set (used to
        fit the label encoder)
    :param labels_training_set: labels of the training rows
    :param labels_test_set: labels of the test rows
    :param a: alpha parameter for mord ordinal regression
    :param train_set: training features (DataFrame)
    :param test_set: test features
    :param position: string inserted in the name of the pickle file
    :param integral_test_set: test set without robust scaler application
    :return:
        shap explainer
        list of shap values
        list of predictions from test set (encoded)
        list of real predictions from test set (decoded labels)
        list of explanations, one dict per prediction, mapping a feature name
            to its unscaled test value
    '''
    shap_file = "mord_shap_values_" + position + "without_ratings.txt"

    # The model pipeline is identical in both modes, so it is built once.
    le = preprocessing.LabelEncoder()
    le.fit(all_score)
    train_encoded_values = le.transform(labels_training_set)
    # Result unused; kept so the test labels are still run through the encoder.
    le.transform(labels_test_set)
    model_ordinal = LogisticAT(alpha=a)
    model_ordinal.fit(train_set.values, train_encoded_values)
    predictions = model_ordinal.predict(test_set)
    real_predictions = le.inverse_transform(predictions)

    # explain all the predictions in the test set
    explainer = shap.KernelExplainer(model_ordinal.predict_proba, train_set)
    if mode == 'save':
        shap_values = explainer.shap_values(test_set)
        with open(shap_file, "wb") as fp:
            pickle.dump(shap_values, fp)
    else:
        # SECURITY NOTE: pickle.load can execute arbitrary code; only load
        # files that this function wrote itself.
        with open(shap_file, "rb") as fp:
            shap_values = pickle.load(fp)

    list_of_explanation = []
    for row, predicted_class in enumerate(predictions):
        # SHAP values of this row for the class that was actually predicted
        importance_list = shap_values[predicted_class][row, :]
        # keep the features pushing towards the prediction by at least 0.01
        # (this also implies a strictly positive contribution)
        kept = [(col, value) for col, value in enumerate(importance_list)
                if value >= 0.01]
        # strongest contribution first; the sort is stable, so ties stay in
        # column order
        kept.sort(key=lambda pair: pair[1], reverse=True)
        list_of_explanation.append({
            train_set.columns[col]: integral_test_set.iloc[row, col]
            for col, _ in kept
        })
    return (explainer, shap_values, predictions, real_predictions,
            list_of_explanation)
def trainordinalregressor(df, listMarks, listRoles, path):
    """
    Train a mord ordinal regression model for each newspaper and role.

    Parameters
    ----------
    df : the dataset of player features.
    listMarks: the names of the mark fields (one per newspaper) to train on
    listRoles: list of roles name ['A', 'C', 'D', 'P']
    path: where to store the models (used as a string prefix)

    Returns
    -------
    results dictionaries:
        distributionPerNewspaper: for each newspaper, the lists of true and
        cross-validated predicted marks (one array per role)
        resultPerRole: for each '<newspaper>_<role>' key, the metrics
        'r', 'RSME', 'Accuracy', 'KS' and 'r2'
    """
    resultPerRole = {}
    distributionPerNewspaper = {}

    # for each newspaper
    for newspaper in listMarks:
        distributionPerNewspaper[newspaper] = {'true': [], 'pred': []}

        # for each role
        for role in listRoles:
            if newspaper != 'fantacalcio_score':
                role_column = 'player_role_newspaper'
            else:
                role_column = 'player_role_fantacalcio'
            # Explicit copy: the columns below are overwritten, which must
            # not be done on a filtered view of the caller's frame.
            subDF = df[df[role_column] == role].copy()

            # extract and transform categorical values
            le_teams = preprocessing.LabelEncoder()
            subDF['contextual_against_club_name'] = le_teams.fit_transform(
                subDF['contextual_against_club_name'])
            subDF['contextual_club_name'] = le_teams.transform(
                subDF['contextual_club_name'])
            le_country = preprocessing.LabelEncoder()
            subDF['country'] = le_country.fit_transform(subDF['country'])

            # drop specific corriere marks for some roles
            if newspaper == 'corriere_score':
                subDF = subDF[subDF['corriere_score'] != 10]
            if newspaper == 'corriere_score' and role == 'D':
                subDF = subDF[subDF['corriere_score'] != 8]
                subDF = subDF[subDF['corriere_score'] != 3.5]
            if newspaper == 'corriere_score' and role == 'P':
                subDF = subDF[subDF['corriere_score'] != 9]

            # extract and encode labels
            le = preprocessing.LabelEncoder()
            le.fit(subDF[newspaper])
            labels = le.transform(subDF[newspaper])

            # toRemove / toRemoveRatings are column lists defined elsewhere
            # in the module.
            for el in toRemove:
                del subDF[el]
            # train without ratings variables
            for el in toRemoveRatings:
                del subDF[el]

            stringMatch = newspaper + '_' + role
            resultPerRole[stringMatch] = {}
            print(stringMatch)

            # rescale with the robust scaler
            robust = preprocessing.RobustScaler()
            robust.fit(subDF)
            features = robust.transform(subDF)
            target = labels

            # kfold definition
            kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=17)

            # OUR OBJECTIVE IS TO INCREASE THE R SCORE
            rscore = make_scorer(pearsonr_fun, greater_is_better=True)

            # define the grid search
            svr = GridSearchCV(
                LogisticAT(),
                scoring=rscore,
                cv=kfold,
                param_grid={
                    'alpha':
                    [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
                },
                verbose=1)
            svr.fit(features, target)
            print("Best Score: {}".format(svr.best_score_))
            print("Best params: {}".format(svr.best_params_))
            resultPerRole[stringMatch]['r'] = svr.best_score_

            model_ordinal = LogisticAT(alpha=svr.best_params_['alpha'])
            y_pred = cross_val_predict(model_ordinal,
                                       features,
                                       target,
                                       cv=kfold)
            resultPerRole[stringMatch]['RSME'] = math.sqrt(
                mean_squared_error(le.inverse_transform(y_pred),
                                   le.inverse_transform(target)))
            resultPerRole[stringMatch]['Accuracy'] = acc_fun(target, y_pred)
            resultPerRole[stringMatch]['KS'] = ks_fun(target, y_pred, le)
            resultPerRole[stringMatch]['r2'] = r2_fun(target, y_pred)

            le_name_mapping = dict(zip(le.classes_,
                                       le.transform(le.classes_)))
            print(le_name_mapping)
            label_counts = Counter(target)
            print(label_counts.keys())
            print(label_counts.values())

            # cross_val_predict only fits clones, so model_ordinal itself is
            # still unfitted here. Fit it on all rows before persisting;
            # otherwise the stored model cannot predict.
            model_ordinal.fit(features, target)
            # NOTE(review): the file name depends on the role only, so each
            # newspaper overwrites the models of the previous one.
            path_inserted = path + 'mord_' + role + '.joblib'
            dump(model_ordinal, path_inserted)

            distributionPerNewspaper[newspaper]['true'].append(
                le.inverse_transform(target))
            distributionPerNewspaper[newspaper]['pred'].append(
                le.inverse_transform(y_pred))
    return distributionPerNewspaper, resultPerRole
# Split the original row labels of df_gender (the 'index' column created by
# reset_index) into train and test, stratified on y_age.
idx_train, idx_test = train_test_split(df_gender.reset_index()['index'],
                                       test_size=test_size,
                                       stratify=y_age,
                                       random_state=seed)
# Persist both index lists under Results_<seed>/ so the split can be reused.
# NOTE(review): the directory is assumed to exist already -- confirm.
pd.DataFrame(idx_train).to_csv(
    os.path.join('Results_' + str(seed), 'train_' + sex + '.csv'))
pd.DataFrame(idx_test).to_csv(
    os.path.join('Results_' + str(seed), 'test_' + sex + '.csv'))

# ordinal logistic regression fit (alpha=0: no regularisation), on the
# training rows only, with age as the single predictor
model_ordinal = LogisticAT(alpha=0)
df_gender_train = df_gender.loc[idx_train]
model_ordinal.fit(df_gender_train[['age']].astype(int),
                  df_gender_train['grading'].astype(int))
# Predictions are written for ALL rows of df_gender (train and test).
# NOTE(review): age is cast to int for fitting but not for predicting.
df_overall.loc[df_gender.index,
               'ordered_LR_prediction'] = model_ordinal.predict(
                   df_gender[['age']])

# compute delta grading: observed grading minus the model prediction
df_overall['delta_grading_olr'] = df_overall['grading'] - df_overall[
    'ordered_LR_prediction']
# Shift the labels down by one (presumably from 1-based to 0-based classes
# -- TODO confirm against where Y is built).
Y = Y - 1
# The first tenth of the rows is the test split, the rest is training.
n_test = int(len(df) / 10)
Y_train = Y[n_test:]
Y_test = Y[:n_test]
# Candidate feature columns.
X = df[[
    'LineFitGeoSplit1Params.n_hits',
    'SplineMPEDirectHitsICB.n_early_strings',
    'SplineMPEDirectHitsICB.n_late_doms',
    'SPEFitSingleTimeSplit1.azimuth',
    'ProjectedQ.max_grad_radius_circ_F',
    'ProjectedQ.ratio',
    'BestTrackCramerRaoParams.cramer_rao_theta',
    'BestTrackCramerRaoParams.variance_theta',
    'BestTrackCramerRaoParams.variance_x',
    'BestTrackCramerRaoParams.variance_y',
    'BestTrackCramerRaoParams.covariance_theta_y',
    'SplineMPETruncatedEnergy_SPICEMie_DOMS_Muon.energy',
    'SplineMPETruncatedEnergy_SPICEMie_BINS_Muon.energy',
    'SPEFit2TimeSplit1BayesianFitParams.nmini',
    'LineFitTimeSplit2Params.n_hits',
    'BestTrackDirectHitsICB.n_dir_pulses',
    'HitStatisticsValues.min_pulse_time',
    'SplineMPEDirectHitsICE.n_dir_doms',
    'SplineMPEDirectHitsICE.n_late_strings',
    'MPEFit_HVFitParams.nmini'
]]
# Two further candidate columns, currently disabled:
#'SplineMPECharacteristicsIC.avg_dom_dist_q_tot_dom',
#'MPEFitHighNoiseFitParams.nmini']]
# Yeo-Johnson power transform of all features, applied before splitting.
X_box = power_transform(X, method='yeo-johnson')
X_btrain = X_box[n_test:]  # splitting the transformed features
X_btest = X_box[:n_test]
# Recursive feature elimination with an ordinal logistic model, removing one
# feature per step until five remain.
# NOTE(review): the selector is fitted on all rows (X_box, Y), not only on
# the training slice -- confirm this is intended.
estimator = LogisticAT()
selector = RFE(estimator, n_features_to_select=5, step=1)
selector.fit(X_box, Y)
print(selector.ranking_)
unscore = [] score = [] for i in jokes: if i['score']: score.append(i) else: unscore.append(i) return unscore, score model_linear = LinearRegression() model_1vR = LogisticRegression(multi_class='ovr', class_weight='balanced') model_multi = LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight='balanced') model_ordinal = LogisticAT(alpha=0) jokes = organize(data)[1] target = [i['score'] for i in jokes] jokes = [i['joke'] for i in jokes] tokenizer = TreebankWordTokenizer() feats, fea_to_idx = pl.get_features(jokes, tokenizer) mtrx = pl.create_mtrx(jokes, feats, fea_to_idx, tokenizer) MAE = make_scorer(mean_absolute_error) folds = 5 # print(mtrx) print('Mean absolute error: ') MAE_linear = cross_val_score(model_linear, mtrx, target, cv=folds, scoring=MAE)
# Bin Tail Output Values and Shift lower_bound = 3 upper_bound = 24 y_train_replaced_1 = y_train.where(y_train >= lower_bound, lower_bound) y_train_replaced_2 = y_train_replaced_1.where(y_train <= upper_bound, upper_bound) y_train_shifted = y_train_replaced_2 - (lower_bound - 1) #%% ### Model Estimation ### for clf in [LogisticAT(), LogisticIT()]: model_str = str(clf)[:10] if model_str not in Models: Models += [model_str] y_predict_shifted = clf.fit(X_train, y_train_shifted).predict(X_test) y_predict = y_predict_shifted + (lower_bound - 1) Results, Measures = performance_summary(y_predict, y_test, conf=True, conf_label='Output/' + model_str) Summary += Results
# -*- coding: utf-8 -*- """ Created on Sun May 10 20:25:17 2020 @author: HO18971 """ from mord import LogisticAT from utilities import load_task, plot_olr import pandas as pd df_task = load_task('phenotype.csv') # CHANGE THE NAME OF YOUR PHENOTYPE FILE model_ordinal_m = LogisticAT(alpha=0) df_task_original_m = df_task[df_task['gender'] == 0] model_ordinal_m.fit(df_task_original_m[['age']].astype(int), df_task_original_m['grading'].astype(int)) y_pred_m = model_ordinal_m.predict(df_task_original_m[['age']]) df_task.loc[df_task_original_m.index, 'ordered_LR_prediction'] = y_pred_m model_ordinal_f = LogisticAT(alpha=0) df_task_original_f = df_task[df_task['gender'] == 1] model_ordinal_f.fit(df_task_original_f[['age']].astype(int), df_task_original_f['grading'].astype(int)) y_pred_f = model_ordinal_f.predict(df_task_original_f[['age']]) df_task.loc[df_task_original_f.index, 'ordered_LR_prediction'] = y_pred_f thresholds_m = model_ordinal_m.theta_ / model_ordinal_m.coef_ thresholds_f = model_ordinal_f.theta_ / model_ordinal_f.coef_ df_threshold = pd.DataFrame(
# svc from sklearn import svm clf = svm.SVC() clf.fit(X_train, y_train) clf_pre_svm = clf.predict(X_test) # svr from sklearn import svm clf = svm.SVR() clf.fit(X_train, y_train) clf_pre_svr = clf.predict(X_test) # Threshold model from mord import LogisticAT logit = LogisticAT() logit.fit(X_train, y_train) clf_pre_LogisticAT = logit.predict(X_test) # Threshold model from mord import LogisticIT logit = LogisticIT() logit.fit(X_train, y_train) clf_pre_LogisticIT = logit.predict(X_test) # regression ordianl from mord import OrdinalRidge clf = OrdinalRidge() clf.fit(X_train, y_train) clf_pre_OrdinalRidge = clf.predict(X_test)
# One calibrated model per outcome, each using the single VECAMS score as
# predictor. `family` is aligned with `outcomes` by position.
outcomes = ['Deceased3month', 'DeceasedDisch', 'GOSDisch']
family = ['binomial', 'binomial', 'ordinal']
models = {}
for i, outcome in enumerate(outcomes):
    if family[i] == 'binomial':
        # Unpenalised, class-balanced logistic regression ...
        model_ = LogisticRegression(penalty='none',
                                    class_weight='balanced',
                                    random_state=random_state,
                                    max_iter=1000)
        model_.fit(df.VECAMS.values.reshape(-1, 1), df[outcome].values)
        # ... wrapped in a sigmoid calibrator around the already fitted
        # model (cv='prefit'), calibrated on the same data.
        model_ = CalibratedClassifierCV(base_estimator=model_,
                                        method='sigmoid',
                                        cv='prefit')
        model_.fit(df.VECAMS.values.reshape(-1, 1), df[outcome].values)
    elif family[i] == 'ordinal':
        # Unregularised ordinal model, recalibrated with MyCalibrator on the
        # same data.
        model_ = LogisticAT(alpha=0)
        model_.fit(df.VECAMS.values.reshape(-1, 1), df[outcome].values)
        model_ = MyCalibrator(model_)
        model_.fit(df.VECAMS.values.reshape(-1, 1), df[outcome].values)
    models[outcome] = model_

# NOTE(review): `model` is not assigned in this fragment (the loop variable
# is `model_`); it must come from elsewhere -- confirm.
#intercept = model.base_estimator.estimator.intercept_[0]
scores = model.base_estimator.estimator.coef_[0].astype(int)
# Every total reachable by summing any subset of the positive integer
# coefficients; the empty subset contributes 0.
unique_scores = set()
scores = scores[scores > 0]
for k in range(0, len(scores) + 1):
    for score_comb in combinations(scores, k):
        unique_scores.add(sum(score_comb))
unique_scores = sorted(unique_scores)
print(f'unique_scores = {unique_scores}')
def _lad_grid_search():
    """Return an unfitted 5-fold grid search over ``C`` for a mord LAD model."""
    from mord import LAD
    lad = LAD(epsilon=0.0,
              tol=0.0001,
              loss='epsilon_insensitive',
              fit_intercept=True,
              intercept_scaling=1.0,
              dual=True,
              verbose=0,
              random_state=None,
              max_iter=10000)
    params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
    return GridSearchCV(lad,
                        param_grid=params,
                        cv=5,
                        scoring='neg_mean_absolute_error',
                        verbose=0)


def train(m, x_train, y_train, x_test, y_test):
    """Train the model named ``m`` and evaluate it on both splits.

    Supported names: 'LAD', 'MCLog', 'LogAT', 'LinearSVC', 'RFC', 'Lasso',
    'RFR', 'RR', 'PLSR', 'RVM', 'DTR' and 'COMB'. 'COMB' stacks a broad LAD
    model, one LAD model per age group and a random forest on top of them.

    Returns
    -------
    (model, best_score, best_params, pred_var, train_var)
        ``model`` is the fitted estimator or grid search; for 'COMB' it is
        the list ``[broad_lad, group_lad, model_2]``. ``best_score`` and
        ``best_params`` come from the grid search (from ``model_2`` for
        'COMB') and are ``0`` for 'RVM' and 'DTR', which are not
        grid-searched. ``pred_var`` / ``train_var`` are whatever the
        module-level ``predict`` helper returns for the test / train split.

    Raises
    ------
    ValueError
        If ``m`` is not a supported model name.
    """
    print('training', m)

    if m == 'LAD':
        model = _lad_grid_search()
        # LAD is trained on targets rounded to integers.
        y_train = y_train.astype(float).round()
        y_train = y_train.astype(int)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LAD grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'MCLog':
        from sklearn.linear_model import LogisticRegression
        mcl = LogisticRegression(multi_class='multinomial',
                                 max_iter=10000,
                                 solver='newton-cg',
                                 fit_intercept=True)
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(mcl,
                             param_grid=params,
                             cv=5,
                             scoring='neg_mean_absolute_error',
                             verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] MCLog grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'LogAT':
        # takes quite some time
        from mord import LogisticAT
        lat = LogisticAT()
        params = {"alpha": np.linspace(0, 1, 5)}
        model = GridSearchCV(lat,
                             param_grid=params,
                             cv=5,
                             scoring='neg_mean_absolute_error',
                             verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LogAT grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'LinearSVC':
        from sklearn.svm import LinearSVC
        svm = LinearSVC()
        params = {"C": [0.001, 0.01, 1, 10, 100, 1000]}
        model = GridSearchCV(svm, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] LinearSVC grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'RFC':
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        params = {"n_estimators": [10, 100, 500, 1000]}
        model = GridSearchCV(rfc, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFC grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'Lasso':
        from sklearn.linear_model import Lasso
        lasso = Lasso()
        params = {"alpha": [10]}
        model = GridSearchCV(lasso, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        # The message used to say "RFR" although this is the Lasso branch.
        print("[INFO] Lasso grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'RFR':
        from sklearn.ensemble import RandomForestRegressor
        # NOTE(review): recent scikit-learn releases renamed criterion 'mse'
        # to 'squared_error' -- confirm against the pinned version.
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)
        print("[INFO] RFR grid search best parameters: {}".format(
            model.best_params_))

    elif m == 'RR':
        from sklearn.linear_model import Ridge
        ridge = Ridge()
        params = {
            'alpha':
            [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        }
        model = GridSearchCV(ridge, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        print("[INFO] Ridge Regression grid search best parameters: {}".format(
            model.best_params_))
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'PLSR':
        from sklearn.cross_decomposition import PLSRegression
        pls_reg = PLSRegression()
        params = {'n_components': list(range(1, 21))}
        model = GridSearchCV(pls_reg, param_grid=params, cv=5, verbose=0)
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        print("[INFO] PLS Regression grid search best parameters: {}".format(
            model.best_params_))
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'RVM':
        from skrvm import RVR
        print('in RVM')
        model = RVR(kernel='linear')
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'DTR':
        from sklearn.tree import DecisionTreeRegressor
        model = DecisionTreeRegressor()
        model.fit(x_train, y_train)
        train_var = predict(m, model, x_train, y_train)
        pred_var = predict(m, model, x_test, y_test)

    elif m == 'COMB':
        from sklearn.ensemble import RandomForestRegressor
        from group_pred import create_age_groups
        print('IN COMB')
        group_lad = dict()
        print('shapes', x_train.shape, y_train.shape)

        # Stage 1a: one LAD model over the whole (rounded) age range.
        broad_lad = _lad_grid_search()
        y_train_r = y_train.astype(float).round()
        y_train_r = y_train_r.astype(int)
        broad_lad.fit(x_train, y_train_r)

        # Stage 1b: one LAD model per age group with more than 5 samples.
        age_group_all = create_age_groups(y_train_r, 10, 5)
        for ages in age_group_all:
            idx_grp = list()
            # for every age in the age group collect the training data by
            # getting the indices
            for item in ages:
                for idx, val in enumerate(y_train_r):
                    if val == item:
                        idx_grp.append(idx)
            print('group info', ages, len(idx_grp))
            if len(idx_grp) > 5:
                key_age_grp = str(np.min(ages)) + '_' + str(np.max(ages))
                x_samples_train = x_train[idx_grp]
                y_samples_train = y_train_r[idx_grp]
                specific_lad = _lad_grid_search()
                specific_lad.fit(x_samples_train, y_samples_train)
                group_lad[key_age_grp] = specific_lad
        print('len_groups', len(group_lad))

        # Stage 2: random forest on the stage-1 predictions, trained against
        # the unrounded targets.
        pred_all = make_predictions(x_train, broad_lad, group_lad)
        rfr = RandomForestRegressor(criterion='mse')
        params = {"n_estimators": [500]}
        model_2 = GridSearchCV(rfr, param_grid=params, cv=5, verbose=0)
        model_2.fit(pred_all, y_train)
        train_var = predict(m, model_2, pred_all, y_train)
        print("[INFO] RFR grid search best parameters: {}".format(
            model_2.best_params_))
        pred_all_test = make_predictions(x_test, broad_lad, group_lad)
        pred_var = predict(m, model_2, pred_all_test, y_test)
        model = [broad_lad, group_lad, model_2]

    else:
        # Previously this printed 'unknown model' and then failed on the
        # unbound train_var at the return statement.
        raise ValueError('unknown model: {!r}'.format(m))

    # `m == 'RVM' or 'DTR'` was always true, so every model returned 0, 0.
    if m in ('RVM', 'DTR'):
        return model, 0, 0, pred_var, train_var
    elif m == 'COMB':
        return (model, model_2.best_score_, model_2.best_params_, pred_var,
                train_var)
    else:
        return model, model.best_score_, model.best_params_, pred_var, train_var
from sklearn.metrics import accuracy_score #loading dataset wvs = pd.read_csv( "C:/Datasets_BA/360DigiTMG/DS_India/360DigiTMG DS India Module wise PPTs/Module 10b Ordinal Logistic Regression/wvs.csv" ) wvs.head() # EDA wvs.describe() wvs.columns #converting into binary lb = LabelEncoder() wvs["poverty"] = lb.fit_transform(wvs["poverty"]) wvs["religion"] = lb.fit_transform(wvs["religion"]) wvs["degree"] = lb.fit_transform(wvs["degree"]) wvs["country"] = lb.fit_transform(wvs["country"]) wvs["gender"] = lb.fit_transform(wvs["gender"]) from mord import LogisticAT model = LogisticAT(alpha=0).fit(wvs.iloc[:, 1:], wvs.iloc[:, 0]) # alpha parameter set to zero to perform no regularisation.fit(x_train,y_train) model.coef_ model.classes_ predict = model.predict(wvs.iloc[:, 1:]) # Train predictions # Accuracy accuracy_score(wvs.iloc[:, 0], predict)
return df.drop(depVar, axis=1).iloc[:, columns] return df.drop(depVar, axis=1) from sklearn.linear_model import LinearRegression, LogisticRegression from mord import LogisticAT, LogisticIT from sklearn import preprocessing # instantiate models model_linear = LinearRegression() model_1vR = LogisticRegression(multi_class='ovr', class_weight='balanced') model_multi = LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight='balanced', max_iter=760) model_ordinal_IT = LogisticIT() model_ordinal = LogisticAT(alpha=0) # alpha parameter set to zero to perform no regularisation models = [model_linear, model_1vR, model_multi, model_ordinal, model_ordinal_IT] models_str = ["Linear Regression", "Logistic Regression (one vs. rest)", "Logistic Regression (multinomial)", "Ordered Logistic Regression AT", "Ordered Logistic Regression IT"] # instantiate preprocessing tools scaler = preprocessing.StandardScaler() def trim_correlated(df_in, threshold, dependent_var): df_corr = df_in.corr(method='pearson', min_periods=1) df_not_correlated = ~(df_corr.mask(np.tril(np.ones([len(df_corr)]*2, dtype=bool))).abs() > threshold).any() un_corr_idx = df_not_correlated.loc[df_not_correlated[df_not_correlated.index] == True].index df_out = df_in[un_corr_idx] print("Uncorrelated independent variables:") print(*df_out.columns, sep=', ') return df_out.join(dependent_var)
def grid_search(evaluation, features, labels, penalty_weights, algorithm,
                num_jobs, **options):
    """
    Run a stratified 5-fold grid search for the requested algorithm.

    Expects the features to be scaled for svm and knn.

    Parameters
    ----------
    evaluation : passed to ``utils.create_scorer`` to build the CV scorer.
    features, labels : training data passed to ``GridSearchCV.fit``.
    penalty_weights : ``class_weight`` for the estimators that accept one.
    algorithm : one of 'kernel-svm', 'linear-svm', 'logistic',
        'sgd-logistic', 'sgd-svm', 'random-forest', 'knn',
        'ridgeclassifier', 'logisticse', 'logisticit', 'logisticat',
        'ordinalridge', 'lad'.
    num_jobs : number of parallel jobs for the grid search.
    **options : accepted for call compatibility; not used.

    Returns
    -------
    The fitted ``GridSearchCV`` object. The process exits through
    ``sys.exit`` when ``algorithm`` is not recognised.
    """
    costs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

    # Set the parameters for grid search and model based on algorithm choice
    if algorithm == 'kernel-svm':
        tuned_parameters = [{
            'kernel': ['rbf'],
            'gamma': [0.1, 0.01, 0.001, 0.0001],
            'decision_function_shape': ['ovo', 'ovr'],
            'C': costs
        }, {
            'kernel': ['sigmoid'],
            'gamma': [0.1, 0.01, 0.001, 0.0001],
            'decision_function_shape': ['ovo', 'ovr'],
            'C': costs
        }, {
            'kernel': ['poly'],
            'degree': [2, 3],
            'decision_function_shape': ['ovo', 'ovr'],
            'C': costs
        }]
        model = svm.SVC(tol=0.05, cache_size=6000,
                        class_weight=penalty_weights)
    elif algorithm == 'linear-svm':
        tuned_parameters = [{
            'loss': ['hinge', 'squared_hinge'],
            'multi_class': ['ovr'],
            'C': costs
        }]
        model = svm.LinearSVC(tol=0.05, max_iter=5000,
                              class_weight=penalty_weights)
    elif algorithm == 'logistic':
        # newton, lbfgs only support L2
        costs_list = (10.0**numpy.arange(-6, 5)).tolist()
        tuned_parameters = [{
            'multi_class': ['ovr'],
            'solver': ['liblinear'],
            'penalty': ['l1', 'l2'],
            'C': costs_list
        }, {
            'multi_class': ['multinomial'],
            'solver': ['lbfgs'],
            'penalty': ['l2'],
            'C': costs_list
        }]
        model = LogisticRegression(tol=0.005, max_iter=5000,
                                   class_weight=penalty_weights)
    elif algorithm in ('sgd-logistic', 'sgd-svm'):
        alphas_list = (10.0**numpy.arange(-8, 1)).tolist()
        tuned_parameters = [{
            'penalty': ['l1', 'l2'],
            'alpha': alphas_list
        }, {
            'penalty': ['elasticnet'],
            'alpha': alphas_list,
            'l1_ratio': [0.005, 0.01, 0.05, 0.1, 0.2, 0.4, 0.6]
        }]
        # loss should be log for the logistic classifier and hinge for the
        # linear svm classifier. We don't set n_jobs since grid search will
        # use the cores.
        loss = 'log' if algorithm == 'sgd-logistic' else 'hinge'
        n_iter = numpy.ceil(5 * (10**6) / features.shape[0])
        model = SGDClassifier(loss=loss, class_weight=penalty_weights,
                              n_iter=n_iter, n_jobs=1)
    elif algorithm == 'random-forest':
        # One grid per min_samples_split, each with its own leaf sizes.
        split_and_leaves = [(2, [1]), (5, [1, 2]), (10, [2, 5]),
                            (20, [5, 10]), (50, [5, 15, 25])]
        tuned_parameters = [{
            'n_estimators': [100],
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 0.4, 0.8],
            'min_samples_split': [split],
            'min_samples_leaf': leaves
        } for split, leaves in split_and_leaves]
        model = RandomForestClassifier(class_weight=penalty_weights)
    elif algorithm == 'knn':
        tuned_parameters = [{
            'n_neighbors':
            [1, 2, 3, 4, 5, 10, 15, 20, 30, 50, 70, 100, 150, 200],
            'metric': ['euclidean', 'manhattan', 'chebyshev'],
            'algorithm': ['ball_tree', 'kd_tree'],
            'weights': ['uniform', 'distance']
        }]
        model = KNeighborsClassifier()
    elif algorithm == 'ridgeclassifier':
        alphas_list = (10.0**numpy.arange(-5, 5)).tolist()
        tuned_parameters = [{
            'alpha': alphas_list,
            'normalize': [True, False]
        }]
        model = RidgeClassifier(max_iter=10000, class_weight=penalty_weights)
    elif algorithm in ('logisticse', 'logisticit', 'logisticat',
                       'ordinalridge'):
        # The four mord models share the same alpha grid and max_iter.
        alphas_list = (10.0**numpy.arange(-5, 5)).tolist()
        tuned_parameters = [{'alpha': alphas_list}]
        if algorithm == 'logisticse':
            model = LogisticSE(max_iter=10000)
        elif algorithm == 'logisticit':
            model = LogisticIT(max_iter=10000)
        elif algorithm == 'logisticat':
            model = LogisticAT(max_iter=10000)
        else:
            model = OrdinalRidge(max_iter=10000)
    elif algorithm == 'lad':
        tuned_parameters = [{'loss': ['l1', 'l2'], 'C': costs}]
        model = LAD(max_iter=3000)
    else:
        sys.exit('Invalid algorithm: ' + algorithm + ' provided')

    scorer = utils.create_scorer(evaluation)
    skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
    # Don't pre dispatch all jobs at once, only dispatch ones you are running
    # so memory usage does not blow up
    clf = GridSearchCV(estimator=model,
                       param_grid=tuned_parameters,
                       n_jobs=num_jobs,
                       pre_dispatch="n_jobs",
                       cv=skf,
                       scoring=scorer)
    clf.fit(features, labels)
    # Parenthesised single-argument print: same output on Python 2 and valid
    # syntax on Python 3.
    print("Best Grid Search Parameters are: " + str(clf.best_params_))
    print("Best Grid Search CV Score: " + str(clf.best_score_))
    return clf
def fit(self, X, y):
    """Fit the recalibration mapper on this object's own predictions for X.

    The output of ``self.predict(X)`` is reshaped into a single feature
    column and an unregularised ``LogisticAT`` (alpha=0) is fitted on it
    against ``y``. Returns ``self``.
    """
    scores = self.predict(X)
    mapper = LogisticAT(alpha=0)
    score_column = scores.reshape(-1, 1)
    self.recalibration_mapper = mapper.fit(score_column, y)
    return self
# Get mapping from labels to classes: print which column index each encoded
# class corresponds to. A plain loop replaces the list comprehension that
# was used only for its print side effect.
for num, item in enumerate(encoder.classes_):
    print('{} is Column: {}'.format(item, num))

# Drop the first column of both frames (whichever column sits at position 0).
train_data = train_data.drop(train_data.columns[0], axis=1)
test_data = test_data.drop(test_data.columns[0], axis=1)

#CHECK SHAPE
print("shape: ", test_data.shape, train_data.shape)
#train_y = np.reshape(train_y.values,(-1,4))

#TRAIN MODELS
#DNN (disabled)
# model = tflearn.DNN(network, tensorboard_verbose=0)
#model.fit(train_data.values, train_y, show_metric = True, batch_size=10)

# Ordinal classifiers from mord: all-threshold logistic, immediate-threshold
# logistic (alpha=0.1) and least absolute deviation.
oc1 = LogisticAT()
oc2 = LogisticIT(alpha=0.1)
oc3 = LAD()
#oc = GradientBoostingClassifier(max_depth=3,n_estimators=350, learning_rate = 0.05,subsample=0.9, max_leaf_nodes=30000)
oc1.fit(train_data.values, train_y_oc)
oc2.fit(train_data.values, train_y_oc)
oc3.fit(train_data.values, train_y_oc)

#PREDICT
predictions_oc1 = oc1.predict(test_data.values)
predictions_oc2 = oc2.predict(test_data.values)
predictions_oc3 = oc3.predict(test_data.values)
#predictions_dnn = model.predict(test_data.values)
#predictions_dnn = [item for sublist in predictions_dnn for item in sublist]
#avg
#predictions = np.mean([predictions_oc, predictions_dnn], axis = 0)