from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV


def setTrainDataAndMakeModel(X_train, Y_train, X_test):
    clf = MultinomialNB(alpha=125535, class_prior=None, fit_prior=True)
    calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
    calibrated_clf.fit(X_train, Y_train)
    ypreds = calibrated_clf.predict_proba(X_test)
    return ypreds
def move_bias(self, data_matrix, estimator=None, nu=.5, cv=2):
    '''
    Shift the estimator's bias until a fraction nu of data_matrix falls in the
    negative class, then use scikit-learn's CalibratedClassifierCV to
    calibrate the decision scores around the new boundary.
    '''
    # move bias: find the score below which a fraction nu of the samples fall
    scores = [estimator.decision_function(sparse_vector)[0]
              for sparse_vector in data_matrix]
    scores_sorted = sorted(scores)
    pivot = scores_sorted[int(len(scores_sorted) * nu)]
    estimator.intercept_ -= pivot
    # calibrate (note: the nu argument is used here; the original read the
    # unused attribute self.nu instead)
    if self.move_bias_recalibrate:
        data_y = numpy.asarray([1 if score >= pivot else -1
                                for score in scores])
        self.testimator = SGDClassifier(loss='log')
        self.testimator.fit(data_matrix, data_y)
        estimator = CalibratedClassifierCV(self.testimator, cv=cv,
                                           method='sigmoid')
        estimator.fit(data_matrix, data_y)
    return estimator
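# A minimal, self-contained sketch of the same bias-shift-then-recalibrate
# idea on a plain SGDClassifier. This is an illustration, not the original
# project's code: the synthetic dataset and the loss names are assumptions
# (loss="log_loss" needs sklearn >= 1.1; older versions spell it loss="log").
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

X, y = make_classification(n_samples=400, random_state=0)
est = SGDClassifier(loss="hinge", random_state=0).fit(X, y)

nu = 0.5
scores = est.decision_function(X)
pivot = np.sort(scores)[int(len(scores) * nu)]
est.intercept_ -= pivot  # shift the bias so ~nu of the samples score negative

# Platt-style recalibration around the shifted boundary
labels = (scores >= pivot).astype(int)
calibrated = CalibratedClassifierCV(
    SGDClassifier(loss="log_loss", random_state=0), cv=2, method="sigmoid"
).fit(X, labels)
print(calibrated.predict_proba(X[:3]))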
def simple_model(data, test):
    targets = data.target
    X, tX, y, ty = train_test_split(data.drop("target", axis=1), targets,
                                    test_size=0.2, random_state=2016)
    predictions = []
    print("\n\nTraining")
    # Calibrated random forest (the original comment said "Sklearn GBM")
    clf = RandomForestClassifier(n_estimators=2500, max_depth=2,
                                 random_state=2015)
    cal = CalibratedClassifierCV(clf, cv=5, method="isotonic")
    cal.fit(X, y)
    pred = cal.predict_proba(tX)[:, 1]
    print("\n\tValidation for Calibrated RFC")
    print("\t", log_loss(ty, pred))
    print("\t", roc_auc_score(ty, pred))
    predictions.append(cal.predict_proba(test)[:, 1])
    predictions = sum(predictions) / len(predictions)
    return predictions
def calibrate_probs(y_val, prob_val, prob_test, n_folds=2, method='isotonic',
                    random_state=5968):
    """
    Calling from R:

    suppressMessages(library("rPython"))  # Load rPython
    python.load("path/to/util_rpython.py")
    data.pred.calib <- python.call('calibrate_probs',
                                   y_val=y_val,          # Actual values from validation
                                   prob_val=pred_val,    # Predicted values from validation
                                   prob_test=pred_test)  # Predicted values from test

    # data.pred.calib will be a list, so to get the calibrated predictions
    # for each set we do:
    calib_pred_val = data.pred.calib$val
    calib_pred_test = data.pred.calib$test
    """
    # Note: this uses the pre-0.18 sklearn.cross_validation KFold signature.
    y_val = np.asarray(y_val, dtype=float)
    prob_val = np.asarray(prob_val, dtype=float).reshape((-1, 1))
    prob_test = np.asarray(prob_test, dtype=float).reshape((-1, 1))
    prob_clb_val = np.zeros(len(y_val))
    prob_clb_test = np.zeros(len(prob_test))
    kf_val_full = KFold(len(y_val), n_folds=n_folds, random_state=random_state)
    for ix_train, ix_test in kf_val_full:
        kf_val_inner = KFold(len(ix_train), n_folds=n_folds,
                             random_state=random_state)
        clf = CalibratedClassifierCV(method=method, cv=kf_val_inner)
        clf.fit(prob_val[ix_train], y_val[ix_train])
        prob_clb_val[ix_test] = clf.predict_proba(prob_val[ix_test])[:, 1]
        prob_clb_test += clf.predict_proba(prob_test)[:, 1] / n_folds
    return {'val': list(prob_clb_val), 'test': list(prob_clb_test)}
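# The same out-of-fold calibration procedure with the modern
# sklearn.model_selection API, since the snippet above relies on the removed
# sklearn.cross_validation KFold(n, n_folds=...) signature. This is a sketch
# on synthetic data; the LogisticRegression base estimator is an assumption
# standing in for the snippet's implicit default (LinearSVC).
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

rng = np.random.RandomState(0)
y_val = rng.randint(0, 2, 200)
prob_val = np.clip(0.3 + 0.4 * y_val + 0.3 * rng.rand(200), 0, 1).reshape(-1, 1)
prob_test = rng.rand(50).reshape(-1, 1)

prob_clb_val = np.zeros(len(y_val))
prob_clb_test = np.zeros(len(prob_test))
kf = KFold(n_splits=2, shuffle=True, random_state=0)
for ix_train, ix_test in kf.split(prob_val):
    clf = CalibratedClassifierCV(LogisticRegression(), method="isotonic", cv=2)
    clf.fit(prob_val[ix_train], y_val[ix_train])
    prob_clb_val[ix_test] = clf.predict_proba(prob_val[ix_test])[:, 1]
    prob_clb_test += clf.predict_proba(prob_test)[:, 1] / kf.get_n_splits()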
def setTrainTestDataAndCheckModel(X_train, Y_train, X_test, Y_test):
    model = RandomForestClassifier(n_estimators=125)
    model.fit(X_train, Y_train)
    '''
    clf = GridSearchCV(model, {'n_estimators': [100, 125, 150]}, verbose=1)
    clf.fit(X_train, Y_train)
    print(clf.best_score_)
    print(clf.best_params_)

    output = model.predict(X_test)
    print("-------------------RFC-----------------------")
    #print(accuracy_score(Y_test, output))
    #print("%.2f" % log_loss(Y_test, output, eps=1e-15, normalize=True))

    ypreds = model.predict_proba(X_test)
    print("%.2f" % log_loss(Y_test, ypreds, eps=1e-15, normalize=True))

    clfbag = BaggingClassifier(model, n_estimators=5)
    clfbag.fit(X_train, Y_train)
    ypreds = clfbag.predict(X_test)
    #print(accuracy_score(Y_test, ypreds))
    ypreds = clfbag.predict_proba(X_test)
    print("%.2f" % log_loss(Y_test, ypreds, eps=1e-15, normalize=True))
    '''
    calibrated_clf = CalibratedClassifierCV(model, method='isotonic', cv=5)
    calibrated_clf.fit(X_train, Y_train)
    #ypreds = calibrated_clf.predict(X_test)
    #print(accuracy_score(Y_test, ypreds))
    ypreds = calibrated_clf.predict_proba(X_test)
    print("%.2f" % log_loss(Y_test, ypreds, eps=1e-15, normalize=True))
def test_sample_weight_warning():
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=len(y))
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test = X[n_samples:]

    for method in ['sigmoid', 'isotonic']:
        base_estimator = LinearSVC(random_state=42)
        calibrated_clf = CalibratedClassifierCV(base_estimator, method=method)
        # LinearSVC does not currently support sample weights but they
        # can still be used for the calibration step (with a warning)
        msg = "LinearSVC does not support sample_weight."
        assert_warns_message(
            UserWarning, msg,
            calibrated_clf.fit, X_train, y_train, sample_weight=sw_train)
        probs_with_sw = calibrated_clf.predict_proba(X_test)

        # As the weights are used for the calibration, they should still
        # yield different predictions
        calibrated_clf.fit(X_train, y_train)
        probs_without_sw = calibrated_clf.predict_proba(X_test)

        diff = np.linalg.norm(probs_with_sw - probs_without_sw)
        assert_greater(diff, 0.1)
def svm_boost_calib_scale(x, y, x_test, seed):
    # normalize x and x_test together
    x_rows = x.shape[0]
    X = preprocessing.scale(np.vstack((x, x_test)))
    x = X[:x_rows, :]
    x_test = X[x_rows:, :]
    print(x.shape)
    print(x_test.shape)

    model = SVC(probability=True, class_weight='auto', random_state=seed,
                C=100, gamma=0.0)
    boosted = AdaBoostClassifier(model, random_state=seed)

    # average 10-fold CV AUC
    cv = StratifiedKFold(y, n_folds=10, random_state=seed)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    calib = CalibratedClassifierCV(boosted, cv=10, method='isotonic')
    for i, (train, test) in enumerate(cv):
        probas_ = calib.fit(x[train], y[train]).predict_proba(x[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Training set 10CV AUC:\n{}'.format(mean_auc))

    # average the per-fold calibrated classifiers' test probabilities
    probs = np.average([cls.predict_proba(x_test)
                        for cls in calib.calibrated_classifiers_], axis=0)
    print(probs.shape)
    return probs
from sklearn.calibration import CalibratedClassifierCV


def calibrate(X_val, y_val, estimator):
    clf = CalibratedClassifierCV(base_estimator=estimator, method='isotonic',
                                 cv='prefit')
    clf.fit(X_val, y_val)
    return clf
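# A self-contained sketch of how a 'prefit' helper like calibrate() above is
# typically driven: fit on one split, calibrate on a disjoint validation
# split. The dataset and forest settings here are illustrative assumptions,
# not from the original project. (In sklearn >= 1.6, cv='prefit' is replaced
# by wrapping the estimator in FrozenEstimator.)
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV

X, y = make_classification(n_samples=600, random_state=0)
X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.3,
                                              stratify=y, random_state=0)

est = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_fit, y_fit)
clf = CalibratedClassifierCV(est, method='isotonic', cv='prefit').fit(X_val, y_val)
probs = clf.predict_proba(X_val)[:, 1]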
def predict(self, X, thres=0.5, return_proba=True):
    """
    Predict class for X.
    The predicted class of an input sample is a vote by the trees in
    the forest, weighted by their probability estimates. That is,
    the predicted class is the one with highest mean probability
    estimate across the trees.
    """
    if self._model == 'svc_lin':
        from sklearn.base import clone
        from sklearn.calibration import CalibratedClassifierCV
        # use set_params/get_params (the sklearn estimator API; the original
        # called the nonexistent set_param/get_param)
        clf = CalibratedClassifierCV(clone(self._estimator).set_params(
            **self._estimator.get_params()))
        train_y = self._Xtrain[[self._rate_column]].values.ravel().tolist()
        self._estimator = clf.fit(self._Xtrain, train_y)

    proba = np.array(self._estimator.predict_proba(X))

    if proba.shape[1] > 2:
        pred = (proba > thres).astype(int)
    else:
        pred = (proba[:, 1] > thres).astype(int)

    if return_proba:
        return proba, pred
    return pred
def internal_processing(self, X, y, X_test):
    """
    Calibrate each of the five feature blocks separately, ensemble them,
    then average with an isotonic-calibrated ensemble.
    """
    Xs = np.hsplit(X, 5)
    Xts = np.hsplit(X_test, 5)

    Xts_cal = []
    for i in range(len(Xs)):
        Xts_cal.append(calibrate(Xs[i], y, Xts[i]))
    XX_test = np.hstack(Xts_cal)

    ec = EC(n_preds=5)
    ec.fit(X, y)
    y_ens = ec.predict_proba(XX_test)

    # validation
    yv = ec.predict_proba(X)
    print('Weights: %s' % (ec.w))
    print('Validation log-loss: %s' % (logloss_mc(y, yv)))

    cc = CalibratedClassifierCV(base_estimator=EC(n_preds=5),
                                method='isotonic', cv=10)
    cc.fit(X, y)
    y_cal = cc.predict_proba(XX_test)

    y_pred = (y_ens + y_cal) / 2.
    return y_pred
def svm_calib(x, y, x_test, seed):
    model = SVC(probability=True, random_state=seed)

    # average 10-fold CV AUC
    cv = StratifiedKFold(y, n_folds=10, random_state=seed)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    calib = CalibratedClassifierCV(model, cv=10, method='isotonic')
    for i, (train, test) in enumerate(cv):
        probas_ = calib.fit(x[train], y[train]).predict_proba(x[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Training set 10CV AUC:\n{}'.format(mean_auc))

    # average the per-fold calibrated classifiers' test probabilities
    probs = np.average([cls.predict_proba(x_test)
                        for cls in calib.calibrated_classifiers_], axis=0)
    print(probs.shape)
    return probs
def get_score(self, params):
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])
    params['min_samples_split'] = int(params['min_samples_split'])
    params['min_samples_leaf'] = int(params['min_samples_leaf'])
    print('Training with params:')
    print(params)

    # cross validation here
    scores = []
    for train_ix, test_ix in makeKFold(5, self.y, 1):
        X_train, y_train = self.X[train_ix, :], self.y[train_ix]
        X_test, y_test = self.X[test_ix, :], self.y[test_ix]
        weight = y_train.shape[0] / (2 * np.bincount(y_train))
        sample_weight = np.array([weight[i] for i in y_train])

        clf = RandomForestClassifier(**params)
        cclf = CalibratedClassifierCV(base_estimator=clf,
                                      method='isotonic',
                                      cv=makeKFold(3, y_train, 1))
        cclf.fit(X_train, y_train, sample_weight)
        pred = cclf.predict(X_test)
        scores.append(f1_score(y_true=y_test, y_pred=pred))

    print(scores)
    score = np.mean(scores)
    print(score)
    return {'loss': -score, 'status': STATUS_OK}
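# A sketch of how an objective returning {'loss': ..., 'status': STATUS_OK}
# such as get_score() is usually driven with hyperopt. The search space and
# the toy objective below are illustrative assumptions; note that
# hp.quniform yields floats, which is why get_score() casts to int.
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

space = {
    'n_estimators': hp.quniform('n_estimators', 50, 500, 50),
    'max_depth': hp.quniform('max_depth', 2, 12, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 5, 1),
}

def toy_objective(params):
    # stands in for get_score(); replace with the real objective
    return {'loss': -sum(params.values()), 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=toy_objective, space=space, algo=tpe.suggest,
            max_evals=10, trials=trials)
print(best)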
def train_model_rfc_calibrated(features, labels):
    # First, set aside some of the training set for calibration.
    # Use stratified shuffle split so that class ratios are maintained
    # after the split.
    splitter = StratifiedShuffleSplit(labels, n_iter=1, train_size=0.7,
                                      random_state=30)

    # Length is 1 in this case since we have a single fold for splitting
    print(len(splitter))

    for train_idx, calib_idx in splitter:
        features_train, features_calib = features[train_idx], features[calib_idx]
        labels_train, labels_calib = labels[train_idx], labels[calib_idx]

    print("features_train shape: ", features_train.shape)
    print("features_calib shape: ", features_calib.shape)
    print("labels_train shape: ", labels_train.shape)
    print("labels_calib shape: ", labels_calib.shape)

    print("Performing Grid Search ...")
    # params_dict = {'criterion': ['entropy'], 'n_estimators': [30, 35, 40, 45],
    #                'max_depth': [5, 6], 'min_samples_leaf': [1, 2, 5],
    #                'min_samples_split': [2, 5, 10]}
    params_dict = {'criterion': ['entropy'],
                   'n_estimators': [60, 70, 80, 90],
                   'max_depth': [5, 6],
                   'min_samples_leaf': [1, 2, 5],
                   'min_samples_split': [2, 5, 10],
                   'max_features': [6, 7, 8]}
    clf = GridSearchCV(rfc(random_state=30, n_jobs=4), params_dict,
                       scoring='roc_auc', cv=5)
    clf.fit(features_train, labels_train)
    print("Best estimator: ", clf.best_estimator_)
    print("Best score: %.4f" % (clf.best_score_))

    # Perform calibration.
    # Use 'sigmoid' because sklearn cautions against using 'isotonic' with
    # fewer than ~1000 calibration samples, as it can overfit.
    print("Performing Calibration now ...")
    sigmoid = CalibratedClassifierCV(clf, cv='prefit', method='sigmoid')
    sigmoid.fit(features_calib, labels_calib)
    return sigmoid
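# A sketch of why the function above prefers method='sigmoid': on a small
# calibration split, isotonic regression can overfit. Synthetic data and
# illustrative settings throughout; the Brier scores will vary with the seed.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

X, y = make_classification(n_samples=800, random_state=0)
X_tr, X_rest, y_tr, y_rest = train_test_split(X, y, train_size=0.7,
                                              stratify=y, random_state=30)
X_cal, X_te, y_cal, y_te = train_test_split(X_rest, y_rest, train_size=0.5,
                                            stratify=y_rest, random_state=30)

base = RandomForestClassifier(n_estimators=80, random_state=30).fit(X_tr, y_tr)
for method in ('sigmoid', 'isotonic'):
    cal = CalibratedClassifierCV(base, cv='prefit', method=method).fit(X_cal, y_cal)
    print(method, brier_score_loss(y_te, cal.predict_proba(X_te)[:, 1]))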
def svc_test2():
    """
    Submission: E_val: E_in: E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.calibration import CalibratedClassifierCV

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = SVC(kernel='linear', class_weight='auto', cache_size=10240)
    svc.fit(X_scaled, y)

    isotonic = CalibratedClassifierCV(svc, cv=StratifiedKFold(y, 5),
                                      method='isotonic')
    isotonic.fit(X_scaled, y)
    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', Util.auc_score(isotonic, X_scaled, y))
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = RandomForestClassifier(bootstrap=False, class_weight=None,
                                 criterion='entropy', max_depth=29008,
                                 max_features=36, max_leaf_nodes=None,
                                 min_samples_leaf=5, min_samples_split=3,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=4494, n_jobs=8, oob_score=False,
                                 random_state=979271, verbose=0,
                                 warm_start=False)
    clf.fit(train_x, train_y)

    ccv = CalibratedClassifierCV(base_estimator=clf, method="sigmoid",
                                 cv="prefit")
    ccv.fit(valid_x, valid_y)

    valid_predictions = ccv.predict_proba(valid_x)
    test_predictions = ccv.predict_proba(test_x)

    loss = test(valid_y, valid_predictions, True)
    if loss < 0.52:
        data.saveData(valid_predictions,
                      "../valid_results/valid_" + str(model_id) + ".csv")
        data.saveData(test_predictions,
                      "../results/results_" + str(model_id) + ".csv")
def main():
    X, Y, encoder, scale = load_train_data('train.csv')
    estimators = 500
    X_train, X_valid, Y_train, Y_valid = train_test_split(
        X, Y, test_size=0.2, random_state=0)
    X_train_real, X_test_real, Y_train_real, Y_test_real = train_test_split(
        X_train, Y_train, test_size=0.2, random_state=42)
    log.info('Loaded training file')
    X_test, _ = load_csv_file('test.csv', cut_end=False)
    log.info('Loaded test file')

    # Classifier setup
    tree_clf = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1,
                                    random_state=42, max_depth=55,
                                    min_samples_split=1)
    clf = make_pipeline(TfidfTransformer(), DenseTransformer(), tree_clf)

    log.info('Fitting ExtraTrees pipeline')
    clf.fit(X_train_real, Y_train_real)
    clf_probs = clf.predict_proba(X_test_real)
    score = log_loss(Y_test_real, clf_probs)
    log.info('Log Loss score un-trained = %f' % score)

    # Calibrate the classifier using the held-out X_valid, Y_valid as
    # ground truth
    sig_clf = CalibratedClassifierCV(clf, method="isotonic", cv="prefit")
    log.info('Fitting CalibratedClassifierCV')
    sig_clf.fit(X_valid, Y_valid)
    sig_clf_probs = sig_clf.predict_proba(X_test_real)
    sig_score = log_loss(Y_test_real, sig_clf_probs)
    log.info('Log loss score trained = %f' % sig_score)

    # Now predict the test data with the calibrated classifier
    sig_submission_probs = sig_clf.predict_proba(X_test)

    write_out_submission(sig_submission_probs, 'submission.csv')
def prepare_model(self, obj_fn=None, num_steps=None, model_params=None,
                  batch_size: int = None):
    if model_params is None:
        # guard mirrors the RandomForest variant of this method; n_neighbors=5
        # is KNeighborsClassifier's default
        model_params = dict(n_neighbors=5)
    model = CalibratedClassifierCV(KNeighborsClassifier(**model_params),
                                   method="sigmoid")
    model_clf = model.fit(
        self.ds[self.data_groups["data_train_group"]].to_ndarray(),
        self.ds[self.data_groups["target_train_group"]].to_ndarray())
    cal_model = CalibratedClassifierCV(model_clf, method="sigmoid", cv="prefit")
    cal_model.fit(
        self.ds[self.data_groups["data_validation_group"]].to_ndarray(),
        self.ds[self.data_groups["target_validation_group"]].to_ndarray())
    return self.ml_model(cal_model)
def train_test(self, X, y, X_test):
    """
    Two-stage stacking: train xgboost and a calibrated RF on the first half,
    append their predictions as features, then train xgboost on the second
    half.
    """
    sss = StratifiedShuffleSplit(y, 1, test_size=0.5)
    for train_id, valid_id in sss:
        X0, X1 = X[train_id], X[valid_id]
        y0, y1 = y[train_id], y[valid_id]

    # First half
    w0 = np.zeros(len(y0))
    for i in range(len(w0)):
        w0[i] = self.w[int(y0[i])]
    xg0_train = DMatrix(X0, label=y0, weight=w0)
    xg0_test = DMatrix(X1, label=y1)
    xgt_test = DMatrix(X_test)
    bst0 = my_train_xgboost(self.param, xg0_train, self.num_round)
    y0_pred = bst0.predict(xg0_test).reshape(X1.shape[0], 9)
    yt_pred = bst0.predict(xgt_test).reshape(X_test.shape[0], 9)

    # Calibrated RF
    rf = RandomForestClassifier(n_estimators=600, criterion='gini',
                                class_weight='auto', max_features='auto')
    cal = CalibratedClassifierCV(rf, method='isotonic', cv=3)
    cal.fit(X0, y0)
    y0_cal = cal.predict_proba(X1)
    yt_cal = cal.predict_proba(X_test)

    # Second half
    ss = StandardScaler()
    y0_pred = ss.fit_transform(y0_pred)
    yt_pred = ss.fit_transform(yt_pred)
    y0_cal = ss.fit_transform(y0_cal)
    yt_cal = ss.fit_transform(yt_cal)
    X1 = np.hstack((X1, y0_pred, y0_cal))
    X_test = np.hstack((X_test, yt_pred, yt_cal))

    w1 = np.zeros(len(y1))
    self.num_round = 450
    for i in range(len(w1)):
        w1[i] = self.w[int(y1[i])]
    xg1_train = DMatrix(X1, label=y1, weight=w1)
    xg_test = DMatrix(X_test)
    bst1 = my_train_xgboost(self.param, xg1_train, self.num_round)
    y_pred = bst1.predict(xg_test).reshape(X_test.shape[0], 9)
    return y_pred
def get_model(params, X, y):
    clf = RandomForestClassifier(**params)
    cclf = CalibratedClassifierCV(base_estimator=clf,
                                  method='isotonic',
                                  cv=makeKFold(3, y, 1))
    weight = y.shape[0] / (2 * np.bincount(y))
    sample_weight = np.array([weight[i] for i in y])
    cclf.fit(X, y, sample_weight)
    return cclf
def train_validate(self, X_train, y_train, X_valid, y_valid):
    """
    Validation twin of train_test: the same two-stage stacking, but with
    xgboost watchlists so the boosting rounds can be monitored.
    """
    sss = StratifiedShuffleSplit(y_train, 1, test_size=0.5)
    for train_id, valid_id in sss:
        X0_train, X1_train = X_train[train_id], X_train[valid_id]
        y0_train, y1_train = y_train[train_id], y_train[valid_id]

    # First half
    w0_train = np.zeros(len(y0_train))
    for i in range(len(w0_train)):
        w0_train[i] = self.w[int(y0_train[i])]
    xg0_train = DMatrix(X0_train, label=y0_train, weight=w0_train)
    xg0_valid = DMatrix(X1_train, label=y1_train)
    xgv_valid = DMatrix(X_valid, label=y_valid)
    watchlist = [(xg0_train, 'train'), (xg0_valid, 'validation0')]
    bst0 = my_train_xgboost(self.param, xg0_train, self.num_round, watchlist)
    y0_pred = bst0.predict(xg0_valid).reshape(X1_train.shape[0], 9)
    yv_pred = bst0.predict(xgv_valid).reshape(X_valid.shape[0], 9)

    # Calibrated RF
    rf = RandomForestClassifier(n_estimators=600, criterion='gini',
                                class_weight='auto', max_features='auto')
    cal = CalibratedClassifierCV(rf, method='isotonic', cv=3)
    cal.fit(X0_train, y0_train)
    y0_cal = cal.predict_proba(X1_train)
    yv_cal = cal.predict_proba(X_valid)

    # Second half
    ss = StandardScaler()
    y0_pred = ss.fit_transform(y0_pred)
    yv_pred = ss.fit_transform(yv_pred)
    y0_cal = ss.fit_transform(y0_cal)
    yv_cal = ss.fit_transform(yv_cal)
    X1_train = np.hstack((X1_train, y0_pred, y0_cal))
    X_valid = np.hstack((X_valid, yv_pred, yv_cal))

    w1_train = np.zeros(len(y1_train))
    self.num_round = 450
    for i in range(len(w1_train)):
        w1_train[i] = self.w[int(y1_train[i])]
    xg1_train = DMatrix(X1_train, label=y1_train, weight=w1_train)
    xg_valid = DMatrix(X_valid, label=y_valid)
    watchlist = [(xg1_train, 'train'), (xg_valid, 'validation')]
    bst1 = my_train_xgboost(self.param, xg1_train, self.num_round, watchlist)
    y_pred = bst1.predict(xg_valid).reshape(X_valid.shape[0], 9)
    return y_pred
def prepare_model(self, obj_fn=None, num_steps=None, model_params=None,
                  batch_size: int = None):
    if model_params is None:
        model_params = dict(n_estimators=25, min_samples_split=2)
    model = CalibratedClassifierCV(RandomForestClassifier(**model_params),
                                   method="sigmoid")
    model_clf = model.fit(
        self.ds[self.data_groups["data_train_group"]].to_ndarray(),
        self.ds[self.data_groups["target_train_group"]].to_ndarray())
    cal_model = CalibratedClassifierCV(model_clf, method="sigmoid", cv="prefit")
    cal_model.fit(
        self.ds[self.data_groups["data_validation_group"]].to_ndarray(),
        self.ds[self.data_groups["target_validation_group"]].to_ndarray())
    return self.ml_model(cal_model)
def test_calibration_prob_sum():
    # Test that sum of probabilities is 1. A non-regression test for
    # issue #7796
    num_classes = 2
    X, y = make_classification(n_samples=10, n_features=5,
                               n_classes=num_classes)
    clf = LinearSVC(C=1.0)
    clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
    clf_prob.fit(X, y)

    probs = clf_prob.predict_proba(X)
    assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))
def test_calibration_nan_imputer():
    """Test that calibration can accept nan"""
    X, y = make_classification(n_samples=10, n_features=2,
                               n_informative=2, n_redundant=0,
                               random_state=42)
    X[0, 0] = np.nan
    clf = Pipeline(
        [('imputer', SimpleImputer()),
         ('rf', RandomForestClassifier(n_estimators=1))])
    clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic')
    clf_c.fit(X, y)
    clf_c.predict(X)
def trainrf(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    clf = RandomForestClassifier(
        n_estimators=random.randint(50, 5000),
        criterion='gini',
        max_depth=random.randint(10, 1000),
        min_samples_split=random.randint(2, 50),
        min_samples_leaf=random.randint(1, 10),
        min_weight_fraction_leaf=random.uniform(0.0, 0.5),
        max_features=random.uniform(0.1, 1.0),
        max_leaf_nodes=random.randint(2, 10),  # must be > 1, so start at 2
        bootstrap=False,
        oob_score=False,
        n_jobs=30,
        random_state=random_state,
        verbose=0,
        warm_start=True,
        class_weight=None)
    clf.fit(train_x, train_y)

    valid_predictions1 = clf.predict_proba(valid_x)
    test_predictions1 = clf.predict_proba(test_x)
    t1 = test(valid_y, valid_predictions1)

    ccv = CalibratedClassifierCV(base_estimator=clf, method="sigmoid",
                                 cv='prefit')
    ccv.fit(valid_x, valid_y)
    valid_predictions2 = ccv.predict_proba(valid_x)
    test_predictions2 = ccv.predict_proba(test_x)
    t2 = test(valid_y, valid_predictions2)

    # keep whichever of the raw and calibrated models scores better
    if t2 < t1:
        valid_predictions = valid_predictions2
        test_predictions = test_predictions2
        t = t2
    else:
        valid_predictions = valid_predictions1
        test_predictions = test_predictions1
        t = t1

    if t < 0.450:
        data.saveData(valid_predictions,
                      "../valid_results/valid_" + str(model_id) + ".csv")
        data.saveData(test_predictions,
                      "../results/results_" + str(model_id) + ".csv")
def hold_out_evaluation(classifier, x, y, test_size=0.2, calibrate=False):
    x_train, y_train, x_valid, y_valid = stratified_split(x, y, test_size)
    # Train
    if calibrate:
        # Train and calibrate in one pass via internal cross-validation
        calibrated_classifier = CalibratedClassifierCV(classifier,
                                                       method='isotonic',
                                                       cv=get_cv(y_train))
        fitted_classifier = calibrated_classifier.fit(x_train, y_train)
    else:
        fitted_classifier = classifier.fit(x_train, y_train)
    # Evaluate
    score = log_loss(y_valid, fitted_classifier.predict_proba(x_valid))
    return score
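# A self-contained stand-in for the project helpers stratified_split() and
# get_cv() above, showing the same raw-vs-calibrated log-loss comparison with
# stock sklearn utilities. The dataset and the GaussianNB base model are
# illustrative assumptions.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import log_loss

X, y = make_classification(n_samples=500, random_state=1)
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                      stratify=y,
                                                      random_state=1)

raw = GaussianNB().fit(x_train, y_train)
cal = CalibratedClassifierCV(GaussianNB(), method='isotonic', cv=3).fit(x_train, y_train)
print('raw:', log_loss(y_valid, raw.predict_proba(x_valid)))
print('calibrated:', log_loss(y_valid, cal.predict_proba(x_valid)))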
def get_model(params, X, y_array, y_ix, reps):
    y = y_array[:, y_ix]
    params['bootstrap'] = False
    params['oob_score'] = False
    params['n_jobs'] = -1
    clf = RandomForestClassifier(**params)
    cclf = CalibratedClassifierCV(base_estimator=clf,
                                  method='isotonic',
                                  cv=makeKFold(3, y, reps))
    weight = y.shape[0] / (2 * np.bincount(y))
    sample_weight = np.array([weight[i] for i in y])
    cclf.fit(X, y, sample_weight)
    return cclf
def setTrainDataAndMakeModel(X_train, Y_train, X_test):
    model = RandomForestClassifier(bootstrap=False, class_weight=None,
                                   criterion='gini', max_depth=27,
                                   max_features='log2', max_leaf_nodes=None,
                                   min_samples_leaf=1, min_samples_split=5,
                                   min_weight_fraction_leaf=0.0,
                                   n_estimators=25, n_jobs=1, oob_score=False,
                                   random_state=None, verbose=0,
                                   warm_start=False)
    model.fit(X_train, Y_train)
    calibrated_clf = CalibratedClassifierCV(model, method='isotonic', cv=5)
    calibrated_clf.fit(X_train, Y_train)
    ypreds = calibrated_clf.predict_proba(X_test)
    return ypreds
def gb_calib_scale(x, y, x_test, seed):
    # normalize x and x_test together
    x_rows = x.shape[0]
    X = preprocessing.scale(np.vstack((x, x_test)))
    x = X[:x_rows, :]
    x_test = X[x_rows:, :]
    print(x.shape)
    print(x_test.shape)

    model0 = SVC(probability=True, class_weight='auto', random_state=seed,
                 C=1, gamma=0.1)

    # workaround for an sklearn API mismatch: expose predict_proba as predict
    class WrapClassifier:
        def __init__(self, est):
            self.est = est

        def predict(self, X):
            return self.est.predict_proba(X)[:, 1][:, np.newaxis]

        def fit(self, X, y, sample_weight):
            self.est.fit(X, y, sample_weight)

    model01 = WrapClassifier(model0)
    model1 = GradientBoostingClassifier(max_depth=3, random_state=seed,
                                        learning_rate=0.1, n_estimators=100,
                                        max_features=400)
    model = BaggingClassifier(model1, n_jobs=-1, random_state=seed)

    # average 10-fold CV AUC
    n_folds = 10
    cv = StratifiedKFold(y, n_folds=n_folds, random_state=seed)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    calib = CalibratedClassifierCV(model, cv=n_folds, method='sigmoid')
    for i, (train, test) in enumerate(cv):
        probas_ = calib.fit(x[train], y[train]).predict_proba(x[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Training set 10CV AUC:\n{}'.format(mean_auc))

    # average the per-fold calibrated classifiers' test probabilities
    probs = np.average([cls.predict_proba(x_test)
                        for cls in calib.calibrated_classifiers_], axis=0)
    print(probs.shape)
    return probs
def svm_boost_isotonic_scale_train(x, y, x_test, seed):
    x_rows = x.shape[0]
    X = preprocessing.scale(np.vstack((x, x_test)))
    x = X[:x_rows, :]
    x_test = X[x_rows:, :]

    model = SVC(probability=True, class_weight='auto', random_state=seed)
    boosted = AdaBoostClassifier(model, random_state=seed)
    calib = CalibratedClassifierCV(boosted, cv=2, method='isotonic')
    calib.fit(x, y)
    probs = calib.predict_proba(x_test)
    return probs
def predict_proba(clfs, X, y, X_test, weights, calibration=False):
    skf = StratifiedKFold(y, n_folds=5, random_state=571)
    preds = []
    for clf in clfs:
        if calibration:
            clf = CalibratedClassifierCV(clf, method="isotonic", cv=skf)
        clf.fit(X, y)
        y_pred = clf.predict_proba(X_test)
        preds.append(y_pred)
    # the first classifier carries an implicit weight of 1; the rest use
    # the entries of `weights`
    final_pred = preds.pop(0)
    for pred, weight in zip(preds, weights):
        final_pred += weight * pred
    final_pred = final_pred / np.array(weights).sum()
    return final_pred
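# A sketch of the weighted soft-vote that predict_proba() above implements,
# rewritten so that every model carries an explicit weight (the original
# gives the first classifier an implicit weight of 1 via preds.pop(0)).
# weighted_average is a hypothetical helper, not from the original sources.
import numpy as np

def weighted_average(prob_list, weights):
    """Average a list of (n_samples, n_classes) probability arrays."""
    weights = np.asarray(weights, dtype=float)
    stacked = np.stack(prob_list)  # shape (n_models, n_samples, n_classes)
    return np.tensordot(weights, stacked, axes=1) / weights.sum()

p1 = np.array([[0.8, 0.2], [0.4, 0.6]])
p2 = np.array([[0.6, 0.4], [0.5, 0.5]])
print(weighted_average([p1, p2], weights=[1.0, 2.0]))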
        args.t,
        args.approx,
        args.I,
        args.delta,
    )
    skip_variance = args.skip_variance

    ### Read the data
    reader = FastaUtility()
    Xtrain, Ytrain = reader.read_data(train_file)
    Xtest, Ytest = reader.read_data(test_file)
    Ytest = np.array(Ytest).reshape(-1, 1)

    ### Compute the fastsk kernel
    start = time.time()
    fastsk = FastSK(g=g, m=m, t=t, approx=approx, max_iters=I, delta=d,
                    skip_variance=skip_variance)
    fastsk.compute_kernel(Xtrain, Xtest)
    end = time.time()
    print("Kernel computation time: ", end - start)
    Xtrain = fastsk.get_train_kernel()
    Xtest = fastsk.get_test_kernel()

    ### Use a linear SVM on the precomputed kernel
    svm = LinearSVC(C=C)
    clf = CalibratedClassifierCV(svm, cv=5).fit(Xtrain, Ytrain)
    acc, auc = evaluate_clf(clf, Xtest, Ytest)
    print("Linear SVM:\n\tAcc = {}, AUC = {}".format(acc, auc))
def expressionTest():
    trainX, trainY = pickle.load(open('MultiPieTrainExpression_XY.p', 'rb'))
    testX, testY = pickle.load(open('MultiPieValidationExpression_XY.p', 'rb'))
    f1 = open('RandomForestCompareResExpression.txt', 'w+')

    clf_uncalibrated = RandomForestClassifier(n_estimators=1000,
                                              random_state=15325)
    clf_uncalibrated = clf_uncalibrated.fit(trainX, trainY)
    clf = CalibratedClassifierCV(clf_uncalibrated, cv=3, method='sigmoid')
    clf.fit(trainX, trainY)
    pickle.dump(clf, open('expression_multipie_rf_calibrated.p', 'wb'))

    tn = time.time()
    probX = clf.predict_proba(testX)
    preY = clf.predict(testX)
    et = time.time() - tn
    print(probX)

    # store the image indices along with probability values for the
    # validation dataset
    imageNumbers = range(0, len(testY))
    print(imageNumbers)
    with open('rf_expression_imageProbabilities.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        rows = zip(imageNumbers, probX[:, 1])
        for row in rows:
            writer.writerow(row)

    # calculation and plot of roc_auc for the positive class
    totalPos = testY.count(1)
    totalNeg = testY.count(0)
    print(totalPos)
    print(totalNeg)
    fpr, tpr, thresholds = roc_curve(testY, probX[:, 1], pos_label=1)
    roc_auc = auc(fpr, tpr)
    print(roc_auc, file=f1)
    print(fpr, file=f1)
    print(tpr, file=f1)
    print('thresholds', file=f1)
    print(thresholds, file=f1)

    # store the threshold values along with the tpr and fpr values
    with open('rf_expression_threhsolds.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        rows = zip(thresholds, tpr, fpr)
        for row in rows:
            writer.writerow(row)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.savefig('rocExpressionRF.png')
    plt.show()

    # choose the threshold closest to the ideal (0, 1) corner of the ROC curve
    mindist = 100
    minI = 0
    for i in range(len(fpr)):
        a = np.array((fpr[i], tpr[i]))
        b = np.array((0, 1))
        dist_a_b = distance.euclidean(a, b)
        if dist_a_b < mindist:
            mindist = dist_a_b
            minI = i
    minX = fpr[minI]
    minY = tpr[minI]
    threshold = thresholds[minI]
    print('minX :%f, minY: %f, mindist:%f, Threshold = %f, minI =%d , '
          'fpr[min]=%f, tpr[min]=%f, false detection value= %f, '
          'true detection value =%f '
          % (minX, minY, mindist, threshold, minI, fpr[minI], tpr[minI],
             fpr[minI] * totalNeg, tpr[minI] * totalPos), file=f1)

    # store the selectivity and accuracy values
    prec = sum(preY == testY) * 1.0 / len(preY)
    select_1 = sum(preY == 1) * 1.0 / len(preY)
    select_2 = sum(preY == 0) * 1.0 / len(preY)
    f1.write('Expression time: %f acc. %f select_1 %f preY:%f testY:%f '
             'testX:%f select_2 %f\n'
             % (et / len(preY), prec, select_1, len(preY), len(testY),
                len(testX), select_2))

    list1, list2 = (list(x) for x in
                    zip(*sorted(zip(probX[:, 1], testY),
                                key=lambda pair: pair[0])))
    print(list1)
    print(list2)
    # lowerCorrect counts the run of 0s below the lowest positive score;
    # higherCorrect counts the run of 1s above the highest negative score
    lowerCorrect = list2.index(1)
    higherCorrect = list2[::-1].index(0)
    yesAccuracy = float(higherCorrect) / len(list2)
    noAccuracy = float(lowerCorrect) / len(list2)
    print('higherCorrect : %d, yesAccuracy : %f, lowerCorrect: %d, '
          'noAccuracy: %f'
          % (higherCorrect, yesAccuracy, lowerCorrect, noAccuracy))
    print('lower threshold: %f , upperThreshold: %f'
          % (list1[lowerCorrect], list1[len(list1) - higherCorrect]))
    print('higherCorrect : %d, yesAccuracy : %f, lowerCorrect: %d, '
          'noAccuracy: %f'
          % (higherCorrect, yesAccuracy, lowerCorrect, noAccuracy), file=f1)
    print('lower threshold: %f , upperThreshold: %f'
          % (list1[lowerCorrect], list1[len(list1) - higherCorrect]), file=f1)

    for prob in probX:
        print(prob, file=f1)
    print('predicted value', file=f1)
    for pred in preY:
        print(pred, file=f1)
    print('True value', file=f1)
    for truth in testY:
        print(truth, file=f1)
    for sortedProb in list1:
        print(sortedProb, file=f1)
    for sortedTruth in list2:
        print(sortedTruth, file=f1)
    f1.flush()
def get_hyperparameters(model):
    """
    Generates the models with different hyperparameters to be trained and
    evaluated using spatial cross validation.

    Args:
        model (str) : A string indicating the model or classifier to fetch
                      hyperparameters for. Supported models include
                      'logistic_regression', 'random_forest', and
                      'linear_svc'.

    Returns:
        models (list) : A list of models, where each model is instantiated
                        using different hyperparameter settings.
        labels (list) : A list of labels indicating the corresponding model
                        hyperparameters in string format. The labels are used
                        for plotting charts and file naming schemes.
    """
    if model == 'logistic_regression':
        param_grid = {'penalty': ['l2', 'l1'], 'C': [0.001, 0.01, 0.1, 1]}
        params = list(
            itertools.product(*[param_grid[param] for param in param_grid]))
        models, labels = [], []
        for param in params:
            models.append(LogisticRegression(penalty=param[0], C=param[1]))
            labels.append('penalty={}, C={:.3f}'.format(param[0], param[1]))
        return models, labels

    if model == 'linear_svc':
        param_grid = {
            'C': [0.001, 0.01, 0.1, 1],
        }
        params = list(
            itertools.product(*[param_grid[param] for param in param_grid]))
        models, labels = [], []
        for param in params:
            models.append(
                CalibratedClassifierCV(
                    LinearSVC(C=param[0], random_state=SEED)))
            labels.append('C={:.3f}'.format(param[0]))
        return models, labels

    if model == 'random_forest':
        param_grid = {
            'n_estimators': [100, 300, 500, 800, 1200],
            'max_depth': [5, 8, 12],
            'min_samples_split': [2, 5, 10, 15],
            'min_samples_leaf': [1, 2, 5, 10]
        }
        params = list(
            itertools.product(*[param_grid[param] for param in param_grid]))
        # Randomly sample 5 parameter settings due to
        # the large number of combinations
        random.seed(SEED)
        params = random.sample(params, 5)
        models, labels = [], []
        for param in params:
            models.append(
                RandomForestClassifier(n_estimators=param[0],
                                       max_depth=param[1],
                                       min_samples_split=param[2],
                                       min_samples_leaf=param[3],
                                       random_state=SEED))
            labels.append(
                'n_estimators={}, max_depth={}, min_samples_split={}, '
                'min_samples_leaf={}'
                .format(param[0], param[1], param[2], param[3]))
        return models, labels
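# The reason linear_svc is wrapped above: LinearSVC exposes only
# decision_function, so CalibratedClassifierCV supplies predict_proba via
# calibration. A minimal, self-contained illustration on synthetic data
# (the C value and dataset are assumptions):
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

X, y = make_classification(n_samples=300, random_state=0)
clf = CalibratedClassifierCV(LinearSVC(C=0.1, random_state=0), cv=3).fit(X, y)
print(clf.predict_proba(X[:3]))  # each row sums to 1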
def main(configuration_path, signal_path, background_path, predictions_path,
         model_path, verbose):
    '''
    Train a classifier on signal and background monte carlo data and write
    the model to MODEL_PATH in pmml or pickle format.

    CONFIGURATION_PATH: Path to the config yaml file
    SIGNAL_PATH: Path to the signal data
    BACKGROUND_PATH: Path to the background data
    PREDICTIONS_PATH: Path to the file where the mc predictions are stored.
    MODEL_PATH: Path to save the model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    log = setup_logging(verbose=verbose)

    check_extension(predictions_path)
    check_extension(model_path, allowed_extensions=['.pmml', '.pkl', '.onnx'])

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.separator

    label_text = model_config.output_name

    log.info('Loading signal data')
    df_signal = read_telescope_data(
        signal_path, config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal)
    df_signal['label_text'] = 'signal'
    df_signal['label'] = 1

    log.info('Loading background data')
    df_background = read_telescope_data(
        background_path, config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_background)
    df_background['label_text'] = 'background'
    df_background['label'] = 0

    df = pd.concat([df_background, df_signal], ignore_index=True)

    df_train = convert_to_float32(df[model_config.features])
    log.debug('Total training events: {}'.format(len(df_train)))
    df_train.dropna(how='any', inplace=True)
    log.debug('Training events after dropping nans: {}'.format(len(df_train)))

    label = df.loc[df_train.index, 'label']

    # load optional columns if available to be able to make performance plots
    # vs true energy / size
    if config.true_energy_column is not None:
        true_energy = df.loc[df_train.index,
                             config.true_energy_column].to_numpy()
    if config.size_column is not None:
        size = df.loc[df_train.index, config.size_column].to_numpy()

    n_gammas = len(label[label == 1])
    n_protons = len(label[label == 0])
    log.info(
        'Training classifier with {} background and {} signal events'.format(
            n_protons, n_gammas))
    log.debug(model_config.features)

    # save prediction_path for each cv iteration
    cv_predictions = []

    # iterate over test and training sets
    X = df_train.values
    y = label.values

    n_cross_validations = model_config.n_cross_validations
    classifier = model_config.model
    log.info('Starting {}-fold cross validation...'.format(
        n_cross_validations))
    stratified_kfold = model_selection.StratifiedKFold(
        n_splits=n_cross_validations, shuffle=True, random_state=config.seed)

    aucs = []
    cv_it = stratified_kfold.split(X, y)
    for fold, (train, test) in enumerate(tqdm(cv_it,
                                              total=n_cross_validations)):
        # select data
        xtrain, xtest = X[train], X[test]
        ytrain, ytest = y[train], y[test]

        # fit and predict
        classifier.fit(xtrain, ytrain)

        y_probas = classifier.predict_proba(xtest)[:, 1]

        cv_df = pd.DataFrame({
            'label': ytest,
            model_config.output_name: y_probas,
            'cv_fold': fold,
        })
        if config.true_energy_column is not None:
            cv_df[config.true_energy_column] = true_energy[test]
        if config.size_column is not None:
            cv_df[config.size_column] = size[test]

        cv_predictions.append(cv_df)
        aucs.append(metrics.roc_auc_score(ytest, y_probas))

    aucs = np.array(aucs)
    log.info('Cross-validation ROC-AUCs: {}'.format(aucs))
    log.info('Mean AUC ROC : {:.3f} ± {:.3f}'.format(aucs.mean(), aucs.std()))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('Writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    classifier.random_state = config.seed

    if model_config.calibrate_classifier:
        log.info('Training calibrated classifier')
        classifier = CalibratedClassifierCV(classifier, cv=2, method='sigmoid')
        classifier.fit(X, y)
    else:
        log.info('Training model on complete dataset')
        classifier.fit(X, y)

    log.info('Saving model to {} ...'.format(model_path))
    save_model(classifier, model_path=model_path, label_text=label_text,
               feature_names=list(df_train.columns))
y[:n_samples // 2] = 0
y[n_samples // 2:] = 1
sample_weight = np.random.RandomState(42).rand(y.shape[0])

# split train, test for calibration
X_train, X_test, y_train, y_test, sw_train, sw_test = \
    train_test_split(X, y, sample_weight, test_size=0.9, random_state=42)

# Gaussian Naive-Bayes with no calibration
clf = GaussianNB()
clf.fit(X_train, y_train)  # GaussianNB itself does not support sample-weights
prob_pos_clf = clf.predict_proba(X_test)[:, 1]

# Gaussian Naive-Bayes with isotonic calibration
clf_isotonic = CalibratedClassifierCV(clf, cv=2, method='isotonic')
clf_isotonic.fit(X_train, y_train, sw_train)
prob_pos_isotonic = clf_isotonic.predict_proba(X_test)[:, 1]

# Gaussian Naive-Bayes with sigmoid calibration
clf_sigmoid = CalibratedClassifierCV(clf, cv=2, method='sigmoid')
clf_sigmoid.fit(X_train, y_train, sw_train)
prob_pos_sigmoid = clf_sigmoid.predict_proba(X_test)[:, 1]

print("Brier scores: (the smaller the better)")

clf_score = brier_score_loss(y_test, prob_pos_clf, sw_test)
print("No calibration: %1.3f" % clf_score)

clf_isotonic_score = brier_score_loss(y_test, prob_pos_isotonic, sw_test)
print("With isotonic calibration: %1.3f" % clf_isotonic_score)
def majority_vote_cl(v: int, n: int, delta: float, train_file: str,
                     test_file: str, min_token_freq: int = 1,
                     max_token_freq: float = 1.0):
    """
    Entry point of program.

    :param v: Vocabulary choice
    :param n: ngram choice
    :param delta: Smoothing choice
    :param train_file: Path to training data
    :param test_file: Path to testing data
    :param min_token_freq: ignore terms with a document frequency strictly
        lower than the given threshold.
    :param max_token_freq: ignore terms with a document frequency strictly
        higher than the given proportion.
    :return: void
    """
    validate_params(v, n, delta, train_file, test_file)

    # Process data
    train_data = pd.read_csv(
        train_file, delimiter='\t',
        names=[DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET])
    test_data = pd.read_csv(
        test_file, delimiter='\t',
        names=[DF_COLUMN_ID, DF_COLUMN_NAME, DF_COLUMN_LANG, DF_COLUMN_TWEET])

    lang_mapping, inv_lang_mapping = encode_class_labels(
        train_data[DF_COLUMN_LANG])
    train_data[DF_COLUMN_LANG] = train_data[DF_COLUMN_LANG].map(lang_mapping)

    custom_transform_to_vocab(train_data, v)
    custom_transform_to_vocab(test_data, v)

    # Prepare features (ngrams and their weights)
    tfidf = TfidfVectorizer(analyzer='char_wb', lowercase=False,
                            ngram_range=(n, n), min_df=min_token_freq,
                            max_df=max_token_freq)
    features = tfidf.fit_transform(train_data[DF_COLUMN_TWEET]).toarray()
    labels = train_data[DF_COLUMN_LANG]

    # Define estimators; LinearSVC has no predict_proba, so it is wrapped in
    # CalibratedClassifierCV to take part in soft voting
    svc = LinearSVC()
    svc_calibrated = CalibratedClassifierCV(svc)
    lr = LogisticRegression(multi_class='multinomial', max_iter=500)
    estimators = [('lr', lr), ('svc_calibrated', svc_calibrated)]

    # Train model
    voting_classifier = VotingClassifier(estimators=estimators,
                                         voting='soft', n_jobs=-1)
    voting_classifier.fit(features, labels)

    # Calculate scores
    features_test = tfidf.transform(test_data[DF_COLUMN_TWEET])
    guess = voting_classifier.predict(features_test)
    scores = voting_classifier.predict_proba(features_test)

    # Finalize results
    results = prepare_result_df(test_data)
    results[DF_COLUMN_SCORE] = scores
    results[DF_COLUMN_GUESS] = guess
    results[DF_COLUMN_GUESS] = results[DF_COLUMN_GUESS].map(inv_lang_mapping)
    results = finalize_result_df(results)
    generate_trace_file(v, n, delta, results)

    # Evaluation stats
    print("\nEvaluating Majority Vote classifier with parameters: "
          "[vocabulary = {}, ngram size = {}, delta = {}]"
          .format(v, n, delta))
    evaluate_results(results, v, n, delta)

    return results
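# A minimal version of the soft-voting setup above: voting='soft' averages
# predict_proba across members, which is why the LinearSVC member must be
# calibrated first. Synthetic three-class data; all settings illustrative.
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier

X, y = make_classification(n_samples=300, n_classes=3, n_informative=5,
                           random_state=0)
vc = VotingClassifier(
    estimators=[('lr', LogisticRegression(max_iter=500)),
                ('svc', CalibratedClassifierCV(LinearSVC()))],
    voting='soft',
).fit(X, y)
print(vc.predict_proba(X[:2]).sum(axis=1))  # each row sums to 1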
def trainAndSaveMetaClassifier(X_metaTraining, y_meta, classifierType):
    '''
    Trains a meta classifier from a training feature vector, built from data
    dedicated to the meta classifier training, called X_metaTraining.

    @param X_metaTraining: Feature vector for the meta classifier training.
        Predictions from low layers will be done from this vector.
    @param y_meta: Truth vector corresponding to the X_metaTraining vector
    @param classifierType: The type of classifier to train. Possible values
        are 'color-histogram', 'face-detection', 'lbp', 'object-detection'.
    '''
    input_meta_image = getInputMetaImageFromLowClassifiers(
        X_metaTraining, classifierType)

    # Training the meta classifier from the outputs of the low classifiers
    # for the current feature
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.svm import LinearSVC

    print('Training meta classifier for ' + classifierType +
          ' -- This may take some time..')

    classifiers = [
        MultinomialNB(),
        RandomForestClassifier(random_state=0, n_estimators=20, n_jobs=-1,
                               max_features=None),
        DecisionTreeClassifier(random_state=0, max_features=None),
        LinearSVC(random_state=0)
    ]

    best_classifier = None
    best_accuracy = 0
    best_stdDev = 0
    for clf in classifiers:
        current_scores = cross_val_score(clf, input_meta_image, y_meta, cv=20,
                                         scoring='accuracy', n_jobs=2,
                                         verbose=1)
        if current_scores.mean() > best_accuracy:
            best_accuracy = current_scores.mean()
            best_classifier = clf
            best_stdDev = current_scores.std()

    # Training the classifier on the whole data; wrap LinearSVC so the saved
    # model exposes predict_proba (isinstance replaces the original's brittle
    # string comparison on the class name)
    if isinstance(best_classifier, LinearSVC):
        best_classifier = CalibratedClassifierCV(best_classifier)
    best_classifier.fit(input_meta_image, y_meta)

    # Saving the classifier
    import pickle
    pickle.dump(
        best_classifier,
        open(
            'trained-classifiers/low-classifiers/' + classifierType +
            '/meta/' + classifierType + '-meta.p', "wb"))
    print('Best classifier saved for meta image:', best_classifier)
    print('Best accuracy:', best_accuracy)
    print('Standard deviation', best_stdDev)
class Classifiers:
    """
    Usage:
    1. Generating models: after creating an instance of this class, invoke
       the method 'training' with the path as its parameter; the resulting
       model is then available through the class variable 'model'.
    2. Testing a model: invoking the 'testing' function with the path to the
       test set as parameter returns the probability matrix, labels, and raw
       predictions.
    """

    def __init__(self, n_value, intercept, mode='character'):
        self.n_value = n_value
        self.mode = mode
        if self.mode != 'character' and self.mode != 'word':
            raise ValueError('the mode has to be either character or word')
        self.features = None
        self.dialects = list()
        self.features_names = list()
        self.length = 0
        self.width = 0
        self.model = None
        self.clf = None
        self.intercept = intercept
        self.test_dialects = None

    def _char_n_grams(self, sentence):
        return [
            sentence[i:i + self.n_value]
            for i in range(len(sentence) - self.n_value + 1)
        ]

    def _word_n_grams(self, sentence):
        ngram = list()
        sentence = sentence.strip().split(" ")
        for i in range(len(sentence) - self.n_value + 1):
            gram = ""
            for j in range(self.n_value):
                gram = gram + sentence[i + j]
            ngram.append(gram)
        return ngram

    def training(self, training_set_path):
        sentences = list()
        with open(training_set_path, 'r', encoding='utf8') as training_file:
            for line in training_file:
                the_sentence, the_dialect = line.strip().split('\t')
                the_sentence = the_sentence.strip()
                sentences.append('#' + the_sentence + '#')
                self.dialects.append(the_dialect)
        if self.mode == 'character':
            tfidf = TfidfVectorizer(ngram_range=(self.n_value, self.n_value),
                                    analyzer='char')
        elif self.mode == 'word':
            tfidf = TfidfVectorizer(ngram_range=(self.n_value, self.n_value),
                                    analyzer='word')
        self.features = tfidf.fit_transform(sentences).toarray()
        self.features_names = tfidf.get_feature_names()
        self.length = len(sentences)
        self.width = self.features.shape[1]
        self._svc()

    def _svc(self):
        # self.model = SVC(kernel='linear', C=1000)
        self.model = LinearSVC(C=200)
        self.clf = CalibratedClassifierCV(self.model, method='sigmoid')
        self.clf.fit(self.features, self.dialects)
        self.model.fit(self.features, self.dialects)

    def testing(self, testing_set_path):
        test_sentences = []
        with open(testing_set_path, 'r', encoding='utf8') as test_file:
            for line in test_file:
                s = line.strip()
                test_sentences.append('#' + s + '#')

        s_feat = []
        if self.mode == 'character':
            for s in test_sentences:
                ngram = self._char_n_grams(s)
                s_feat.append(set(ngram))
        elif self.mode == 'word':
            for s in test_sentences:
                ngram = self._word_n_grams(s)
                s_feat.append(set(ngram))

        # Note: the matrix is sized by the *training* set length, which
        # assumes the test set is no larger than the training set.
        test_features = np.zeros((self.length, self.width), dtype=np.int8)
        for i, s in enumerate(s_feat):
            for j, ngram in enumerate(self.features_names):
                if ngram in s:
                    test_features[i, j] += 1

        result = self.model.predict(X=test_features)
        probability_matrix, label = fusion_methods.mean_probability_rule(
            test_features, self.clf)
        return probability_matrix, label, result

    def get_test_dialects(self):
        return self.test_dialects
# calibration (see :ref:`User Guide <calibration>`)
#
# Calibration curves for all 4 conditions are plotted below, with the average
# predicted probability for each bin on the x-axis and the fraction of
# positive classes in each bin on the y-axis.

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

lr = LogisticRegression(C=1.0)
gnb = GaussianNB()
gnb_isotonic = CalibratedClassifierCV(gnb, cv=2, method="isotonic")
gnb_sigmoid = CalibratedClassifierCV(gnb, cv=2, method="sigmoid")

clf_list = [
    (lr, "Logistic"),
    (gnb, "Naive Bayes"),
    (gnb_isotonic, "Naive Bayes + Isotonic"),
    (gnb_sigmoid, "Naive Bayes + Sigmoid"),
]

# %%
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(4, 2)
colors = plt.cm.get_cmap("Dark2")

ax_calibration_curve = fig.add_subplot(gs[:2, :2])
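# One way this excerpt typically continues: plot a reliability curve per
# classifier on the shared axis with CalibrationDisplay.from_estimator
# (sklearn >= 1.0). This continuation assumes a train/test split
# (X_train, y_train, X_test, y_test) defined earlier in the example but not
# shown in the excerpt above.
calibration_displays = {}
for i, (clf, name) in enumerate(clf_list):
    clf.fit(X_train, y_train)
    display = CalibrationDisplay.from_estimator(
        clf, X_test, y_test, n_bins=10, name=name,
        ax=ax_calibration_curve, color=colors(i),
    )
    calibration_displays[name] = display

ax_calibration_curve.grid()
ax_calibration_curve.set_title("Calibration plots")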
def fscore(params_org):
    parambk = copy.deepcopy(params_org)
    ifError = 0
    global best, HPOalg, params_best, errorcount

    params = params_org['classifier']
    classifier = params.pop('name')
    p_random_state = params.pop('random_state')

    if classifier == 'SVM':
        param_value = params.pop('gamma_value')
        if params['gamma'] == "value":
            params['gamma'] = param_value
        # max_iter=10000 and cache_size=700:
        #   https://github.com/EpistasisLab/pennai/issues/223
        # value ranges:
        #   https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L262
        clf = SVC(max_iter=10000, cache_size=700,
                  random_state=p_random_state, **params)
    elif classifier == 'RF':
        clf = RandomForestClassifier(random_state=p_random_state, **params)
    elif classifier == 'KNN':
        p_value = params.pop('p')
        if p_value == 0:
            params['metric'] = "chebyshev"
        elif p_value == 1:
            params['metric'] = "manhattan"
        elif p_value == 2:
            params['metric'] = "euclidean"
        else:
            params['metric'] = "minkowski"
            params['p'] = p_value
        # https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L302
        clf = KNeighborsClassifier(**params)
    elif classifier == 'DTC':
        clf = DecisionTreeClassifier(random_state=p_random_state, **params)
    elif classifier == 'LR':
        penalty_solver = params.pop('penalty_solver')
        params['penalty'] = penalty_solver.split("+")[0]
        params['solver'] = penalty_solver.split("+")[1]
        clf = LogisticRegression(random_state=p_random_state, **params)

    # resampling parameters
    p_sub_params = params_org.pop('sub')
    p_sub_type = p_sub_params.pop('type')
    sampler = p_sub_params.pop('smo_grp')
    gmean = []

    # Oversampling
    if p_sub_type == 'SMOTE':
        smo = SMOTE(**p_sub_params)
    elif p_sub_type == 'ADASYN':
        smo = ADASYN(**p_sub_params)
    elif p_sub_type == 'BorderlineSMOTE':
        smo = BorderlineSMOTE(**p_sub_params)
    elif p_sub_type == 'SVMSMOTE':
        smo = SVMSMOTE(**p_sub_params)
    elif p_sub_type == 'SMOTENC':
        smo = SMOTENC(**p_sub_params)
    elif p_sub_type == 'KMeansSMOTE':
        smo = KMeansSMOTE(**p_sub_params)
    elif p_sub_type == 'RandomOverSampler':
        smo = RandomOverSampler(**p_sub_params)
    # Undersampling
    elif p_sub_type == 'TomekLinks':
        smo = TomekLinks(**p_sub_params)
    elif p_sub_type == 'ClusterCentroids':
        if p_sub_params['estimator'] == 'KMeans':
            p_sub_params['estimator'] = KMeans(random_state=p_random_state)
        elif p_sub_params['estimator'] == 'MiniBatchKMeans':
            p_sub_params['estimator'] = MiniBatchKMeans(
                random_state=p_random_state)
        smo = ClusterCentroids(**p_sub_params)
    elif p_sub_type == 'RandomUnderSampler':
        smo = RandomUnderSampler(**p_sub_params)
    elif p_sub_type == 'NearMiss':
        smo = NearMiss(**p_sub_params)
    elif p_sub_type == 'InstanceHardnessThreshold':
        if p_sub_params['estimator'] == 'knn':
            p_sub_params['estimator'] = KNeighborsClassifier()
        elif p_sub_params['estimator'] == 'decision-tree':
            p_sub_params['estimator'] = DecisionTreeClassifier()
        elif p_sub_params['estimator'] == 'adaboost':
            p_sub_params['estimator'] = AdaBoostClassifier()
        elif p_sub_params['estimator'] == 'gradient-boosting':
            p_sub_params['estimator'] = GradientBoostingClassifier()
        elif p_sub_params['estimator'] == 'linear-svm':
            # LinearSVC has no predict_proba, so wrap it for calibration
            p_sub_params['estimator'] = CalibratedClassifierCV(LinearSVC())
        elif p_sub_params['estimator'] == 'random-forest':
            p_sub_params['estimator'] = RandomForestClassifier(
                n_estimators=100)
        smo = InstanceHardnessThreshold(**p_sub_params)
    elif p_sub_type == 'CondensedNearestNeighbour':
        smo = CondensedNearestNeighbour(**p_sub_params)
    elif p_sub_type == 'EditedNearestNeighbours':
        smo = EditedNearestNeighbours(**p_sub_params)
    elif p_sub_type == 'RepeatedEditedNearestNeighbours':
        smo = RepeatedEditedNearestNeighbours(**p_sub_params)
    elif p_sub_type == 'AllKNN':
        smo = AllKNN(**p_sub_params)
    elif p_sub_type == 'NeighbourhoodCleaningRule':
        smo = NeighbourhoodCleaningRule(**p_sub_params)
    elif p_sub_type == 'OneSidedSelection':
        smo = OneSidedSelection(**p_sub_params)
    # Combined over- and undersampling
    elif p_sub_type == 'SMOTEENN':
        smo = SMOTEENN(**p_sub_params)
    elif p_sub_type == 'SMOTETomek':
        smo = SMOTETomek(**p_sub_params)

    e = ''
    try:
        for train, test in cv.split(X, y):
            if p_sub_type == 'NO':
                X_smo_train, y_smo_train = X[train], y[train]
            else:
                X_smo_train, y_smo_train = smo.fit_sample(X[train], y[train])
            y_test_pred = clf.fit(X_smo_train, y_smo_train).predict(X[test])
            gm = geometric_mean_score(y[test], y_test_pred, average='binary')
            gmean.append(gm)
        mean_g = np.mean(gmean)
    except Exception as eec:
        e = eec
        mean_g = 0
        ifError = 1
        errorcount = errorcount + 1

    gm_loss = 1 - mean_g
    abc = time.time() - starttime
    if mean_g > best:
        best = mean_g
        params_best = copy.deepcopy(parambk)
    return {'loss': gm_loss,
            'mean': mean_g,
            'status': STATUS_OK,
            # -- store other results like this
            'run_time': abc,
            'iter': iid,
            'current_best': best,
            'eval_time': time.time(),
            'SamplingGrp': sampler,
            'SamplingType': p_sub_type,
            'ifError': ifError,
            'Error': e,
            'params': parambk,
            'attachments': {'time_module': pickle.dumps(time.time)}}
# clf = ensemble.RandomForestClassifier(**paramsRF)
clf = linear_model.LogisticRegressionCV(Cs=4, solver='liblinear',
                                        max_iter=1000, tol=1e-5,
                                        scoring='neg_log_loss')
# clf = GaussianNB()
# clf = linear_model.ElasticNetCV(l1_ratio=0)
# clf = GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
# clf = linear_model.LassoLarsIC(criterion='aic')
clf.fit(X_train, y_train)

sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv='prefit')
sig_clf.fit(X_valid, y_valid)
df_out.loc[test, 'prob1'] = sig_clf.predict_proba(X_test)[:, 0]
df_out.loc[test, 'prob2'] = sig_clf.predict_proba(X_test)[:, 1]

df_out.loc[:, 'log_loss1'] = df_out.loc[:, 'result'] * np.log(df_out.loc[:, 'prob1']) \
    + (1 - df_out.loc[:, 'result']) * np.log(1 - df_out.loc[:, 'prob1'])
df_out.loc[:, 'log_loss2'] = df_out.loc[:, 'result'] * np.log(df_out.loc[:, 'prob2']) \
    + (1 - df_out.loc[:, 'result']) * np.log(1 - df_out.loc[:, 'prob2'])

log_loss_1 = -df_out.loc[:, 'log_loss1'].sum() / len(df_out)
log_loss_2 = -df_out.loc[:, 'log_loss2'].sum() / len(df_out)
print(log_loss_1, log_loss_2)
def genTrainData(rule='部门名称', mode='thulac', role='AGENT',
                 feature_type='TFIDF', ngram_range=(1, 3), _min=2, _max=0.9,
                 _range=(0.4, 0.9), max_features=10000):
    print('This is the classification task on {}'.format(rule))
    data = fenci(rule, mode)
    if role not in 'AGENT USER':
        data['sentenceList'] = data['sentenceList'].apply(str).apply(eval) \
            .apply(lambda x: ' '.join([i['content'] for i in x]))
    else:
        data['sentenceList'] = data['sentenceList'].apply(str).apply(eval) \
            .apply(lambda x: ' '.join([i['content'] for i in x
                                       if i['role'] == role]))
    data['label'] = data['label'].apply(str).apply(eval)

    test_data = data.tail(int(0.2 * len(data.index)))
    _test = data.set_index('UUID').loc[test_data['UUID']]
    assert _test.shape[0] == test_data.shape[0]
    _train = data.set_index('UUID').drop(test_data['UUID'])
    assert _train.shape[0] + _test.shape[0] == data.shape[0]
    del (data, test_data)
    print('train/test: {}/{}'.format(_train.shape[0], _test.shape[0]))

    BDC_DF = select_Feature(_train['sentenceList'], _train['label'],
                            _min=_min, ngram_range=ngram_range, _max=_max,
                            _range=_range, max_features=max_features)
    _vocab = {j: i for i, j in enumerate(BDC_DF.index)}
    del BDC_DF
    if _test.shape[0] == 0:
        return

    _vec = TfidfVectorizer(vocabulary=_vocab)
    _vec.fit(_train['sentenceList'])
    assert _vec.vocabulary_ == _vocab
    print('Feature dimension:', len(_vocab.keys()))
    train_csr = _vec.transform(_train['sentenceList'])
    test_csr = _vec.transform(_test['sentenceList'])
    _train.drop('sentenceList', axis=1, inplace=True)
    _test.drop('sentenceList', axis=1, inplace=True)
    _train = _train.reset_index().drop('index', axis=1)

    # model = CalibratedClassifierCV(svm.LinearSVC(random_state=2018))
    model = CalibratedClassifierCV(
        lgb.LGBMClassifier(metric='auc', learning_rate=0.02))
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
    models = []
    y_preds, y_trues = [], []
    from sklearn.base import clone
    for train, test in skf.split(range(len(_train.index)), _train['label']):
        # clone so each fold keeps its own fitted model (the original
        # appended the same estimator instance five times)
        fold_model = clone(model)
        fold_model.fit(train_csr[train], _train.iloc[train]['label'])
        models.append(fold_model)
        y_pred = np.array(fold_model.predict_proba(train_csr[test]))[:, 1]
        y_preds.extend(np.array(y_pred).round())
        y_trues.extend(_train.iloc[test]['label'])
    print('5-fold cross-validation results:')
    printMark(y_trues, y_preds)
    del (y_trues, y_preds, train, test)

    y_pred = 0
    for i in range(len(models)):
        y_pred += np.array(models[i].predict_proba(test_csr))[:, 1] / len(models)
    print('Test-set results:')
    save_errorcase(_test.index, _test['label'], y_pred, rule)
    printMark(_test['label'], y_pred.round())
def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert_greater(brier_score_loss(y_test, prob_pos_clf),
                           brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert_greater(brier_score_loss(y_test, prob_pos_clf),
                               brier_score_loss((y_test + 1) % 2,
                                                prob_pos_pc_clf_relabeled))

        # check that calibration can also deal with regressors that have
        # a decision_function
        clf_base_regressor = CalibratedClassifierCV(Ridge())
        clf_base_regressor.fit(X_train, y_train)
        clf_base_regressor.predict(X_test)

        # Check failure cases:
        # only "isotonic" and "sigmoid" should be accepted as methods
        clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
        assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

        # base-estimators should provide either decision_function or
        # predict_proba (most regressors, for instance, should fail)
        clf_base_regressor = \
            CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
        assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
clf = ExtraTreesClassifier(random_state=1729, bootstrap=True,
                           class_weight="balanced")
selector = clf.fit(normalize(X), y)
# clf.feature_importances_
fs = SelectFromModel(selector, prefit=True)
X = fs.transform(X)
test = fs.transform(test)
print(X.shape, test.shape)

# m2_xgb = xgb.XGBClassifier(n_estimators=110, nthread=-1, max_depth=4, seed=1729)
m2_xgb = xgb.XGBClassifier(missing=np.nan, max_depth=6, n_estimators=350,
                           learning_rate=0.025, nthread=4, subsample=0.95,
                           colsample_bytree=0.85, seed=4242)
metLearn = CalibratedClassifierCV(m2_xgb, method='isotonic', cv=10)
metLearn.fit(X, y)

# Submission
probs = metLearn.predict_proba(test)
submission = pd.DataFrame({"ID": test_id, "TARGET": probs[:, 1]})
submission.to_csv("submission.csv", index=False)
# Generate the train set with the rest of the data.
train_data = data[test_cutoff:]
train_label = labels[test_cutoff:]

# KNN classifier
neigh = KNeighborsClassifier(n_neighbors=5, algorithm='auto',
                             metric='minkowski', p=1)
neigh.fit(train_data, train_label)
predictions_knn = neigh.predict(test_data)

# SVM classifier; LinearSVC has no predict_proba, so wrap it in
# CalibratedClassifierCV to make probability estimates available too
svc = svm.LinearSVC(random_state=0)
svc = OneVsRestClassifier(svc)
clf = CalibratedClassifierCV(svc, cv=10)
clf.fit(train_data, train_label)
predictions_svm = clf.predict(test_data)

# Decision tree classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_data, train_label)
predictions_decision = clf.predict(test_data)

# Neural network classifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,),
                    random_state=1, activation='tanh')
clf.fit(train_data, train_label)
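Since the calibrated wrapper is exactly what makes probabilities available for the LinearSVC, a minimal sketch of reading them out, assuming it runs right after the calibrated fit (before clf is rebound to the decision tree); prob_svm is a hypothetical name:

# Calibrated class probabilities for the SVM: one column per class, rows sum to 1
prob_svm = clf.predict_proba(test_data)
print(prob_svm.shape, prob_svm[:3])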
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

alg10 = GaussianNB()
alg10.fit(train[features], train["OutcomeType"])
probs = alg10.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

alg2 = RandomForestClassifier()
alg2.fit(train[features], train["OutcomeType"])
probs = alg2.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

# With no base estimator given, CalibratedClassifierCV calibrates a
# LinearSVC by default
alg7 = CalibratedClassifierCV()
alg7.fit(train[features], train["OutcomeType"])
probs = alg7.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

alg6 = DecisionTreeClassifier()
alg6.fit(train[features], train["OutcomeType"])
probs = alg6.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
print(score)

alg4 = AdaBoostClassifier()
alg4.fit(train[features], train["OutcomeType"])
probs = alg4.predict_proba(my_test[features])
score = metrics.log_loss(my_test["OutcomeType"], probs)
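The repeated fit/score pattern above collapses naturally into a loop; a minimal sketch under the same variable names (train, my_test, features), with the model labels being hypothetical:

# Hypothetical refactor: evaluate the same models in one pass
models = {
    "GaussianNB": GaussianNB(),
    "RandomForest": RandomForestClassifier(),
    "CalibratedLinearSVC": CalibratedClassifierCV(),
    "DecisionTree": DecisionTreeClassifier(),
    "AdaBoost": AdaBoostClassifier(),
}
for name, alg in models.items():
    alg.fit(train[features], train["OutcomeType"])
    probs = alg.predict_proba(my_test[features])
    print(name, metrics.log_loss(my_test["OutcomeType"], probs))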
def test_calibration_multiclass(method, ensemble, seed):
    def multiclass_brier(y_true, proba_pred, n_classes):
        Y_onehot = np.eye(n_classes)[y_true]
        return np.sum((Y_onehot - proba_pred) ** 2) / Y_onehot.shape[0]

    # Test calibration for multiclass with classifier that implements
    # only decision function.
    clf = LinearSVC(random_state=7)
    X, y = make_blobs(n_samples=500, n_features=100, random_state=seed,
                      centers=10, cluster_std=15.0)

    # Use an unbalanced dataset by collapsing 8 clusters into one class
    # to make the naive calibration based on a softmax more unlikely
    # to work.
    y[y > 2] = 2
    n_classes = np.unique(y).shape[0]
    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf.fit(X_train, y_train)

    cal_clf = CalibratedClassifierCV(clf, method=method, cv=5,
                                     ensemble=ensemble)
    cal_clf.fit(X_train, y_train)
    probas = cal_clf.predict_proba(X_test)
    # Check probabilities sum to 1
    assert_allclose(np.sum(probas, axis=1), np.ones(len(X_test)))

    # Check that the dataset is not too trivial, otherwise it's hard
    # to get interesting calibration data during the internal
    # cross-validation loop.
    assert 0.65 < clf.score(X_test, y_test) < 0.95

    # Check that the accuracy of the calibrated model is never degraded
    # too much compared to the original classifier.
    assert cal_clf.score(X_test, y_test) > 0.95 * clf.score(X_test, y_test)

    # Check that Brier loss of calibrated classifier is smaller than
    # loss obtained by naively turning OvR decision function to
    # probabilities via a softmax
    uncalibrated_brier = \
        multiclass_brier(y_test, softmax(clf.decision_function(X_test)),
                         n_classes=n_classes)
    calibrated_brier = multiclass_brier(y_test, probas,
                                        n_classes=n_classes)
    assert calibrated_brier < 1.1 * uncalibrated_brier

    # Test that calibration of a multiclass RandomForestClassifier also
    # decreases Brier loss
    clf = RandomForestClassifier(n_estimators=30, random_state=42)
    clf.fit(X_train, y_train)
    clf_probs = clf.predict_proba(X_test)
    uncalibrated_brier = multiclass_brier(y_test, clf_probs,
                                          n_classes=n_classes)

    cal_clf = CalibratedClassifierCV(clf, method=method, cv=5,
                                     ensemble=ensemble)
    cal_clf.fit(X_train, y_train)
    cal_clf_probs = cal_clf.predict_proba(X_test)
    calibrated_brier = multiclass_brier(y_test, cal_clf_probs,
                                        n_classes=n_classes)
    assert calibrated_brier < 1.1 * uncalibrated_brier
vec = TfidfVectorizer(ngram_range=(1, wins), min_df=3, max_df=0.9,
                      use_idf=1, smooth_idf=1, sublinear_tf=1)
kfold_x_train = train[column][train_index]
kfold_x_valid = train[column][test_index]
k_y_train = (train['class'] - 1).astype(int)[train_index]
k_y_valid = (train['class'] - 1).astype(int)[test_index]

print('Extracting TF-IDF features')
k_trn_term_doc = vec.fit_transform(kfold_x_train)
k_test_term_doc = vec.transform(kfold_x_valid)
test_term_doc = vec.transform(test[column])

# Fit the model
print('Fitting the model')
lin_clf = svm.LinearSVC()
lin_clf = CalibratedClassifierCV(lin_clf)
lin_clf.fit(k_trn_term_doc, k_y_train)

# Predict
print('Predicting')
oof_predict[test_index] += lin_clf.predict_proba(k_test_term_doc) / n_folds
predict += lin_clf.predict_proba(test_term_doc) / n_folds

# Compute accuracy; each row of oof_predict is already an ndarray of class
# probabilities, so take the argmax directly (the original eval(row) would
# fail on an ndarray)
p_l = [np.argmax(row) for row in oof_predict[test_index]]
accuracy = accuracy_score(p_l, k_y_valid.values)
f1 = f1_score(p_l, k_y_valid.values)
import pickle

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC

n_models = 5  # one model per clustering bucket

with open('ListOfBestParamsRS.pkl', 'rb') as f:
    best_params = pickle.load(f)

path = "C://Users//Arushi//PycharmProjects//ThesisChap2//ClusteringBuckets//"
for i in range(n_models):
    X_train = np.load(path + 'final_train_binarydata_' + str(i) + '.npy')
    Y_train = np.load(path + 'final_train_labels_' + str(i) + '.npy')
    bp = best_params[i]
    X_train = normalize(X_train.astype('float'))
    Y_train = Y_train.astype('float').astype(int)
    clf = LinearSVC(C=bp['C'], max_iter=10000, tol=1e-4)
    clf_sigmoid = CalibratedClassifierCV(clf, cv=4, method='sigmoid').fit(
        X_train, Y_train.ravel())
    with open('Model_ism_linear' + str(i) + '.pkl', 'wb') as f:
        pickle.dump(clf_sigmoid, f)
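A minimal sketch of the matching load step (same file names as the dump above; new_X is a hypothetical feature matrix, assumed to be preprocessed the same way as the training data):

# Reload one of the calibrated bucket models and score fresh data
with open('Model_ism_linear0.pkl', 'rb') as f:
    model = pickle.load(f)
probs = model.predict_proba(normalize(new_X.astype('float')))[:, 1]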
    'ngram_range': (1, 2)
}, {
    'max_features': 3200,
    'ngram_range': (1, 1)
}, {
    'max_features': 3200,
    'ngram_range': (1, 1)
}, {
    'max_features': 3200,
    'ngram_range': (1, 1)
}]

models = [
    LogisticRegression(**model_params[0]),
    RandomForestClassifier(**model_params[1]),
    CalibratedClassifierCV(
        base_estimator=SGDClassifier(**model_params[2], max_iter=250)),
    MultinomialNB(**model_params[3])
]

dataframe = pd.read_table(file)
col = ['tweetID', 'text', 'relevant']
df = dataframe[col]
print(df.info())

x = df.text
y = df.relevant
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42, stratify=y)
kf = KFold(n_splits=5)
fold_count = 0
for train_index, test_index in kf.split(one_hot_encoding_df):
    print("Training Data: %d, Testing Data: %d"
          % (len(train_index), len(test_index)))
    train_X = one_hot_encoding_df.iloc[train_index,
                                       one_hot_encoding_df.columns != 'Attrition']
    train_y = one_hot_encoding_df.iloc[train_index]["Attrition"]
    test_X = one_hot_encoding_df.iloc[test_index,
                                      one_hot_encoding_df.columns != 'Attrition']
    test_y = one_hot_encoding_df.iloc[test_index]["Attrition"]

    # Model
    # clf = GaussianNB()
    clf = DecisionTreeClassifier(max_depth=2, min_samples_split=20,
                                 min_samples_leaf=20)
    model_isotonic = CalibratedClassifierCV(clf, cv=10, method='isotonic')
    # model_sigmoid = CalibratedClassifierCV(clf, cv=4, method='sigmoid')
    model = model_isotonic.fit(train_X, train_y)
    test_predict = model.predict(test_X)
    # avg_feature_importance.append(model.feature_importances_)

    acc, precision, recall, f1, matrix = evaluation(test_y, test_predict)
    print("Fold: %d, Accuracy: %.3f, Precision: %.3f, Recall: %.3f, F1: %.3f"
          % (fold_count + 1, acc, precision, recall, f1))
    avg_acc += acc
    avg_precision += precision
    avg_recall += recall
    avg_f1 += f1
    avg_confusion_matrix.append(matrix)
    fold_count += 1
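The running sums above still need dividing by the fold count once the loop ends; a minimal sketch of that final step, assuming the avg_* accumulators were initialised to 0 before the loop:

# Turn the accumulated per-fold metrics into 5-fold averages
n_folds = kf.get_n_splits()
print("Avg accuracy: %.3f, precision: %.3f, recall: %.3f, F1: %.3f"
      % (avg_acc / n_folds, avg_precision / n_folds,
         avg_recall / n_folds, avg_f1 / n_folds))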
# Default parameters:
# SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
#               fit_intercept=True, max_iter=None, tol=None, shuffle=True,
#               verbose=0, epsilon=0.1, n_jobs=1, random_state=None,
#               learning_rate='optimal', eta0=0.0, power_t=0.5,
#               class_weight=None, warm_start=False, average=False, n_iter=None)
# Some of the methods:
# fit(X, y[, coef_init, intercept_init, ...])  Fit linear model with Stochastic Gradient Descent.
# predict(X)  Predict class labels for samples in X.
###############################################################################
log_error_array = []
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l1', loss='hinge', random_state=42)
    clf.fit(X_train, y_train)
    # hinge-loss SGD has no predict_proba, so sigmoid-calibrate it
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_test)
    loss = log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15)
    log_error_array.append(loss)
    print('For alpha =', i, 'the log loss is:', loss)
###############################################################################
fig, ax = plt.subplots()
ax.plot(alpha, log_error_array, c='g')
for i, txt in enumerate(np.round(log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
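After the sweep, the best alpha is simply the argmin of the recorded losses; a minimal sketch of refitting at that value, reusing only names already defined above:

# Refit at the alpha with the lowest validation log loss
best_alpha = alpha[np.argmin(log_error_array)]
clf = SGDClassifier(alpha=best_alpha, penalty='l1', loss='hinge', random_state=42)
clf.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_train, y_train)
print("Best alpha:", best_alpha, "log loss:",
      log_loss(y_test, sig_clf.predict_proba(X_test),
               labels=clf.classes_, eps=1e-15))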
roc_accuracy_8 = 0
p1_8 = 0
p0_8 = 0
accuracy_9per = 0
apc_9per = 0
roc_accuracy_9per = 0
p1_9per = 0
p0_9per = 0
accuracy_9link = 0
apc_9link = 0
roc_accuracy_9link = 0
p1_9link = 0
p0_9link = 0

lin_clf = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=100,
                        random_state=42949694)
mci = 0
clf = CalibratedClassifierCV(lin_clf)
for i in range(0, 10):
    for j in range(0, 5):
        train = pd.read_csv('fb' + str(i) + 'train' + str(j) + 'class_persistency.csv',
                            delimiter='\t')
        test = pd.read_csv('fb' + str(i) + 'test' + str(j) + 'class_persistency.csv',
                           delimiter='\t')
        Y_tr = train['status_class'].values
        del train['user1']
        del train['user2']
        del train['time']
        del train['status']
        ## del train['class']
        del train['status_class']
        X_tr = train.values
        Y_te_st = test['status_class'].values
        Y_te_per = test['class_per'].values
        Y_te_li = test['class'].values
LR1 = LogisticRegression(penalty='l1', tol=0.01)
LR2 = LogisticRegression(penalty='l2', tol=0.01)
DT = DecisionTreeClassifier(random_state=0, max_depth=15, min_samples_leaf=2)
RF = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100,
                            random_state=1, verbose=True)
NN40 = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(40,),
                     random_state=1, activation='relu', verbose=True, max_iter=20)
NN1600 = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(1600,),
                       random_state=1, activation='relu', verbose=True, max_iter=20)
MLPclf = MLPClassifier(activation='relu', learning_rate='constant', alpha=1e-4,
                       hidden_layer_sizes=(80, 40), random_state=1, batch_size=1,
                       verbose=False, max_iter=20, warm_start=True)
clf = xgb.XGBClassifier()
metLearn = CalibratedClassifierCV(clf, method='isotonic', cv=2)
leanerSVML1 = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, random_state=0)
leanerSVML2 = LinearSVC(penalty='l2', loss='hinge', dual=True, random_state=0)
# Note: clf is rebound here; metLearn keeps the XGBClassifier instance above
clf = svm.SVC(probability=True, verbose=True)
eclf1 = VotingClassifier(estimators=[('lr2', LR2), ('leanerSVML2', leanerSVML2),
                                     ('DT', DT)], voting='hard')
kf = KFold(n_splits=10, random_state=None, shuffle=False)
X = x_train.values
y = y_train.values
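The fragment defines the models and the KFold splitter but not the evaluation loop; a minimal sketch of one way to use the kf splits with the calibrated XGB wrapper (the accuracy metric and loop body are assumptions, not the original's evaluation code):

# Hypothetical 10-fold evaluation of the calibrated XGB model
from sklearn.metrics import accuracy_score
for train_idx, test_idx in kf.split(X):
    metLearn.fit(X[train_idx], y[train_idx])
    preds = metLearn.predict(X[test_idx])
    print("fold accuracy:", accuracy_score(y[test_idx], preds))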
def test_calibration(data, method, ensemble):
    # Test calibration objects with isotonic and sigmoid
    n_samples = 100
    X, y = data
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    cal_clf = CalibratedClassifierCV(clf, cv=y.size + 1, ensemble=ensemble)
    with pytest.raises(ValueError):
        cal_clf.fit(X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=5,
                                         ensemble=ensemble)
        # Note that this fit overwrites the fit on the entire training
        # set
        cal_clf.fit(this_X_train, y_train, sample_weight=sw_train)
        prob_pos_cal_clf = cal_clf.predict_proba(this_X_test)[:, 1]

        # Check that brier score has improved after calibration
        assert (brier_score_loss(y_test, prob_pos_clf) >
                brier_score_loss(y_test, prob_pos_cal_clf))

        # Check invariance against relabeling [0, 1] -> [1, 2]
        cal_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        assert_array_almost_equal(prob_pos_cal_clf,
                                  prob_pos_cal_clf_relabeled)

        # Check invariance against relabeling [0, 1] -> [-1, 1]
        cal_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        assert_array_almost_equal(prob_pos_cal_clf,
                                  prob_pos_cal_clf_relabeled)

        # Check invariance against relabeling [0, 1] -> [1, 0]
        cal_clf.fit(this_X_train, (y_train + 1) % 2, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        if method == "sigmoid":
            assert_array_almost_equal(prob_pos_cal_clf,
                                      1 - prob_pos_cal_clf_relabeled)
        else:
            # Isotonic calibration is not invariant against relabeling
            # but should improve in both cases
            assert (brier_score_loss(y_test, prob_pos_clf) >
                    brier_score_loss((y_test + 1) % 2,
                                     prob_pos_cal_clf_relabeled))
        dim=obd.dim_context,
        random_state=random_state,
    )
    if counterfactual_policy != "logistic_ts":
        kwargs["epsilon"] = epsilon
    policy = counterfactual_policy_dict[counterfactual_policy](**kwargs)
    policy_name = f"{policy.policy_name}_{context_set}"
    # obtain batch logged bandit feedback generated by the behavior policy
    bandit_feedback = obd.obtain_batch_bandit_feedback()
    # ground-truth policy value of the random policy, i.e. the empirical mean
    # of the factual (observed) rewards (on-policy estimation)
    ground_truth = bandit_feedback["reward"].mean()
    # a base ML model for the regression model used in Direct Method and Doubly Robust
    base_model = CalibratedClassifierCV(
        HistGradientBoostingClassifier(**hyperparams))
    # run a counterfactual bandit algorithm on the logged bandit feedback data
    selected_actions = run_bandit_simulation(bandit_feedback=bandit_feedback,
                                             policy=policy)
    # estimate the policy value of the counterfactual algorithm with the
    # three OPE estimators
    ope = OffPolicyEvaluation(
        bandit_feedback=bandit_feedback,
        regression_model=RegressionModel(base_model=base_model),
        action_context=obd.action_context,
        ope_estimators=[
            InverseProbabilityWeighting(),
            DirectMethod(),
            DoublyRobust()
        ],
    )
    estimated_policy_value, estimated_interval = ope.summarize_off_policy_estimates(
clf_probs = clf.predict_proba(X_test)
score = log_loss(y_test, clf_probs)

clf = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=5,
    max_depth=40,
    max_features=30,
    random_state=2602,
)
# Train decision tree classifier, calibrate on held-out data and evaluate
# on test data
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
# With cv="prefit", the calibration set should be disjoint from the data
# used to fit clf above
sig_clf.fit(train, target)
sig_clf_probs = sig_clf.predict_proba(X_test)
sig_score = log_loss(y_test, sig_clf_probs)

print('\n-----------------------')
print(' logloss train: %.5f' % score)
print(' logloss valid: %.5f' % sig_score)
print('-----------------------')

# param_grid = {
#     'n_estimators': [10],
#     'max_features': ['auto', 2, 30],
#     'min_samples_leaf': [2, 8],
#     'max_leaf_nodes': [2, 8],
#     'min_samples_split': [2, 5],
X_train_valid, y_train_valid = X[:800], y[:800]
X_test, y_test = X[800:], y[800:]

# Train uncalibrated random forest classifier on whole train and validation
# data and evaluate on test data
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train_valid, y_train_valid)
clf_probs = clf.predict_proba(X_test)
score = log_loss(y_test, clf_probs)

# Train random forest classifier, calibrate on validation data and evaluate
# on test data
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
sig_clf.fit(X_valid, y_valid)
sig_clf_probs = sig_clf.predict_proba(X_test)
sig_score = log_loss(y_test, sig_clf_probs)

# Plot changes in predicted probabilities via arrows
plt.figure()
colors = ["r", "g", "b"]
for i in range(clf_probs.shape[0]):
    plt.arrow(clf_probs[i, 0], clf_probs[i, 1],
              sig_clf_probs[i, 0] - clf_probs[i, 0],
              sig_clf_probs[i, 1] - clf_probs[i, 1],
              color=colors[y_test[i]], head_width=1e-2)
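The fragment uses X_train/X_valid without showing the split; a minimal sketch of one way to carve them out of the first 800 samples (the 600/200 boundary is an assumption, chosen so the two sets stay disjoint as cv="prefit" requires):

# Hypothetical split of the 800 train+valid samples
X_train, y_train = X[:600], y[:600]
X_valid, y_valid = X[600:800], y[600:800]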
def BuildModel_Apply_Performance(df, clf, cv_num, df_notSel, apply_unk,
                                 df_unknowns, test_df, classes, POS, NEG, j,
                                 ALG, THRSHD_test, save):
    from sklearn.model_selection import cross_val_predict

    # Data from balanced dataframe
    y = df['Class']
    X = df.drop(['Class'], axis=1)

    # For LinearSVM need to have calibrated classifier to get probability
    # scores, but not for importance scores
    if ALG.lower() == 'svm':
        from sklearn.calibration import CalibratedClassifierCV
        clf2 = clf
        clf2.fit(X, y)
        # adds the probability output to LinearSVC
        clf = CalibratedClassifierCV(clf, cv=3)
    else:
        clf2 = 'pass'

    # Obtain the predictions using 10 fold cross validation
    # (uses KFold cv by default):
    cv_proba = cross_val_predict(estimator=clf, X=X, y=y, cv=int(cv_num),
                                 method='predict_proba')
    cv_pred = cross_val_predict(estimator=clf, X=X, y=y, cv=cv_num)

    # Fit a model using all data and apply to
    # (1) instances that were not selected using cl_train
    # (2) instances with unknown class
    # (3) test instances
    clf.fit(X, y)

    # Save model for future persistence
    print(f'\nSaving model as {save + ".joblib"}\n')
    dump(clf, save + '.joblib')

    notSel_proba = clf.predict_proba(df_notSel.drop(['Class'], axis=1))
    if apply_unk:
        unk_proba = clf.predict_proba(df_unknowns.drop(['Class'], axis=1))
    if not isinstance(test_df, str):
        test_proba = clf.predict_proba(test_df.drop(['Class'], axis=1))
        test_pred = clf.predict(test_df.drop(['Class'], axis=1))

    # Evaluate performance
    if len(classes) == 2:
        i = 0
        for clss in classes:
            if clss == POS:
                POS_IND = i
                break
            i += 1
        scores = cv_proba[:, POS_IND]

        # Generate run statistics from balanced dataset scores
        result = fun.Performance(y, cv_pred, scores, clf, clf2, classes, POS,
                                 POS_IND, NEG, ALG, THRSHD_test)

        # Generate data frame with all scores
        score_columns = ["score_%s" % j]
        df_sel_scores = pd.DataFrame(data=cv_proba[:, POS_IND],
                                     index=df.index, columns=score_columns)
        df_notSel_scores = pd.DataFrame(data=notSel_proba[:, POS_IND],
                                        index=df_notSel.index,
                                        columns=score_columns)
        current_scores = pd.concat([df_sel_scores, df_notSel_scores], axis=0)

        if apply_unk:
            df_unk_scores = pd.DataFrame(data=unk_proba[:, POS_IND],
                                         index=df_unknowns.index,
                                         columns=score_columns)
            current_scores = pd.concat([current_scores, df_unk_scores], axis=0)
        if not isinstance(test_df, str):
            df_test_scores = pd.DataFrame(data=test_proba[:, POS_IND],
                                          index=test_df.index,
                                          columns=score_columns)
            current_scores = pd.concat([current_scores, df_test_scores], axis=0)
            scores_test = test_proba[:, POS_IND]
            result_test = fun.Performance(test_df['Class'], test_pred,
                                          scores_test, clf, clf2, classes,
                                          POS, POS_IND, NEG, ALG, THRSHD_test)
    else:
        # Generate run statistics from balanced dataset scores
        result = fun.Performance_MC(y, cv_pred, classes)

        # Generate data frame with all scores
        score_columns = ["%s_score_%s" % (clss, j) for clss in classes]
        df_sel_scores = pd.DataFrame(data=cv_proba, index=df.index,
                                     columns=score_columns)
        df_notSel_scores = pd.DataFrame(data=notSel_proba,
                                        index=df_notSel.index,
                                        columns=score_columns)
        current_scores = pd.concat([df_sel_scores, df_notSel_scores], axis=0)

        if apply_unk:
            df_unk_scores = pd.DataFrame(data=unk_proba,
                                         index=df_unknowns.index,
                                         columns=score_columns)
            current_scores = pd.concat([current_scores, df_unk_scores], axis=0)
        if not isinstance(test_df, str):
            df_test_scores = pd.DataFrame(data=test_proba,
                                          index=test_df.index,
                                          columns=score_columns)
            current_scores = pd.concat([current_scores, df_test_scores], axis=0)
            result_test = fun.Performance_MC(test_df['Class'], test_pred, classes)

    if not isinstance(test_df, str):
        return result, current_scores, result_test
    else:
        return result, current_scores