def gbdt_lr_train(libsvmFileName):
    """Compare a GBDT, a plain LR and two GBDT+LR hybrids on one libsvm file.

    The samples are split 70/30 with a fixed seed.  Four AUC values measured
    on the held-out part are printed: the GBDT alone, LR on the raw features,
    LR on the one-hot encoded GBDT leaf indices, and LR on the leaf encoding
    stacked with the raw features.  Nothing is returned.

    :param libsvmFileName: path of the svmlight/libsvm formatted sample file
    """
    # Load the samples and carve out the hold-out set.
    features, targets = load_svmlight_file(libsvmFileName)
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, targets, test_size=0.3, random_state=42)

    # GBDT on its own.
    booster = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    booster.fit(X_tr, y_tr)
    booster_scores = booster.predict_proba(X_te.toarray())[:, 1]
    print('gbdt auc: %.5f' % roc_auc_score(y_te, booster_scores))

    # LR trained directly on the original features.
    raw_lr = LogisticRegression()
    raw_lr.fit(X_tr, y_tr)
    raw_scores = raw_lr.predict_proba(X_te)[:, 1]
    print('基于原有特征的LR AUC: %.5f' % roc_auc_score(y_te, raw_scores))

    # Describe every sample by the leaf it reaches in each tree.
    leaves_tr = booster.apply(X_tr)[:, :, 0]
    leaves_te = booster.apply(X_te)[:, :, 0]
    n_train = leaves_tr.shape[0]

    # One-hot encode the leaf indices; the encoder is fitted on train and
    # test leaves together, then the result is cut back into its two parts.
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(
        np.concatenate((leaves_tr, leaves_te), axis=0))

    # LR trained on the leaf encoding only.
    leaf_lr = LogisticRegression()
    leaf_lr.fit(encoded[:n_train, :], y_tr)
    leaf_scores = leaf_lr.predict_proba(encoded[n_train:, :])[:, 1]
    print('基于GBDT特征编码后的LR AUC: %.5f' % roc_auc_score(y_te, leaf_scores))

    # LR trained on the leaf encoding stacked with the original features.
    combo_lr = LogisticRegression(n_jobs=-1)
    combo_tr = hstack([encoded[:n_train, :], X_tr])
    combo_te = hstack([encoded[n_train:, :], X_te])
    print(combo_tr.shape)
    combo_lr.fit(combo_tr, y_tr)
    combo_scores = combo_lr.predict_proba(combo_te)[:, 1]
    print('基于组合特征的LR AUC: %.5f' % roc_auc_score(y_te, combo_scores))
def main():
    """Fit or load a class-balanced logistic regression on the credit data.

    Steps:
      1. Read the training and test CSVs, mapping sentinel values to NaN
         (96/98 in the three past-due counters, 0 in ``age``).
      2. Hold out one third of the training rows with a stratified split.
      3. Mean-impute NaNs with statistics from the new training part only,
         then drop feature column 5 ('NOCredit' in the column order below)
         from the training and validation matrices.
      4. If ``lr_model.m`` does not exist, train the model, persist it and
         report the training AUC; otherwise load it and report the AUC on
         the validation part.
    """
    # 1. Load the data; the per-column NaN markers are paired with colnames.
    colnames = [
        'ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 'DebtRatio',
        'Income', 'NOCredit', 'NOTimes90', 'NORealEstate', 'NOTime60-89',
        'NODependents'
    ]
    col_nas = [
        '', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA',
        [98, 96], 'NA'
    ]
    col_na_values = creatDictKV(colnames, col_nas)

    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames,
                          na_values=col_na_values, skiprows=[0])
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in every pandas version and returns the same ndarray.
    x_train = dftrain.values

    dftest = pd.read_csv("./data/cs-test.csv", names=colnames,
                         na_values=col_na_values, skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.values

    # 2. Stratified split of the training data into a new training part and
    #    a validation part (n_splits=1, so the loop body runs once).
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]
    y_train = y_train_new
    x_train = x_train_new

    # 3. Replace NaN by the column mean, fitted on the new training part.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    # The imputed test matrix is not used further in this function.
    x_test = imp.transform(x_test)

    x_train = np.delete(x_train, 5, axis=1)
    x_test_new = np.delete(x_test_new, 5, axis=1)

    # 4. Train-and-save on the first run, load-and-validate afterwards.
    # NOTE(review): the source had lost its indentation; the validation
    # scoring is kept inside the else branch, as literally written - confirm.
    if not os.path.isfile("lr_model.m"):
        clf = LogisticRegression(class_weight="balanced")
        clf.fit(x_train, y_train)
        joblib.dump(clf, "lr_model.m")
        predicted_probs_train = clf.predict_proba(x_train)
        predicted_probs_train = [x[1] for x in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
    else:
        clf = joblib.load("lr_model.m")
        predicted_probs_test_new = clf.predict_proba(x_test_new)
        predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
        computeAUC(y_test_new, predicted_probs_test_new)
def lr_training_and_test(X_train, X_test, y_train, y_test):
    """Fit a default logistic regression and hand its outputs to the evaluator.

    Hard labels and positive-class probabilities (column 1 of
    ``predict_proba``) are computed for both the training and the test set
    and passed to ``evaluate_model``.

    :return: the fitted ``LogisticRegression`` instance
    """
    # A parenthesised single-argument print prints the same text on
    # Python 2 and Python 3; the bare print statement is Python 2 only.
    print('model: logistic regression.')
    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    y_train_pred_prob = model.predict_proba(X_train)[:, 1]
    y_test_pred_prob = model.predict_proba(X_test)[:, 1]

    evaluate_model(y_train, y_train_pred, y_train_pred_prob,
                   y_test, y_test_pred, y_test_pred_prob)
    return model
class LogReg:
    """Class-balanced logistic regression over ./data/train.csv and test.csv.

    Constructing an instance runs the whole pipeline: load both CSV files,
    fit on the training rows, and compute class probabilities for the test
    rows.  In both files the first column is split off from the features:
    it is the target in the training file and the row id in the test file.
    """

    def __init__(self):
        self.load_data()
        self.clf = LogisticRegression(class_weight='balanced')
        self.train()
        self.predict()

    def load_data(self):
        """Read both CSV files and split them into features / first column."""
        train_rows = pd.read_csv('./data/train.csv', header=0).values
        test_rows = pd.read_csv('./data/test.csv', header=0).values
        self.train_X = train_rows[:, 1:]
        self.train_Y = train_rows[:, 0]
        self.test_X = test_rows[:, 1:]
        self.test_ID = test_rows[:, 0]

    def train(self):
        """Fit the classifier on the training data."""
        self.clf.fit(self.train_X, self.train_Y)

    def predict(self):
        """Store the per-class probabilities of the test rows in test_Y."""
        self.test_Y = self.clf.predict_proba(self.test_X)

    def get_training_accuracy(self):
        """Return the classifier's score on the training data."""
        return self.clf.score(self.train_X, self.train_Y)

    def store_result(self):
        """Write the ids and column-1 probabilities to the result CSV."""
        result = pd.DataFrame()
        result['Id'] = self.test_ID
        result['Action'] = self.test_Y[:, 1]
        result.to_csv('./data/results/c1_result.csv', index=False)
def get_second_level(train_dim, train_label, test_dim, num_class):
    """Train the second-level logistic regression on first-level outputs.

    The meta features come from ``get_first_level``; a default
    ``LogisticRegression`` is fitted on the training meta features and
    scores the test meta features.

    :return: ``(meta_train, meta_test, pre_score)`` where ``pre_score`` is
        the ``predict_proba`` output for ``meta_test``
    """
    meta_train, meta_test = get_first_level(
        train_dim, train_label, test_dim, num_class)
    second_level = LogisticRegression()
    second_level.fit(meta_train, train_label)
    return meta_train, meta_test, second_level.predict_proba(meta_test)
def logic_pca_standard(y, n):
    """Logistic regression after PCA reduction and standardisation.

    Reads the feature matrix from the module-level name ``train``, reduces
    it to ``n`` principal components, splits 75/25 with a fixed seed,
    standardises with statistics from the training part, fits a default
    logistic regression and prints accuracy, a classification report and
    the ROC AUC for the held-out part.

    :param y: target labels aligned with the rows of ``train``
    :param n: value passed to ``PCA(n_components=...)``
    """
    # Dimensionality reduction is fitted on the full matrix before the split.
    reducer = PCA(n_components=n)
    reduced = reducer.fit_transform(train)

    # Train / hold-out split.
    x_fit, x_hold, y_fit, y_hold = train_test_split(
        reduced, y, test_size=0.25, random_state=24)

    # Standardisation (fit on the training part, applied to both).
    scaler = StandardScaler()
    print(scaler)
    x_fit = scaler.fit_transform(x_fit)
    x_hold = scaler.transform(x_hold)

    # Estimator.
    model = LogisticRegression()
    model.fit(x_fit, y_fit)

    # Accuracy and per-class precision / recall.
    accuracy = model.score(x_hold, y_hold)
    print("准确率(逻辑回归+降维+标准化):{}".format(accuracy))
    report = classification_report(
        y_hold, model.predict(x_hold),
        labels=[0, 1], target_names=["非高收入", "高收入"])
    print("精确率和召回率:", report)

    # ROC AUC from the column-1 probabilities.
    probabilities = model.predict_proba(x_hold)
    fpr, tpr, thresholds = metrics.roc_curve(y_hold, probabilities[:, 1])
    print("auc值为:{}".format(metrics.auc(fpr, tpr)))
class Ensemble:
    """Stacking ensemble: base estimators feed a ``MetaEstimator``.

    The meta estimator is trained on cross-validated (5-fold, shuffled)
    ``predict_proba`` outputs of the base estimators; only the first
    probability column of each base estimator is used as a meta feature.
    """

    def __init__(self, base_estimators=None, random_state=0):
        self.base_estimators = base_estimators
        self.estimator = MetaEstimator()
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the meta estimator on out-of-fold outputs, then refit bases."""
        folds = KFold(n_splits=5, shuffle=True,
                      random_state=self.random_state)
        meta_columns = []
        for base in self.base_estimators:
            out_of_fold = cross_val_predict(
                base, X, y, cv=folds, method='predict_proba')
            print('prediction of', base.__class__.__name__)
            print(out_of_fold)
            meta_columns.append(out_of_fold.T[0])

        print('all predictions')
        print(np.array(meta_columns), y)
        self.estimator.fit(np.array(meta_columns).T, y)

        # The base estimators used at prediction time see the full data.
        for base in self.base_estimators:
            base.fit(X, y)

    def predict(self, X, margin):
        """Return the meta probabilities compared element-wise to margin."""
        return np.array(self.predict_proba(X)) > margin

    def predict_proba(self, X):
        """Return the meta estimator's probabilities for ``X``."""
        meta_columns = [
            base.predict_proba(X).T[0] for base in self.base_estimators
        ]
        return self.estimator.predict_proba(np.array(meta_columns).T)
def main():
    """Train a per-annotator CWI classifier and write thresholded labels.

    Features of the chosen annotator's training file and of the test file
    are vectorised together so both share one feature space.  A logistic
    regression is fitted on the training rows; each test row is labelled
    "1" when its column-1 probability reaches ``--threshold`` and "0"
    otherwise.  The labels are written, one per line, to
    ``<annotator>.pred`` and the process exits with status 0.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--annotator', type=str, default="03")
    parser.add_argument('--penalty', type=str, choices=["l1", "l2"], default="l1")
    args = parser.parse_args()

    current_single_ann = scriptdir + "/../data/cwi_training/cwi_training_" + args.annotator + ".lbl.conll"
    testfile = scriptdir + "/../data/cwi_testing/cwi_testing.txt.lbl.conll"
    X__dict_train, y_train, v_train = feats_and_classify.collect_features(current_single_ann, vectorize=False)
    X_dict_test, y_test, v_test = feats_and_classify.collect_features(testfile, vectorize=False)

    # Vectorise train and test dictionaries in one pass, then split by the
    # number of training labels.
    featdicts = X__dict_train + X_dict_test
    vect = DictVectorizer()
    X = vect.fit_transform(featdicts).toarray()
    X_train = X[:len(y_train)]
    X_test = X[len(y_train):]

    # liblinear handles both l1 and l2.  The default solver of newer
    # scikit-learn releases (lbfgs) rejects penalty="l1", which is this
    # script's default penalty.
    maxent = LogisticRegression(penalty=args.penalty, solver="liblinear")
    maxent.fit(X_train, y_train)
    y_pred_proba = maxent.predict_proba(X_test)
    ypred_i = ["1" if pair[1] >= args.threshold else "0" for pair in y_pred_proba]

    # The context manager closes the file even if writing fails.
    with open(args.annotator + ".pred", mode="w") as fout:
        print("\n".join(ypred_i), file=fout)
    sys.exit(0)
class LogisticRegressionImpl():
    """Thin wrapper that forwards hyperparameters and calls to ``SKLModel``.

    The constructor arguments are stored in ``_hyperparams`` and passed
    unchanged to ``SKLModel``; ``fit``/``predict``/``predict_proba``/
    ``decision_function`` delegate to the wrapped model.
    """

    def __init__(self, penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight='balanced', random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=None):
        self._hyperparams = dict(
            penalty=penalty,
            dual=dual,
            tol=tol,
            C=C,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight,
            random_state=random_state,
            solver=solver,
            max_iter=max_iter,
            multi_class=multi_class,
            verbose=verbose,
            warm_start=warm_start,
            n_jobs=n_jobs,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (with or without targets); return self."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        return self._wrapped_model.decision_function(X)
def test_logreg_predict_proba_multinomial():
    """Check log-loss ordering of multinomial vs. OvR vs. logistic scaling."""
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Predicted probabilities using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    multinomial = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    multinomial.fit(X, y)
    multinomial_loss = log_loss(y, multinomial.predict_proba(X))

    one_vs_rest = LogisticRegression(multi_class="ovr", solver="lbfgs")
    one_vs_rest.fit(X, y)
    one_vs_rest_loss = log_loss(y, one_vs_rest.predict_proba(X))
    assert_greater(one_vs_rest_loss, multinomial_loss)

    # Predicted probabilities using the soft-max function should give a
    # smaller loss than those using the logistic function.
    softmax_loss = log_loss(y, multinomial.predict_proba(X))
    logistic_loss = log_loss(y, multinomial._predict_proba_lr(X))
    assert_greater(logistic_loss, softmax_loss)
def get_second_level(train_dim, train_label, test_dim, num_class):
    """Train the second-level model on meta features fused with raw inputs.

    The first-level outputs from ``get_first_level`` are concatenated
    column-wise with the original feature matrices, and an l1-penalised
    logistic regression (C=0.03125) is fitted on the fused training matrix.

    :return: ``(meta_train_fusion, meta_test_fusion, pre_score)`` where
        ``pre_score`` is the ``predict_proba`` output for the fused test
        matrix
    """
    meta_train, meta_test = get_first_level(
        train_dim, train_label, test_dim, num_class)
    fused_train = np.concatenate((meta_train, train_dim), axis=1)
    fused_test = np.concatenate((meta_test, test_dim), axis=1)

    # NOTE(review): penalty="l1" needs an l1-capable solver; which solver is
    # the default depends on the installed scikit-learn version - confirm.
    second_level = LogisticRegression(C=0.03125, penalty="l1")
    second_level.fit(fused_train, train_label)
    return fused_train, fused_test, second_level.predict_proba(fused_test)
def main():
    """Compare LR, SVM and LightGBM by hold-out AUC on the survival data.

    The data is loaded and preprocessed by the project helpers, an extra
    feature ``My = Age + Sex`` is added, and 30% of the training rows are
    held out (seed 1).  Each model is fitted on the remaining rows and its
    AUC on the hold-out rows is printed.
    """
    train_data, test_data = load_data()
    train_data, test_data = data_fillna(train_data, test_data)
    train_data, test_data = data_process(train_data, test_data)

    # Feature selection.
    features = [
        'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'My'
    ]
    train_data['My'] = train_data['Age'] + train_data['Sex']
    test_data['My'] = test_data['Age'] + test_data['Sex']
    train_labels = train_data['Survived']
    train_features = train_data[features]

    # train_test_split returns (X_train, X_test, y_train, y_test); the names
    # below follow that order.
    X_fit, X_val, y_fit, y_val = train_test_split(
        train_features, train_labels, test_size=0.3, random_state=1)

    LR = LogisticRegression(max_iter=100, verbose=True, random_state=33, tol=1e-4)
    LR.fit(X_fit, y_fit)
    predict = LR.predict_proba(X_val)[:, 1]
    # Coefficients scaled so that the largest one equals 100.
    feature_importance = LR.coef_[0]
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    print('feature importance is:')
    print(feature_importance)
    print("LR auc:%0.6lf" % metrics.roc_auc_score(y_val, predict))

    SVM = SVC(kernel='rbf', probability=True, C=0.2)
    SVM.fit(X_fit, y_fit)
    predict_svm = SVM.predict_proba(X_val)[:, 1]
    print("SVM auc:%0.6lf" % metrics.roc_auc_score(y_val, predict_svm))

    LGB = lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        verbose=0,
        learning_rate=0.01,
        num_leaves=31,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=2,
        lambda_l1=0.8,
        lambda_l2=0,
        max_depth=5,
        cat_smooth=1)
    LGB.fit(X_fit, y_fit)
    predict_LGB = LGB.predict_proba(X_val)[:, 1]
    lgb.plot_importance(LGB, max_num_features=30)
    # Score the LightGBM predictions; previously the LR predictions were
    # passed here, so the LR AUC was reported a second time.
    print("LGB auc:%0.6lf" % metrics.roc_auc_score(y_val, predict_LGB))
def test_logreg_predict_proba_multinomial():
    """Check log-loss ordering of multinomial vs. OvR vs. logistic scaling."""
    X, y = make_classification(n_samples=10, n_features=20, random_state=0,
                               n_classes=3, n_informative=10)

    # Predicted probabilities using the true-entropy loss should give a
    # smaller loss than those using the ovr method.
    softmax_clf = LogisticRegression(multi_class="multinomial", solver="lbfgs")
    softmax_clf.fit(X, y)
    loss_softmax_clf = log_loss(y, softmax_clf.predict_proba(X))

    ovr_clf = LogisticRegression(multi_class="ovr", solver="lbfgs")
    ovr_clf.fit(X, y)
    loss_ovr_clf = log_loss(y, ovr_clf.predict_proba(X))
    assert_greater(loss_ovr_clf, loss_softmax_clf)

    # Predicted probabilities using the soft-max function should give a
    # smaller loss than those using the logistic function.
    loss_softmax_proba = log_loss(y, softmax_clf.predict_proba(X))
    loss_logistic_proba = log_loss(y, softmax_clf._predict_proba_lr(X))
    assert_greater(loss_logistic_proba, loss_softmax_proba)
def test_nnet(n_samples=200, n_features=5, distance=0.5, complete=False):
    """Smoke-test the neural network classes on two separable blobs.

    :param complete: if True, every (loss, network type, trainer)
        combination is trained and its AUC printed next to a logistic
        regression baseline; the test then fails on purpose so the output
        is shown.  Otherwise four shifted combinations are trained and each
        must reach an AUC above 0.8.
    """
    # The blob centers have n_features coordinates, so the same value is
    # passed as n_features rather than a hard-coded 5.
    X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=[
        numpy.ones(n_features) * distance,
        -numpy.ones(n_features) * distance
    ])

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MLPClassifier,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # checking all possible combinations
        for loss in nnet.losses:
            for NNType in nn_types:
                for trainer in nnet.trainers:
                    nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42)
                    nn.fit(X, y, epochs=100)
                    print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
    else:
        # checking combinations of losses, nn_types, trainers, most of them
        # are used once during tests.  The attempt count is fixed at 4 (the
        # earlier max(...) value was always overwritten).
        attempts = 4
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        # dict.keys() cannot be indexed on Python 3; materialise it first.
        loss_names = list(nnet.losses.keys())
        trainer_names = list(nnet.trainers.keys())
        for attempt in range(attempts):
            loss = loss_names[(attempt + losses_shift) % len(loss_names)]
            trainer = trainer_names[(attempt + trainers_shift) % len(trainer_names)]

            nn_type = nn_types[attempt % len(nn_types)]

            nn = nn_type(layers=[5], loss=loss, trainer=trainer, random_state=42)
            print(nn)
            nn.fit(X, y, epochs=200)
            assert roc_auc_score(y, nn.predict_proba(X)[:, 1]) > 0.8, \
                'quality of model is too low: {}'.format(nn)
def compare_nnets_quality(n_samples=200, n_features=7, distance=0.8):
    """Print the training AUC of every network/trainer pair and of an LR.

    Only the 'log_loss' loss is exercised.  Network classes are read from
    the module-level ``nn_types`` and trainers from ``nnet.trainers``.
    """
    X, y = generate_sample(n_samples=n_samples, n_features=n_features,
                           distance=distance)

    # checking all possible combinations (restricted to one loss)
    for loss_name in ['log_loss']:  # nnet.losses:
        for network_cls in nn_types:
            for trainer_name in nnet.trainers:
                network = network_cls(layers=[5], loss=loss_name,
                                      trainer=trainer_name, epochs=100,
                                      random_state=42)
                network.fit(X, y)
                auc_value = roc_auc_score(y, network.predict_proba(X)[:, 1])
                print(auc_value, network)

    baseline = LogisticRegression().fit(X, y)
    print(roc_auc_score(y, baseline.predict_proba(X)[:, 1]), baseline)
def test_nnet(n_samples=200, n_features=7, distance=0.8, complete=False):
    """
    Train the neural network classes on a generated sample.

    :param complete: if True, all possible combinations will be checked,
        and quality is printed
    """
    X, y = generate_sample(n_samples=n_samples, n_features=n_features,
                           distance=distance)

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MLPClassifier,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # Exhaustive sweep; ends with a deliberate failure so that the
        # printed numbers are shown.
        for loss in nnet.losses:
            for NNType in nn_types:
                for trainer in nnet.trainers:
                    nn = NNType(layers=[5], loss=loss, trainer=trainer,
                                random_state=42, epochs=100)
                    nn.fit(X, y)
                    print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
    else:
        # Rotating selection of losses, network types and trainers; most of
        # them are used once.
        attempts = max(len(nnet.losses), len(nnet.trainers), len(nn_types))
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        loss_names = list(nnet.losses.keys())
        trainer_names = list(nnet.trainers.keys())
        # each combination is tried 3 times before raising an exception
        retry_attempts = 3

        for attempt in range(attempts):
            loss = loss_names[(attempt + losses_shift) % len(loss_names)]
            trainer = trainer_names[(attempt + trainers_shift) % len(trainer_names)]
            nn_type = nn_types[attempt % len(nn_types)]

            for retry_attempt in range(retry_attempts):
                nn = nn_type(layers=[5], loss=loss, trainer=trainer,
                             random_state=42 + retry_attempt, epochs=200)
                print(nn)
                nn.fit(X, y)
                quality = roc_auc_score(y, nn.predict_proba(X)[:, 1])
                # Called only to exercise compute_loss; the value is unused.
                nn.compute_loss(X, y)
                if quality > 0.8:
                    break
                print('attempt {} : {}'.format(retry_attempt, quality))
            else:
                # No retry reached the required quality.
                raise RuntimeError(
                    'quality of model is too low: {} {}'.format(quality, nn))
def classifierPrecission():
    """Report cross-validated precision/recall/F1 and plot the ROC curve.

    Reads ``data/sms.csv`` (columns ``message`` and ``label``), builds
    TF-IDF features, fits a default logistic regression, prints 5-fold
    cross-validation scores measured on the training part, and shows the
    ROC curve of the held-out part.
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.feature_extraction.text import TfidfVectorizer
    # sklearn.linear_model.logistic and sklearn.cross_validation no longer
    # exist in current scikit-learn; these are the public replacements.
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import roc_curve, auc

    df = pd.read_csv('data/sms.csv')
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        df['message'], df['label'])

    # The vectoriser vocabulary is learned from the training messages only.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train_raw)
    X_test = vectorizer.transform(X_test_raw)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    precisions = cross_val_score(classifier, X_train, y_train, cv=5,
                                 scoring='precision')
    print('precission:', np.mean(precisions), precisions)

    recalls = cross_val_score(classifier, X_train, y_train, cv=5,
                              scoring='recall')
    print('recalls:', np.mean(recalls), recalls)

    # f1 = 2*P*R / (P + R); a perfect classifier scores 1.
    f1s = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
    print('f1s:', np.mean(f1s), f1s)

    # ROC (receiver operating characteristic) curve: recall plotted against
    # fall-out, where fall-out F = FP / (TN + FP).
    predictions = classifier.predict_proba(X_test)
    false_positive_rate, recall, thresholds = roc_curve(
        y_test, predictions[:, 1])
    roc_auc = auc(false_positive_rate, recall)

    plt.title('ROC')
    plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('Recall')
    plt.xlabel('fall-out')
    plt.show()
def fit_model_2(self, lol = .07, toWrite = False):
    """Fit the l1-penalised logistic regression on every CV split.

    For each ``(X_train, X_test, Y_train, Y_test)`` tuple in
    ``self.cv_data`` the training part is rebalanced with
    ``self.balance_data``, the model is refitted and the log loss on the
    test part is printed.

    :param lol: inverse regularisation strength ``C`` of the model
    :param toWrite: if True, pickle the model (as fitted on the last split)
        to ``model2/model.pkl``
    """
    # liblinear supports the l1 penalty; the default solver of newer
    # scikit-learn releases (lbfgs) does not.
    model = LogisticRegression(C=lol, penalty='l1', tol=1e-6,
                               solver='liblinear')

    for X_train, X_test, Y_train, Y_test in self.cv_data:
        X_train, Y_train = self.balance_data(X_train, Y_train)
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 2 Score: %f" % (logloss(Y_test, pred),))

    if toWrite:
        # pickle emits bytes, so the file must be opened in binary mode;
        # the context manager closes it even if dumping fails.
        with open('model2/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
def test_multinomial_binary_probabilities():
    """Multinomial LR on a binary problem matches its decision function."""
    # Test multinomial LR gives expected probabilities based on the
    # decision function, for a binary problem.
    X, y = make_classification()
    model = LogisticRegression(multi_class='multinomial', solver='saga')
    model.fit(X, y)

    scores = model.decision_function(X)
    actual = model.predict_proba(X)

    # Two-class softmax over (+score, -score).
    positive = np.exp(scores)
    negative = np.exp(-scores)
    class_1 = positive / (positive + negative)
    expected = np.c_[1 - class_1, class_1]

    assert_almost_equal(actual, expected)
def test_predict_iris():
    """Test logistic regression with the iris dataset"""
    n_samples, _ = iris.data.shape
    species = iris.target_names[iris.target]

    model = LogisticRegression(C=len(iris.data)).fit(iris.data, species)
    assert_array_equal(np.unique(species), model.classes_)

    # Hard predictions must be more than 95% accurate on the training data.
    hard_labels = model.predict(iris.data)
    assert_greater(np.mean(hard_labels == species), .95)

    # Each probability row sums to one ...
    proba = model.predict_proba(iris.data)
    assert_array_almost_equal(proba.sum(axis=1), np.ones(n_samples))

    # ... and the arg-max class is equally accurate.
    soft_labels = iris.target_names[proba.argmax(axis=1)]
    assert_greater(np.mean(soft_labels == species), .95)
def train_and_predics(featuredicts, labels, trainsize):
    """Train on the first ``trainsize`` items and format the rest's output.

    The feature dictionaries are vectorised with a ``DictVectorizer``
    fitted on the training slice only.  An l2 logistic regression is
    trained on that slice, and one tab-separated line is produced for every
    remaining item: the predicted label followed by each class probability
    formatted with two decimals.

    :return: list of formatted prediction lines
    """
    vectorizer = DictVectorizer()
    train_targets = labels[:trainsize]
    train_matrix = vectorizer.fit_transform(featuredicts[:trainsize])
    test_matrix = vectorizer.transform(featuredicts[trainsize:])

    classifier = LogisticRegression(penalty='l2')
    classifier.fit(train_matrix, train_targets)

    probabilities = classifier.predict_proba(test_matrix)
    predicted = classifier.predict(test_matrix)
    return [
        "\t".join([label] + ["{0:.2f}".format(p) for p in row])
        for row, label in zip(probabilities, predicted)
    ]
def test_nnet(n_samples=200, n_features=5, distance=0.5, complete=False):
    """Smoke-test the neural network classes on two separable blobs.

    :param complete: if True, every (loss, network type, trainer)
        combination is trained and its AUC printed next to a logistic
        regression baseline; the test then fails on purpose so the output
        is shown.  Otherwise four shifted combinations are trained and each
        must reach an AUC above 0.8.
    """
    # The blob centers have n_features coordinates, so the same value is
    # passed as n_features rather than a hard-coded 5.
    X, y = make_blobs(
        n_samples=n_samples,
        n_features=n_features,
        centers=[numpy.ones(n_features) * distance, -numpy.ones(n_features) * distance],
    )

    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MultiLayerNetwork,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]

    if complete:
        # checking all possible combinations
        for loss in nnet.losses:
            for NNType in nn_types:
                for trainer in nnet.trainers:
                    nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42)
                    nn.fit(X, y, epochs=100)
                    print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

        lr = LogisticRegression().fit(X, y)
        print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

        assert 0 == 1, "Let's see and compare results"
    else:
        # checking combinations of losses, nn_types, trainers, most of them
        # are used once during tests.  The attempt count is fixed at 4 (the
        # earlier max(...) value was always overwritten).
        attempts = 4
        losses_shift = numpy.random.randint(10)
        trainers_shift = numpy.random.randint(10)
        # dict.keys() cannot be indexed on Python 3; materialise it first.
        loss_names = list(nnet.losses.keys())
        trainer_names = list(nnet.trainers.keys())
        for attempt in range(attempts):
            loss = loss_names[(attempt + losses_shift) % len(loss_names)]
            trainer = trainer_names[(attempt + trainers_shift) % len(trainer_names)]

            nn_type = nn_types[attempt % len(nn_types)]

            nn = nn_type(layers=[5], loss=loss, trainer=trainer, random_state=42)
            print(nn)
            nn.fit(X, y, epochs=200)
            assert roc_auc_score(y, nn.predict_proba(X)[:, 1]) > 0.8, "quality of model is too low: {}".format(nn)
class WebsiteMatchConfidencePredictor(object):
    """Logistic regression over ``make_features(url, website)`` vectors."""

    def __init__(self):
        self.model = LogisticRegression()

    def fit(self, urls, websites, y):
        """
        Train the model on one feature vector per (url, website) pair.

        :param urls: list of urls
        :param websites: list of corresponding scraped websites
        :param y: list of corresponding booleans - matches or not
        """
        X = [make_features(url, web) for url, web in zip(urls, websites)]
        self.model.fit(X, y)

    def predict(self, url, website):
        """
        Return the class probabilities for a single (url, website) pair.

        :param url: url to score
        :param website: corresponding scraped website
        :return: ``predict_proba`` output with one row
        """
        # fit() trains on a list of feature vectors, so a single vector is
        # one sample; predict_proba requires a 2-D input and must therefore
        # get a one-row batch.
        X = [make_features(url, website)]
        return self.model.predict_proba(X)
def predictTestSet():
    """Train on the concatenated-annotator data and score the test rows.

    Features are collected twice: from the training file alone (used only
    for its row count) and from the combined train+test file.  The leading
    rows of the combined matrix are the training set, the remaining rows
    the test set.  The probabilities are computed but not returned.
    """
    # generate training features and labels
    train_path = '/home/natschluter/GroupAlgorithms/cwi2016/data/cwi_training/cwi_training_cat.lbl.conll'
    trainfeatures, trainlabels, vec = feats_and_classify_py2.collect_features(train_path)

    # generate training+test features
    both_path = '/home/natschluter/GroupAlgorithms/cwi2016/data/train_and_test1.conll'
    bothfeatures, bothlabels, bothvec = feats_and_classify_py2.collect_features(both_path)

    # Median of the hard-coded threshold values (not used below).
    thresholds_med = np.median(np.array([
        0.145, 0.85, 0.12, 0.657, 0.71, 0.824, 0.506, 0.461, 0.662, 0.888]))

    # Row indices of the training and test parts of the combined data.
    train_rows = np.array(range(len(trainfeatures)))
    label_rows = np.array(range(len(trainlabels)))
    test_rows = np.array(range(len(trainfeatures), len(bothfeatures)))

    TrainX = bothfeatures[train_rows]
    TrainY = bothlabels[label_rows]
    TestX = bothfeatures[test_rows]

    maxent = LogisticRegression(penalty='l2')
    print('training...')
    maxent.fit(TrainX, TrainY)
    print('predicting...')
    ypred_probs = maxent.predict_proba(TestX)
class Ensemble:
    """Stacking ensemble with logging: base estimators feed a MetaEstimator.

    The meta estimator is trained on cross-validated ``predict_proba``
    outputs of the base estimators (first probability column only); the
    number of folds is configurable through ``cv``.
    """

    def __init__(self, base_estimators=None, random_state=0, cv=3):
        self.base_estimators = base_estimators
        self.estimator = MetaEstimator()
        self.random_state = random_state
        self.fit_cv = cv

    def fit(self, X, y):
        """Fit the meta estimator on out-of-fold outputs, then refit bases."""
        folds = KFold(n_splits=self.fit_cv, shuffle=True,
                      random_state=self.random_state)
        meta_columns = []
        for base in self.base_estimators:
            label = base.__class__.__name__
            log(0x25, 'cross_val_predict start', label)
            out_of_fold = cross_val_predict(
                base, X, y, cv=folds, method='predict_proba')
            log(0x25, 'cross_val_predict end', label)
            log(0x25, 'CV Score', label, check_result(y, out_of_fold))
            meta_columns.append(out_of_fold.T[0])

        self.estimator.fit(np.array(meta_columns).T, y)

        # The base estimators used at prediction time see the full data.
        for base in self.base_estimators:
            label = base.__class__.__name__
            log(0x25, 'fit start', label)
            base.fit(X, y)
            log(0x25, 'fit end:', label)

    def predict(self, X, margin):
        """Return whether the first meta probability column exceeds margin."""
        first_column = self.predict_proba(X)[:, 0]
        return np.array(first_column) > margin

    def predict_proba(self, X):
        """Return the meta estimator's probabilities for ``X``."""
        meta_columns = [
            base.predict_proba(X).T[0] for base in self.base_estimators
        ]
        return self.estimator.predict_proba(np.array(meta_columns).T)
def main():
    """Predict SAME/OMISSION labels for a dump and print them as TSV.

    For each feature variant ("bcd" and "cd") an l2 logistic regression is
    trained on the features of ``--input`` and applied to the items of
    ``--dump_to_predict``.  Predicted labels and column-1 probabilities of
    both variants are printed next to the columns of the dump, one
    tab-separated row per item, preceded by a header line.
    """
    parser = argparse.ArgumentParser(description="""Export AMT""")
    parser.add_argument('--input', default="../res/dga_extendedamt_simplemajority.tsv")
    parser.add_argument('--dump_to_predict', default="../res/dga_data_october2016.tsv")
    parser.add_argument('--embeddings', default="/Users/hmartine/data/glove.6B/glove.6B.50d.txt")
    args = parser.parse_args()

    E = load_embeddings(args.embeddings)

    predarrays = {}
    variants = ["bcd", "cd"]
    for variant in variants:
        # 1. collect features for train; the vectorizer returned here is
        #    not used, a fresh one is fitted below.
        trainfeatures, labels, _ = collect_features(args.input, embeddings=E, variant=variant, vectorize=False)
        maxent = LogisticRegression(penalty='l2')
        # TODO collect features for new data
        # TODO proper vectorization
        # NOTE(review): trainingbow is not defined in this function;
        # presumably a module-level name - confirm.
        dumpfeatdicts = features_from_dump(args.dump_to_predict, variant=variant, embeddings=E, bowfilter=trainingbow)

        vec = DictVectorizer()
        X_train = vec.fit_transform(trainfeatures)
        maxent.fit(X_train, labels)
        X_test = vec.transform(dumpfeatdicts)

        predarrays[variant + "_pred_label"] = ["SAME" if x == 0 else "OMISSION" for x in maxent.predict(X_test)]
        predarrays[variant + "_pred_prob"] = ['{:.2}'.format(y) for x, y in maxent.predict_proba(X_test)]

    # TODO compare prediction similarity
    # TODO provide an output format with labels and probs for both feature templates
    frame = read_dump(args.dump_to_predict)
    # Sorted keys: bcd label, bcd prob, cd label, cd prob.
    keyindices = sorted(predarrays.keys())

    # The header lists the columns in the order the rows emit them; before,
    # it announced TitleRef/URLRef ahead of Target although Target is
    # written first.
    header = "Index Ref Target TitleRef URLRef TitleTarget URLTarget Source Contains BCD_label BCD_prob CD_label CD_prob".replace(" ", "\t")
    print(header)

    rows = zip(
        [str(x) for x in range(len(frame.Ref))],
        list(frame.Ref),
        list(frame.Target),
        list(frame.TitleRef),
        list(frame.URLRef),
        list(frame.TitleTarget),
        list(frame.URLTarget),
        list(frame.Source),
        list(frame.Contains),
        predarrays[keyindices[0]],
        predarrays[keyindices[1]],
        predarrays[keyindices[2]],
        predarrays[keyindices[3]],
    )
    for a in rows:
        # str() keeps the join from failing on non-string cells.
        print("\t".join(str(cell) for cell in a))
def buildModel(self, X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test):
    '''
    Build the combined GBDT + logistic regression model.

    A GBDT is trained on the continuous features; the leaf indices it
    assigns are one-hot encoded, stacked with the discrete features and
    used to train a logistic regression.  The decision threshold is taken
    from the precision/recall curve at the last point whose recall still
    reaches ``self._recall_rate``.

    Args:
        X_train_d: discrete-feature training data
        X_train_c: continuous-feature training data
        X_test_d: discrete-feature test data
        X_test_c: continuous-feature test data
        y_train: training labels {-1, 1}
        y_test: test labels {-1, 1}
    Returns:
        gbc_enc: OneHotEncoder fitted on the GBDT leaf indices
        gbc: the GBDT model
        comb_model: the trained combined model
        threshold: positive/negative threshold; Pred_Prob >= threshold is
            a positive sample, Pred_Prob < threshold a negative one
        comb_model_auc: AUC of the combined model
        precision: precision at the chosen threshold
        recall: recall at the chosen threshold
    '''
    # random_state=None is the estimator's default, so a single constructor
    # call covers both the seeded and the unseeded case.
    gbc = GradientBoostingClassifier(
        n_estimators=self._n_estimators,
        learning_rate=self._gbdt_learning_rate,
        max_depth=self._max_depth,
        random_state=self._random_state,
    ).fit(X_train_c, y_train)

    X_train_leaves = gbc.apply(X_train_c)[:, :, 0]
    X_test_leaves = gbc.apply(X_test_c)[:, :, 0]
    X_train_rows = X_train_leaves.shape[0]

    # The encoder is fitted on train and test leaves together; the encoded
    # matrix is then cut back into its two parts by row count.
    all_leaves = np.concatenate([X_train_leaves, X_test_leaves], axis=0)
    gbc_enc = OneHotEncoder().fit(all_leaves)
    X_trans = gbc_enc.transform(all_leaves)

    X_train_ext = hstack([X_trans[:X_train_rows, :], X_train_d])
    X_test_ext = hstack([X_trans[X_train_rows:, :], X_test_d])
    log.debug("Combine features done.")

    comb_model = LogisticRegression().fit(X_train_ext, y_train)
    log.debug("Training done.")

    comb_model_pred = comb_model.predict_proba(X_test_ext)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, comb_model_pred)

    # Index of the last curve point that still meets the required recall.
    # NOTE(review): if no point or every point meets it, this index falls
    # outside the intended range of `thresholds` - confirm the valid range
    # of self._recall_rate.
    recall_meet_min = int(np.count_nonzero(recall >= self._recall_rate))
    chosen = recall_meet_min - 1
    threshold = thresholds[chosen]
    log.debug("threshold: %f - precision: %f - recall: %f", threshold, precision[chosen], recall[chosen])

    comb_model_auc = roc_auc_score(y_test, comb_model_pred)
    log.debug("AUC score is: %f", comb_model_auc)

    return gbc_enc, gbc, comb_model, threshold, comb_model_auc, precision[chosen], recall[chosen]
def regress_on_words(self, word_index, X):
    """
    Regress the presence of one column of X on all of X.

    word_index: column of interest; a row is labelled 1 when its value in
        that column equals 1, otherwise 0
    X: input rows; used unmodified as the regression features

    Returns the last ``predict_proba`` column for every row of X.
    """
    # The column itself is left in the features (no masking is applied).
    targets = [1 if row[word_index] == 1 else 0 for row in X]

    # Build the logistic regression model
    classifier = LogisticRegression()
    classifier.fit(X, targets)
    return classifier.predict_proba(X)[:, -1]
def validate_model(X, y, N, digit, classifier):
    """
    Validate the model by 5-fold cross validation and report the results
    through the fold/category result collectors.

    :param X: nparray, one row is one sample
    :param y: nparray, labels (also used as indices into the per-label
        result list)
    :param N: number of CPU cores to use
    :param digit: digit of the captcha
    :param classifier: 'Logistic' for logistic regression, anything else
        for a random forest
    :return: None
    """
    # K-fold cross validation
    splitter = KFold(n_splits=5, shuffle=True, random_state=1234567)
    fold_r = fold_result()

    labels = np.unique(y)
    category_rs = [None] * len(labels)
    for label in labels:
        category_rs[label] = category_result()

    for train_idx, test_idx in splitter.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if classifier == 'Logistic':
            clf = LogisticRegression(solver='sag', n_jobs=N)
        else:
            clf = RandomForestClassifier(n_estimators=200,
                                         random_state=1234567, n_jobs=N)
        clf.fit(X_train, y_train)
        probas = clf.predict_proba(X_test)

        fold_r.append(clf, X_train, y_train, X_test, y_test)
        # One-vs-rest bookkeeping: column `label` of the probabilities.
        for label in labels:
            category_rs[label].append(y_test, probas[:, label], label)

    fold_r.print_score(digit)

    # print and save the per-label results
    for label in labels:
        category_rs[label].print_result(label, digit)
def main():
    """Fit a logistic regression on df_train and print hold-out metrics.

    The features are all columns of the module-level ``df_train`` except
    ``cust_id``, ``y`` and ``cust_group``; ``y`` is the target.  20% of the
    rows are held out (seed 42).  Mean squared error, log loss and ROC AUC
    of the column-1 probabilities are printed, then ``predict(clf)`` is
    called with the fitted model.
    """
    features = df_train.drop(['cust_id', 'y', 'cust_group'], axis=1,
                             inplace=False)
    target = df_train['y']
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        features, target, test_size=0.2, random_state=42)
    print(X_fit.shape, X_hold.shape)

    clf = LogisticRegression(C=1.0, max_iter=100, random_state=10)
    print("====" * 20)
    clf.fit(X_fit, y_fit)

    prob = clf.predict_proba(X_hold)
    # Arg-max class per row (computed but not used below).
    pred = np.argmax(prob, axis=1)
    positive = prob[:, 1]

    print("mean_squared_error:", mean_squared_error(y_hold, positive))
    print("log_loss:", log_loss(y_hold.astype(int), positive))
    print("roc_auc_score:", roc_auc_score(y_hold, positive))

    predict(clf)
def gbdt_lr_clf(X_train_data, Y_train_data):
    """Pick the GBDT size whose GBDT+LR pipeline has the best mean F1.

    For each candidate ``n_estimators`` a stratified 5-fold evaluation is
    run.  Inside a fold the training rows are halved: one half trains the
    GBDT and the leaf one-hot encoder, the other half trains the logistic
    regression on the encoded leaves.  Predictions on the fold's test rows
    are thresholded at 0.5.

    :return: ``(recalls, precisions, f1s, estimator_best)``; the three
        lists hold every per-fold score over all candidates, and
        ``estimator_best`` is the candidate with the highest mean F1
        (0 if no candidate scored above 0)
    """
    n_estimators = [10, 20, 30, 40, 50]
    estimator_best = 0
    f1_best = 0
    gblr_p_t = []
    gblr_r_t = []
    gblr_f1_t = []

    for item in n_estimators:
        grd = ensemble.GradientBoostingClassifier(n_estimators=item)
        # random_state has no effect without shuffling and newer
        # scikit-learn releases reject the combination, so it is omitted.
        stratified_folder = StratifiedKFold(n_splits=5, shuffle=False)
        print('gbdt_classifier + LR:')
        gblr_p = []
        gblr_r = []
        gblr_f1 = []

        for X_train_index, X_test_index in stratified_folder.split(X_train_data, Y_train_data):
            X_train, X_train_lr, y_train, y_train_lr = train_test_split(
                X_train_data[X_train_index], Y_train_data[X_train_index],
                test_size=0.5)
            grd_enc = OneHotEncoder()
            grd_lm = LogisticRegression()
            grd.fit(X_train, y_train)
            grd_enc.fit(grd.apply(X_train)[:, :, 0])
            grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

            fold_X = X_train_data[X_test_index]
            fold_y = Y_train_data[X_test_index]
            y_pred_grd_lm = grd_lm.predict_proba(
                grd_enc.transform(grd.apply(fold_X)[:, :, 0]))[:, 1]
            y_pred_grd_lm = (y_pred_grd_lm >= 0.5) * 1

            gblr_p_tmp = precision_score(fold_y, y_pred_grd_lm)
            gblr_p.append(gblr_p_tmp)
            gblr_p_t.append(gblr_p_tmp)
            gblr_r_tmp = recall_score(fold_y, y_pred_grd_lm)
            gblr_r.append(gblr_r_tmp)
            gblr_r_t.append(gblr_r_tmp)
            gblr_f1_tmp = f1_score(fold_y, y_pred_grd_lm)
            gblr_f1.append(gblr_f1_tmp)
            gblr_f1_t.append(gblr_f1_tmp)

        mean_f1 = sum(gblr_f1) / len(gblr_f1)
        print("n_estimators:%f,gblr_p:%f,gblr_r:%f,gblr_f1:%f" % (
            item, sum(gblr_p) / len(gblr_p), sum(gblr_r) / len(gblr_r), mean_f1))

        # Remember the best score as well as the best candidate; without
        # updating f1_best every later candidate with a positive F1 would
        # replace the true best one.
        if f1_best < mean_f1:
            f1_best = mean_f1
            estimator_best = item

    return gblr_r_t, gblr_p_t, gblr_f1_t, estimator_best
def GBDT_LR_test(X_train, Y_train, X_test, glr_norm, dim_para):
    '''
    Train the GBDT + LR stack and return the card numbers predicted positive.

    :param X_train: train data
    :param Y_train: train label
    :param X_test: test data; the card number is in the first column and the
        features start at the second column
    :param glr_norm: 0: no normalisation  1: scale to (0, 1)  2: standardise
    :param dim_para: PCA dimensionality reduction; 0: none, otherwise reduce
    :return: card numbers of the test rows predicted as positive
    '''
    print("GBDT_LR_model:")
    X_test_org = X_test
    X_test = X_test_org[:, 1:]

    # The previous test `glr_norm == 1 or 2` was always true, so data was
    # handed to norm_data even when no normalisation was requested.
    if glr_norm in (1, 2):
        X_train, X_test = norm_data(X_train, X_test, glr_norm)
    if dim_para != 0:
        X_train, X_test = dim_reduction(X_train, X_test, dim_para)

    # Only the selected tree count is needed from the cross-validation run.
    _, _, _, estimator_best = gbdt_lr_clf(X_train, Y_train)

    grd = ensemble.GradientBoostingClassifier(n_estimators=estimator_best)
    # One half of the training data fits the trees/encoder, the other half
    # fits the logistic regression on the encoded leaf indices.
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        X_train, Y_train, test_size=0.5)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression()
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

    test_leaves = grd_enc.transform(grd.apply(X_test)[:, :, 0])
    gblr_predictions = grd_lm.predict_proba(test_leaves)[:, 1]
    gblr_label = (gblr_predictions >= 0.5) * 1
    print("gbdt_lr predict positive label number: %d" % (sum(gblr_label == 1)))
    card_list = X_test_org[gblr_label == 1, 0]
    return card_list
def main():
    """Train a logistic-regression CWI classifier and write thresholded predictions.

    Command-line options: ``--threshold`` (probability cut-off, default 0.5),
    ``--annotator`` (annotator id used to pick the training file, default
    "03") and ``--penalty`` ("l1" or "l2", default "l1").  One "1"/"0" label
    per test instance is written to ``<annotator>.pred`` in the current
    directory, after which the process exits with status 0.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser(
        description=
        "Skeleton for features and classifier for CWI-2016--optimisation of threshhold"
    )
    parser.add_argument('--threshold', type=float, default=0.5)
    parser.add_argument('--annotator', type=str, default="03")
    parser.add_argument('--penalty', type=str, choices=["l1", "l2"], default="l1")
    args = parser.parse_args()

    current_single_ann = (scriptdir + "/../data/cwi_training/cwi_training_"
                          + args.annotator + ".lbl.conll")
    testfile = scriptdir + "/../data/cwi_testing/cwi_testing.txt.lbl.conll"
    X__dict_train, y_train, v_train = feats_and_classify.collect_features(
        current_single_ann, vectorize=False)
    X_dict_test, y_test, v_test = feats_and_classify.collect_features(
        testfile, vectorize=False)

    # Vectorise train and test together so both share one feature space,
    # then split the matrix back at the train/test boundary.
    featdicts = list(X__dict_train + X_dict_test)
    vect = DictVectorizer()
    X = vect.fit_transform(featdicts).toarray()
    X_train = X[:len(y_train)]
    X_test = X[len(y_train):]

    maxent = LogisticRegression(penalty=args.penalty)
    maxent.fit(X_train, y_train)
    y_pred_proba = maxent.predict_proba(X_test)
    ypred_i = [
        "1" if pair[1] >= args.threshold else "0" for pair in y_pred_proba
    ]

    # The context manager guarantees the file is closed even if writing fails.
    with open(args.annotator + ".pred", mode="w") as fout:
        print("\n".join(ypred_i), file=fout)
    sys.exit(0)
def test_nnet(n_samples=200, n_features=5, distance=0.5):
    """Fit every network type / loss / trainer combination on two blobs and print ROC AUC.

    Two clusters centred at ``+distance`` and ``-distance`` in every
    coordinate are generated; each ``nnet`` model and a logistic-regression
    baseline are fitted and scored on that same data.

    :param n_samples: number of points to generate
    :param n_features: dimensionality of the generated points
    :param distance: per-coordinate offset of the cluster centres from zero
    """
    # Pass the parameter through instead of a literal 5 so the call stays
    # consistent with the centres when n_features is overridden.
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=[numpy.ones(n_features) * distance,
                               - numpy.ones(n_features) * distance])
    nn_types = [
        nnet.SimpleNeuralNetwork,
        nnet.MultiLayerNetwork,
        nnet.SoftmaxNeuralNetwork,
        nnet.RBFNeuralNetwork,
        nnet.PairwiseNeuralNetwork,
        nnet.PairwiseSoftplusNeuralNetwork,
    ]
    for loss in nnet.losses:
        for NNType in nn_types:
            for trainer in nnet.trainers:
                nn = NNType(layers=[5], loss=loss, trainer=trainer, random_state=42)
                nn.fit(X, y, stages=100, verbose=nnet.SILENT)
                print(roc_auc_score(y, nn.predict_proba(X)[:, 1]), nn)

    lr = LogisticRegression().fit(X, y)
    print(lr, roc_auc_score(y, lr.predict_proba(X)[:, 1]))

    # NOTE(review): this always fails; presumably it forces the test runner
    # to show the printed scores -- confirm before removing.
    assert 0 == 1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public package; the `sklearn.linear_model.logistic`
# submodule is private and no longer importable in current scikit-learn.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:  # older scikit-learn releases without model_selection
    from sklearn.cross_validation import train_test_split, cross_val_score

# Load the SMS data set and split messages/labels into train and test parts.
df = pd.read_csv('sms.csv')
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['message'], df['label'])

# TF-IDF features are fitted on the training messages only.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict_proba(X_test)

# ROC curve from the second probability column, plotted against the diagonal.
false_positive_rate, recall, thresholds = roc_curve(y_test, predictions[:, 1])
roc_auc = auc(false_positive_rate, recall)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out')
plt.show()
def _report_and_plot(estimator, x_test, y_test, name, title, color, legend_fmt):
    """Print test metrics for a fitted classifier, draw its ROC curve and return the AUC."""
    print(
        "精确率和召回率({}):".format(name),
        classification_report(y_test,
                              estimator.predict(x_test),
                              labels=[0, 1],
                              target_names=["非高收入", "高收入"]))
    pre_score = estimator.score(x_test, y_test)
    print("准确率({}):{}".format(name, pre_score))
    # ROC / AUC from the second probability column.
    predictions = estimator.predict_proba(x_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("auc值为:{}".format(auc_value))
    plt.title(title)
    plt.plot(fpr, tpr, color, label=legend_fmt % auc_value)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')
    return auc_value


def main():
    """Prepare the global ``train``/``test`` frames, compare three classifiers, save predictions.

    The target column ``Y`` is split off ``train``, columns are renamed to
    English identifiers and the features are recoded.  Logistic regression,
    a decision tree and a random forest are each evaluated on a 25% hold-out
    split; the model with the highest AUC scores ``test`` and the results are
    written to ``Results_1.csv`` and ``./my_results.csv``.  The ROC plot is
    saved to ``./count_auc.png`` and the tree to ``./tree.dot``.
    """
    global train
    global test
    # Keep the target in y and remove it from the training features.
    y = train['Y']
    del train['Y']

    # Rename the column headers.
    origin = [
        "年龄", "工作天数", "职业类型", "投资收入", "投资损失", "省份", "教育",
        "家庭角色", "婚姻状况", "教育时间", "民族", "工作情况", "性别"
    ]
    target = [
        "age", "work_days", "job", "invest_income", "invest_loss", "province",
        "education", "home_role", "marital_status", "education_time",
        "nation", "work_type", "gender"
    ]
    rename_dict = dict(zip(origin, target))
    train.rename(columns=rename_dict, inplace=True)
    test.rename(columns=rename_dict, inplace=True)

    # Report missing values.
    print("===================统计缺失数据-训练集====================")
    print(train.isnull().sum(axis=0))
    print(train.isnull().any())
    print("===================统计缺失数据-测试集====================")
    print(test.isnull().sum(axis=0))
    print(test.isnull().any())

    full_data = [train, test]

    # Encode gender as a number.
    for dataset in full_data:
        dataset['gender'] = dataset['gender'].map({'女': 0, '男': 1}).astype(int)

    # Bucket the net investment result into five classes.  All conditions
    # are evaluated on the raw value: recoding the column in place step by
    # step let the codes 0 and 1 fall into the later "(0, 5000]" rule, so
    # every value up to 5000 ended up in class 2.
    for dataset in full_data:
        net = (dataset['invest_income'] - dataset['invest_loss']).values
        dataset['invest'] = np.select(
            [net < 0, net == 0, (net > 0) & (net <= 5000),
             (net > 5000) & (net <= 10000), net > 10000],
            [0, 1, 2, 3, 4],
            default=net)  # unmatched entries (e.g. NaN) stay unchanged

    # Turn the province label into a number.
    for dataset in full_data:
        dataset['province'] = np.array([
            int(province_name.replace("省份", "")) / 2
            for province_name in dataset['province']
        ])

    # Convert categorical features to dummy variables.
    # NOTE(review): dumb_columns is defined elsewhere; if it rebinds the
    # global frames rather than mutating them, the loops below still work on
    # the objects captured in full_data -- confirm.
    dumb_columns('job')  # job type
    dumb_columns('education')  # education
    dumb_columns('nation')  # ethnicity
    dumb_columns('home_role')  # role in the family
    dumb_columns('marital_status')  # marital status
    dumb_columns('work_type')  # employment situation

    # Bucket age into five classes.
    for dataset in full_data:
        dataset.loc[dataset['age'] <= 22, 'age'] = 0
        dataset.loc[(dataset['age'] > 22) & (dataset['age'] <= 32), 'age'] = 1
        dataset.loc[(dataset['age'] > 32) & (dataset['age'] <= 48), 'age'] = 2
        dataset.loc[(dataset['age'] > 48) & (dataset['age'] <= 64), 'age'] = 3
        dataset.loc[dataset['age'] > 64, 'age'] = 4

    # Scale work days down so feature magnitudes do not differ too much.
    for dataset in full_data:
        dataset['work_days'] = dataset['work_days'] / 10

    # Drop columns that are no longer needed.
    drop_elements = ['invest_income', 'invest_loss', 'education']
    train = train.drop(drop_elements, axis=1)
    test = test.drop(drop_elements, axis=1)

    # Show the resulting train and test features.
    print(train.head(3))
    print("===")
    print(test.head(3))

    model_list = []

    # ===================== logistic regression ===================
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    logic = LogisticRegression()
    logic.fit(x_train, y_train)
    auc_value = _report_and_plot(logic, x_test, y_test, "逻辑回归",
                                 'LogisticRegression AUC', 'r',
                                 'AUC_LOGIC = %0.3f')
    model_list.append({"model": logic, "auc": auc_value})

    # ===================== decision tree =====================
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # Convert rows to dicts and vectorise them.
    dc = DictVectorizer(sparse=False)
    x_train = dc.fit_transform(x_train.to_dict(orient="records"))
    features = dc.get_feature_names()
    x_test = dc.transform(x_test.to_dict(orient="records"))
    dec = DecisionTreeClassifier(max_depth=4)
    dec.fit(x_train, y_train)
    # Save the tree locally; render with: dot -Tpng -o tree.png tree.dot
    export_graphviz(dec, out_file="./tree.dot", feature_names=features)
    auc_value = _report_and_plot(dec, x_test, y_test, "决策树",
                                 'DecisionTreeClassifier AUC', 'b',
                                 'AUC_DTC = %0.3f')
    model_list.append({"model": dec, "auc": auc_value})

    # ===================== random forest =====================
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    dc = DictVectorizer(sparse=False)
    x_train = dc.fit_transform(x_train.to_dict(orient="records"))
    x_test = dc.transform(x_test.to_dict(orient="records"))
    rf = RandomForestClassifier(n_estimators=5)
    rf.fit(x_train, y_train)
    auc_value = _report_and_plot(rf, x_test, y_test, "随机森林",
                                 'RandomForestClassifier AUC', 'y',
                                 'AUC_RF = %0.3f')
    model_list.append({"model": rf, "auc": auc_value})
    plt.savefig("./count_auc.png")

    # Compare the models and predict with the one that has the largest AUC
    # (the earliest entry wins a tie).
    best = max(model_list, key=lambda entry: entry['auc'])
    model = best['model']
    auc_v = best['auc']
    print("选择模型 {}".format(model))
    print("AUC值为 {}".format(auc_v))
    # NOTE(review): the tree models were fitted on DictVectorizer output but
    # `test` is passed here as a plain frame -- verify the column order.
    pre_data = model.predict_proba(test)

    # Save the target values.
    # NOTE(review): column 0 of predict_proba is written here while the AUC
    # above uses column 1 -- confirm which probability is wanted.
    test['Y'] = pre_data[:, 0]
    test['Y'].to_csv('Results_1.csv',
                     encoding='utf-8',
                     index=False,
                     header=False)

    # Save the full version.
    test_origin['Y'] = pre_data[:, 0]
    test_origin.to_csv("./my_results.csv", encoding='utf-8', index=False)
def train():
    """Build the training matrix, report a hold-out ROC score and fit the final model.

    The classifier is first fitted on 67% of the shuffled data to print a
    ROC AUC for the remaining 33%, then refitted on all data.

    :return: ``(clf, mean, std)`` -- the refitted logistic regression plus
        the two values returned by ``normalize``
    """
    weather_data = load_weather()
    training_data = load_training()
    X, y = assemble_X_y(training_data, weather_data)
    # NOTE(review): normalize presumably scales X in place, since only the
    # statistics are returned -- confirm.
    mean, std = normalize(X)

    # A disabled neural-network configuration used to sit here as a bare
    # string literal; logistic regression is the active model.
    clf = LogisticRegression(C = 10)

    X, y = shuffle(X, y, random_state=123)
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, test_size = 0.33)
    clf.fit(X_fit, y_fit)
    holdout_scores = clf.predict_proba(X_holdout)[:, 1]
    print("ROC score", metrics.roc_auc_score(np.ravel(y_holdout), holdout_scores))

    print("fitting...")
    clf.fit(X, y)
    return clf, mean, std
def train_model(X_train, y_train, X_test, y_test, name, plot=False):
    """
    train_model(X_train, y_train, X_test, y_test, name[, plot=False])

    Trains a logistic regression on the given split, prints its train/test
    accuracy and predictions, and saves the model to
    ``saved_model/model_ceps.pkl``.

    Parameters
    ----------
    X_train, y_train : training features and labels
    X_test, y_test : evaluation features and labels
    name : prefix for the ROC plot descriptions
    plot : when true, a one-vs-rest ROC curve is plotted per label

    Returns
    -------
    mean train error, mean test error, array of confusion matrices
    """
    labels = np.unique(y_train)

    train_errors = []
    test_errors = []
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    cms = []

    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("train_score:: " + str(train_score))
    print("test_score:: " + str(test_score))
    train_errors.append(1 - train_score)
    test_errors.append(1 - test_score)

    y_pred = clf.predict(X_test)
    print(y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cms.append(cm)

    # The probabilities do not depend on the label, so compute them once.
    proba = clf.predict_proba(X_test)
    for label in labels:
        y_label_test = np.asarray(y_test == label, dtype=int)
        # Labels are used directly as column indices of predict_proba.
        proba_label = proba[:, label]
        fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
        roc_scores[label].append(auc(fpr, tpr))
        tprs[label].append(tpr)
        fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Floor division keeps the index an integer on Python 3 too.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median], desc,
                            tprs[label][median], fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    # The precision/recall summary was dropped: its inputs were never filled
    # and the result was never used.

    # Save the trained model to disk.
    joblib.dump(clf, 'saved_model/model_ceps.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def train_model(X, Y, name, plot=False, outModelName=outModelName, testSize=0.3):
    """
    train_model(vector, vector, name[, plot=False])

    Trains a logistic regression on one shuffled train/test split and saves
    the model to disk.

    Parameters
    ----------
    X, Y : features and labels, indexed with the split's index arrays
    name : prefix for the ROC plot descriptions
    plot : when true, a one-vs-rest ROC curve is plotted per label
    outModelName : path to save the trained model (*.pkl); nothing is saved
        when it is empty
    testSize : fraction of the data used for testing

    Returns
    -------
    np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=testSize, random_state=0)

    train_errors = []
    test_errors = []
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        # The probabilities do not depend on the label, so compute them once.
        proba = clf.predict_proba(X_test)
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            # Labels are used directly as column indices of predict_proba.
            proba_label = proba[:, label]
            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Floor division keeps the index an integer on Python 3 too.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median], desc,
                            tprs[label][median], fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    # The precision/recall summary was dropped: its inputs were never filled
    # and the result was never used.

    # Save the trained model to disk.
    if outModelName:
        joblib.dump(clf, outModelName)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def test_fit_credit_backupsklearn():
    """Check that the h2o4gpu LogisticRegression wrapper matches scikit-learn.

    Loads ``./open_data/creditcard.csv`` (last column is the target), fits
    the native h2o4gpu solver, the h2o4gpu scikit-learn wrapper and plain
    scikit-learn with the same hyper-parameters, prints the output of every
    public method, and asserts that coefficients, intercepts, iteration
    counts and predictions of the wrapper and scikit-learn agree.
    """
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')

    Solver = h2o4gpu.LogisticRegression

    # Native h2o4gpu solver.
    enet_h2o4gpu = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet_h2o4gpu.fit(X, y)
    print("h2o4gpu predict()")
    print(enet_h2o4gpu.predict(X))
    print("h2o4gpu score()")
    print(enet_h2o4gpu.score(X, y))

    # Hyper-parameters shared by the wrapper and scikit-learn.
    shared_params = dict(dual=True, max_iter=100, tol=1E-4,
                         intercept_scaling=0.99, random_state=1234)

    enet = Solver(**shared_params)
    print("h2o4gpu scikit wrapper fit()")
    enet.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet.predict(X))
    print("h2o4gpu scikit wrapper predict_proba()")
    print(enet.predict_proba(X))
    print("h2o4gpu scikit wrapper predict_log_proba()")
    print(enet.predict_log_proba(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet.score(X, y))
    print("h2o4gpu scikit wrapper decision_function()")
    print(enet.decision_function(X))
    print("h2o4gpu scikit wrapper densify()")
    print(enet.densify())
    print("h2o4gpu scikit wrapper sparsify")
    print(enet.sparsify())

    # Public import path; the `logistic` submodule is private and no longer
    # importable in current scikit-learn.
    from sklearn.linear_model import LogisticRegression
    enet_sk = LogisticRegression(**shared_params)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit predict_proba()")
    print(enet_sk.predict_proba(X))
    print("Scikit predict_log_proba()")
    print(enet_sk.predict_log_proba(X))
    print("Scikit score()")
    print(enet_sk.score(X, y))
    print("Scikit decision_function()")
    print(enet_sk.decision_function(X))
    print("Scikit densify()")
    print(enet_sk.densify())
    print("Sciki sparsify")
    print(enet_sk.sparsify())

    # Compare against a dense float32 copy of the scikit-learn coefficients.
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()

    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet.coef_)
    print(enet_sk.intercept_)

    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet.coef_, enet_sk_coef)
    assert np.allclose(enet.intercept_, enet_sk.intercept_)
    assert np.allclose(enet.n_iter_, enet_sk.n_iter_)
    print("Preds should match")
    assert np.allclose(enet.predict_proba(X), enet_sk.predict_proba(X))
    assert np.allclose(enet.predict(X), enet_sk.predict(X))
    assert np.allclose(enet.predict_log_proba(X), enet_sk.predict_log_proba(X))
def train_model(X, Y, name, plot=False):
    """
    train_model(vector, vector, name[, plot=False])

    Trains a logistic regression on one shuffled 70/30 split, prints the
    distinct labels, and saves the model to ``saved_model/model_ceps.pkl``.

    Parameters
    ----------
    X, Y : features and labels, indexed with the split's index arrays
    name : prefix for the ROC plot descriptions
    plot : when true, a one-vs-rest ROC curve is plotted per label

    Returns
    -------
    mean train error, mean test error, array of confusion matrices
    """
    labels = np.unique(Y)
    # Single-argument print() behaves the same on Python 2 and 3.
    print(labels)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        # The probabilities do not depend on the label, so compute them once.
        proba = clf.predict_proba(X_test)
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            # Labels are used directly as column indices of predict_proba.
            proba_label = proba[:, label]
            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Floor division keeps the index an integer on Python 3 too.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median], desc,
                            tprs[label][median], fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    # The precision/recall summary was dropped: its inputs were never filled
    # and the result was never used.

    # Save the trained model to disk.
    joblib.dump(clf, 'saved_model/model_ceps.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def train_reg(reg, clazz, X, X_val, two_class_y, two_class_y_val):
    """Fit an L1-penalised logistic regression and score it on validation data.

    :param reg: value passed as the ``C`` regularisation parameter
    :param clazz: class identifier, used only in the progress message
    :param X: training features
    :param X_val: validation features
    :param two_class_y: binary training labels
    :param two_class_y_val: binary validation labels
    :return: ``(model, precision)`` where precision is what
        ``functions.precision`` computes from the validation probabilities
    """
    # One pre-formatted string prints identically on Python 2 and 3.
    print('Training clazz %s with C= %s' % (clazz, reg))
    # Keywords instead of positional arguments: `dual` is keyword-only in
    # current scikit-learn.
    model = LogisticRegression(penalty='l1', dual=False, C=reg)
    model.fit(X, two_class_y)
    precision = functions.precision(model.predict_proba(X_val), two_class_y_val)
    return model, precision
def train_model(clf_factory, X, Y, name, plot=False):
    """
    Trains a logistic regression on one shuffled 70/30 split, prints a
    score summary and saves the model to ``saved_model_fft/my_model.pkl``.

    Parameters
    ----------
    clf_factory : accepted for interface compatibility; not used here
    X, Y : features and labels, indexed with the split's index arrays
    name : prefix for the plot descriptions
    plot : when true, the per-label median split is looked up (the plotting
        calls themselves are currently disabled)

    Returns
    -------
    mean train error, mean test error, array of confusion matrices

    The fitted classifier is also published as the module-level ``clf``.
    """
    global clf

    labels = np.unique(Y)

    cv = ShuffleSplit(
        n=len(X), n_iterations=1, test_fraction=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = defaultdict(list)
    precisions = defaultdict(list)
    recalls = defaultdict(list)
    thresholds = defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        # The probabilities do not depend on the label, so compute them once.
        proba = clf.predict_proba(X_test)
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            # Labels are used directly as column indices of predict_proba.
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Floor division keeps the index an integer on Python 3 too.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            # The plot_pr / plot_roc calls that consumed median and desc are
            # disabled.

    # list() is needed on Python 3, where dict.values() is a view that
    # np.asarray cannot turn into a numeric array.
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    # Save the trained model to disk.
    joblib.dump(clf, 'saved_model_fft/my_model.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
# Load the visit log used for training and the list of coupons to score.
train_visit_df = pd.read_csv("%s/../input/coupon_visit_train.csv" % script_path)
test_coupon_df = pd.read_csv("%s/../input/coupon_list_test.csv" % script_path)

# Training frame: each visit joined with its coupon and its user.
train_df = train_visit_df.merge(train_coupon_df,
                                left_on="VIEW_COUPON_ID_hash",
                                right_on="COUPON_ID_hash")
train_df = train_df.merge(user_df,
                          left_on="USER_ID_hash",
                          right_on="USER_ID_hash")

# Training features come from the union of the configured transformers.
fu_obj = FeatureUnion(transformer_list=feature_list)
X_train = fu_obj.fit_transform(train_df)
y_train = train_df["PURCHASE_FLG"]
assert X_train.shape[0] == y_train.size

# Fit the model.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Test frame: every test coupon paired with every user, built by merging on
# a constant helper column.
test_coupon_df["cross"] = 1
user_df["cross"] = 1
test_df = test_coupon_df.merge(user_df, on="cross")

# Test features reuse the transformers fitted on the training frame.
X_test = fu_obj.transform(test_df)

# Score the pairs with the probability column that belongs to class True.
predict_proba = clf.predict_proba(X_test)
pos_idx = np.where(clf.classes_ == True)[0][0]
test_df["predict"] = predict_proba[:, pos_idx]

# Aggregate per user with top_merge and write the submission file.
top10_coupon = test_df.groupby("USER_ID_hash").apply(top_merge)
top10_coupon.name = "PURCHASED_COUPONS"
top10_coupon.to_csv("submission.csv", header=True)
import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model.logistic import LogisticRegression from sklearn.cross_validation import train_test_split, cross_val_score from sklearn.metrics import roc_curve, auc df = pd.read_csv('data/sms.csv') X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['message'], df['label']) vectorizer = TfidfVectorizer() X_train = vectorizer.fit_transform(X_train_raw) X_test = vectorizer.transform(X_test_raw) classifier = LogisticRegression() classifier.fit(X_train, y_train) predictions = classifier.predict_proba(X_test) false_positive_rate, recall, thresholds = roc_curve(y_test, predictions[:, 1]) roc_auc = auc(false_positive_rate, recall) plt.title('Receiver Operating Characteristic') plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc) plt.legend(loc='lower right') plt.plot([0, 1], [0, 1], 'r--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.0]) plt.ylabel('Recall') plt.xlabel('Fall-out') plt.show() ################# Sample 8 ################# """
from sklearn import svm
from sklearn import metrics
# Import from the public package; the `sklearn.linear_model.logistic`
# submodule is private and no longer importable in current scikit-learn.
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import csv

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases without model_selection
    from sklearn.cross_validation import train_test_split

# Each CSV row holds the feature values followed by the label in the last
# column.
data = []
mark = []
with open('/Users/hhy/Desktop/1/test.csv', 'r', encoding='utf-8_sig') as f:
    csv_reader = csv.reader(f)
    for x in csv_reader:
        data.append(list(map(float, x[0:-1])))
        mark.append(float(x[-1]))

auc = []
acc = []
f1 = []
# Ten different 95/5 splits, one per random seed; the metrics are averaged.
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        data, mark, test_size=0.05, random_state=i)
    clf = LogisticRegression(C=4.8, random_state=1113)
    clf.fit(X_train, y_train)

    y_predict = clf.predict_proba(X_test)[:, 1]
    test_auc = metrics.roc_auc_score(y_test, y_predict)  # AUC on the held-out part
    auc.append(test_auc)

    y_pred = clf.predict(X_test)
    acc.append(metrics.accuracy_score(y_test, y_pred))
    f1.append(metrics.f1_score(y_test, y_pred))

print("acc==", sum(acc) / len(acc))
print("auc==", sum(auc) / len(auc))
print("f1==", sum(f1) / len(f1))
from sklearn.metrics import classification_report
import time

# Measure the total running time.
start_time = time.time()
path = "E:/Desktop/Image/SVMData/gender_wechat_scale.txt"
x, y = readData(path)

average = 0
testNum = 10

clf = LogisticRegression()
# Call form of print: same output on Python 2, and valid syntax on Python 3
# like the other print() calls in this script.
print(clf)

# Repeat the random split testNum times and accumulate the accuracy.
for i in range(0, testNum):
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    p = np.mean(y_pred == y_test)
    print(p)
    average += p

# Precision/recall report for the last split, thresholding the second
# probability column at 0.5.
answer = clf.predict_proba(x_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, answer)
report = answer > 0.5
print(classification_report(y_test, report, target_names = ['neg', 'pos']))
print("average precision:", average/testNum)
print("time spent:", time.time() - start_time)