import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
pipe = make_pipeline(MinMaxScaler(),
                     LogisticRegression(random_state=42, max_iter=1000))

# 3-fold cross-validation, keeping train scores for the over/underfitting check
scores = cross_validate(pipe, X, y, cv=3, return_train_score=True)
df_scores = pd.DataFrame(scores)
print(df_scores)
print(df_scores.mean())
df_scores[['train_score', 'test_score']].boxplot()

# repeat with 10 folds to see how the fold count changes the score spread
scores = cross_validate(pipe, X, y, cv=10, return_train_score=True)
df_scores = pd.DataFrame(scores)
print(df_scores)
df_scores[['train_score', 'test_score']].boxplot()
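# A minimal follow-up sketch (not in the original snippet): cross_validate also
# accepts several scoring metrics at once; the result keys become 'test_<metric>'.
# pipe, X, y are reused from above.
multi_scores = cross_validate(pipe, X, y, cv=5,
                              scoring=['accuracy', 'f1_macro'],
                              return_train_score=True)
print(pd.DataFrame(multi_scores)[['test_accuracy', 'test_f1_macro']].mean())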
'''
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

df = pd.read_csv('../datasets/diabetes.csv')
X = df.drop(['diabetes'], axis=1)
y = df['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                    random_state=42)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
'''

'''
INSTRUCTIONS
* Import roc_curve from sklearn.metrics.
* Using the logreg classifier, which has been fit to the training data, compute
  the predicted probabilities of the labels of the test set X_test. Save the
  result as y_pred_prob.
* Use the roc_curve() function with y_test and y_pred_prob and unpack the result
  into the variables fpr, tpr, and thresholds.
* Plot the ROC curve with fpr on the x-axis and tpr on the y-axis.
'''

# Import necessary modules
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Compute predicted probabilities of the positive class
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
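# Hedged extension beyond the exercise instructions: once fpr/tpr exist, the
# area under the ROC curve summarizes the whole plot in one number.
from sklearn.metrics import roc_auc_score
print("AUC: {:.3f}".format(roc_auc_score(y_test, y_pred_prob)))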
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_precision_recall_curve

def get_dataset():
    x, y = load_iris(return_X_y=True)
    random_state = np.random.RandomState(2020)
    n_samples, n_features = x.shape
    # add noise dimensions to the data to make the PR curve easier to observe
    x = np.concatenate([x, random_state.randn(n_samples, 200 * n_features)], axis=1)
    # keep only two classes for a binary-classification PR curve
    x_train, x_test, y_train, y_test = train_test_split(
        x[y < 2], y[y < 2], test_size=0.5, random_state=random_state)
    return x_train, x_test, y_train, y_test

if __name__ == '__main__':
    x_train, x_test, y_train, y_test = get_dataset()
    model = LogisticRegression()
    model.fit(x_train, y_train)
    y_scores = model.predict_proba(x_test)
    # p_r_curve() and compute_ap() are project-local helpers (a sketch follows below)
    precision, recall, _ = p_r_curve(y_test, y_scores[:, 1])
    ap = compute_ap(recall, precision)
    plt.plot(recall, precision, drawstyle="steps-post",
             label=f'LogisticRegression (AP={ap})')
    plt.legend(loc="lower left")
    plt.xlabel("Recall (Positive label: 1)")
    plt.ylabel("Precision (Positive label: 1)")
    # draw the same curve with sklearn's built-in helper for comparison
    plot_precision_recall_curve(model, x_test, y_test)
    plt.show()
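# p_r_curve() and compute_ap() above are defined elsewhere in the original
# project. A minimal sketch of what they plausibly do, built on sklearn's own
# functions (an assumption, not the author's implementation):
import numpy as np
from sklearn.metrics import precision_recall_curve

def p_r_curve(y_true, y_score):
    # precision/recall at every decision threshold
    return precision_recall_curve(y_true, y_score)

def compute_ap(recall, precision):
    # average precision as the step-wise area under the PR curve:
    # AP = sum_n (R_n - R_{n-1}) * P_n; recall is in decreasing order here
    return -np.sum(np.diff(recall) * np.array(precision)[:-1])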
# Create a pipeline that extracts features from the data then creates a model
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# create feature union: 3 PCA components plus the 6 univariate-best features
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)

# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression(solver='liblinear')))
model = Pipeline(estimators)

# evaluate pipeline with 10-fold cross-validation
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
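# Hedged sketch (an addition, not part of the original recipe): the union's
# output width is the PCA components plus the selected-best features, 3 + 6 = 9.
transformed = feature_union.fit_transform(X, Y)
print(transformed.shape)  # expected (768, 9) for the Pima dataset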
# Binarize: with the threshold set to 0.5, return the binarized data
b_data = Binarizer(threshold=0.5).fit_transform(boston.data)
print(b_data[0:5, :])

# One-hot encode the Boston target values (reshaped to a 2-D column, as
# OneHotEncoder expects) and return the encoded data
o_target = OneHotEncoder().fit_transform(boston.target.reshape(-1, 1))
print(o_target[0:5])

### Feature selection ###

# Variance-threshold selection; returns the data after feature selection.
# The threshold parameter is the variance cutoff.
VarianceThreshold(threshold=3).fit_transform(iris.data)

# Chi-squared test: select the K best features and return the reduced data
select_data = SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)

# Recursive feature elimination; returns the data after feature selection.
# estimator is the base model; n_features_to_select is the number to keep.
RFE(estimator=LogisticRegression(),
    n_features_to_select=2).fit_transform(iris.data, iris.target)

# Feature selection with an L1-penalized logistic regression as the base model
# (an L1-capable solver such as 'liblinear' must be specified explicitly;
# the default 'lbfgs' solver does not support penalty='l1')
SelectFromModel(
    LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
).fit_transform(iris.data, iris.target)

# Feature selection with GBDT as the base model
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)
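# Hedged companion sketch (not in the original): to see WHICH features a
# selector kept rather than just the transformed array, use get_support():
selector = SelectKBest(chi2, k=2).fit(iris.data, iris.target)
print(selector.get_support(indices=True))  # column indices of the chosen features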
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', svm.LinearSVC(C=1.0))])
svm_clf.fit(X_train, y_train)
svm_predicted = svm_clf.predict(X_test)
print(metrics.confusion_matrix(y_test, svm_predicted))
print(np.mean(svm_predicted == y_test))
print(metrics.classification_report(y_test, svm_predicted))

from sklearn.linear_model import LogisticRegression
lr_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', LogisticRegression())])
lr_clf.fit(X_train, y_train)
lr_predicted = lr_clf.predict(X_test)
print(metrics.confusion_matrix(y_test, lr_predicted))
print(np.mean(lr_predicted == y_test))
print(metrics.classification_report(y_test, lr_predicted))

# inspect the coefficients of a bare LogisticRegression fit on the tf-idf
# matrix; dtm is the document-term DataFrame built earlier in the script
lr_clf = LogisticRegression()
# lr_clf.fit(dtm, y_train)
lr_clf.fit(X_train_tf, y_train)
lr_clf_coef = (
    pd.DataFrame(lr_clf.coef_[0], index=dtm.columns)
    .rename(columns={0: 'Coefficient'})
)
def main():
    p = optparse.OptionParser()
    p.add_option('--attr', '-a', type=str, help='attribute')
    p.add_option('--attr_type', '-t', type=str, help='attribute type')
    p.add_option('--num_train_each', '-n', type=int,
                 help='number of training samples of True and False for the attribute (for total of 2n training samples)')
    p.add_option('--embedding', '-e', type=str, help='embedding (adj, normlap, regnormlap)')
    p.add_option('-k', type=int, help='number of eigenvalues')
    p.add_option('--sphere', '-s', action='store_true', default=False, help='normalize in sphere')
    p.add_option('--num_samples', '-S', type=int, default=50, help='number of Monte Carlo samples')
    p.add_option('-v', action='store_true', default=False, help='save plot')
    p.add_option('--jobs', '-j', type=int, default=-1, help='number of jobs')
    opts, args = p.parse_args()

    attr, attr_type, num_train_each, embedding, k, sphere, num_samples, save_plot, jobs = (
        opts.attr, opts.attr_type, opts.num_train_each, opts.embedding, opts.k,
        opts.sphere, opts.num_samples, opts.v, opts.jobs)

    folder = 'gplus0_lcc/baseline5/'
    suffix = '_normalize' if sphere else ''
    agg_precision_filename = folder + '%s_%s_n%d_%s_k%d%s_precision.csv' % (attr_type, attr, num_train_each, embedding, k, suffix)
    plot_filename = folder + '%s_%s_n%d_%s_k%d%s_precision.png' % (attr_type, attr, num_train_each, embedding, k, suffix)
    top_attrs_filename = folder + '%s_%s_n%d_%s_k%d%s_top_attrs.txt' % (attr_type, attr, num_train_each, embedding, k, suffix)

    print("\nNominating nodes whose '%s' attribute is '%s' (%d pos/neg seeds)..." % (attr_type, attr, num_train_each))
    print("\nLoading AttributeAnalyzer...")
    a = AttributeAnalyzer(load_data=False)
    sqrt_samples = np.sqrt(num_samples)

    try:
        agg_precision_df = pd.read_csv(agg_precision_filename)
        print("\nLoaded data from '%s'." % agg_precision_filename)
        selected_attrs = pd.read_csv('selected_attrs.csv')
        if attr in list(selected_attrs['attribute']):
            row = selected_attrs[selected_attrs['attribute'] == attr].iloc[0]
            num_true_in_test = row['freq'] - num_train_each
            num_test = row['totalKnown'] - 2 * num_train_each
        else:
            ind = a.get_attribute_indicator(attr, attr_type)
            num_true_in_test = len(ind[ind == 1]) - num_train_each
            num_test = ind.count() - 2 * num_train_each
    except OSError:
        print("\nLoading attribute data...")
        timeit(a.load_data)()
        # sim, delta, tau, num_rf_trees, num_boost_trees, classifiers, topN_*
        # are module-level settings defined elsewhere in the original script
        a.make_joint_attr_embedding_matrix(attr_type, sim=sim, embedding=embedding,
                                           delta=delta, tau=tau, k=k,
                                           sphere=2 if sphere else 0)
        # get attribute indicator for all the nodes
        attr_indicator = a.get_attribute_indicator(attr, attr_type)
        # prepare the classifiers
        rfc = RandomForestClassifier(n_estimators=num_rf_trees, n_jobs=jobs)
        boost = AdaBoostClassifier(n_estimators=num_boost_trees)
        logreg = LogisticRegression(n_jobs=jobs)
        gnb = GaussianNB()
        rfc_precision_df = pd.DataFrame(columns=range(num_samples))
        boost_precision_df = pd.DataFrame(columns=range(num_samples))
        logreg_precision_df = pd.DataFrame(columns=range(num_samples))
        gnb_precision_df = pd.DataFrame(columns=range(num_samples))
        # maintain top nominee attributes dictionary
        top_attrs = defaultdict(float)
        for s in range(num_samples):
            print("\nSEED = %d" % s)
            np.random.seed(s)
            print("\nObtaining feature vectors for random training and test sets...")
            ((train_in, train_out), (test_in, test_out)) = timeit(a.get_joint_PMI_training_and_test)(attr, attr_type, num_train_each)
            # train and predict
            print("\nTraining %d random forest trees..." % num_rf_trees)
            timeit(rfc.fit)(train_in, train_out)
            print("\nPredicting probabilities...")
            probs_rfc = timeit(rfc.predict_proba)(test_in)[:, 1]
            print("\nTraining %d AdaBoost trees..." % num_boost_trees)
            timeit(boost.fit)(train_in, train_out)
            print("\nPredicting probabilities...")
            probs_boost = timeit(boost.predict_proba)(test_in)[:, 1]
            print("\nTraining logistic regression...")
            timeit(logreg.fit)(train_in, train_out)
            print("\nPredicting probabilities...")
            probs_logreg = timeit(logreg.predict_proba)(test_in)[:, 1]
            print("\nTraining Naive Bayes...")
            timeit(gnb.fit)(train_in, train_out)
            print("\nPredicting probabilities...")
            probs_gnb = timeit(gnb.predict_proba)(test_in)[:, 1]

            test_df = pd.DataFrame(columns=['test', 'probs_rfc', 'probs_boost', 'probs_logreg', 'probs_gnb'])
            test_df['test'] = test_out
            test_df['probs_rfc'] = probs_rfc
            test_df['probs_boost'] = probs_boost
            test_df['probs_logreg'] = probs_logreg
            test_df['probs_gnb'] = probs_gnb

            # do vertex nomination: rank by predicted probability, then take
            # the cumulative precision at each rank
            test_df = test_df.sort_values(by='probs_rfc', ascending=False)
            rfc_precision_df[s] = np.asarray(test_df['test']).cumsum() / np.arange(1.0, len(test_out) + 1.0)
            test_df = test_df.sort_values(by='probs_boost', ascending=False)
            boost_precision_df[s] = np.asarray(test_df['test']).cumsum() / np.arange(1.0, len(test_out) + 1.0)
            test_df = test_df.sort_values(by='probs_logreg', ascending=False)
            logreg_precision_df[s] = np.asarray(test_df['test']).cumsum() / np.arange(1.0, len(test_out) + 1.0)
            test_df = test_df.sort_values(by='probs_gnb', ascending=False)
            gnb_precision_df[s] = np.asarray(test_df['test']).cumsum() / np.arange(1.0, len(test_out) + 1.0)

            # determine top attributes from the classifier that is most precise
            # at rank topN_nominees (the original indexed classifiers[i] here,
            # which always picked the last classifier; best_i is the intent)
            best_i, best_prec = -1, -1.0
            for (i, prec_series) in enumerate([rfc_precision_df[s], boost_precision_df[s], logreg_precision_df[s], gnb_precision_df[s]]):
                if prec_series[topN_nominees - 1] > best_prec:
                    best_i, best_prec = i, prec_series[topN_nominees - 1]
            test_df = test_df.sort_values(by='probs_%s' % classifiers[best_i], ascending=False)
            for node in test_df.index[:topN_nominees]:
                attrs = a.attrs_by_node_by_type[attr_type][node]
                for at in attrs:
                    top_attrs[at] += 1.0 / len(attrs)  # divide the vote equally among all attributes
            sys.stdout.flush()  # flush the output buffer

        # compute means and standard errors over all the samples
        agg_precision_df = pd.DataFrame(columns=['mean_rfc_prec', 'stderr_rfc_prec',
                                                 'mean_boost_prec', 'stderr_boost_prec',
                                                 'mean_logreg_prec', 'stderr_logreg_prec',
                                                 'mean_gnb_prec', 'stderr_gnb_prec',
                                                 'max_mean_prec'])
        agg_precision_df['mean_rfc_prec'] = rfc_precision_df.mean(axis=1)
        agg_precision_df['stderr_rfc_prec'] = rfc_precision_df.std(axis=1) / sqrt_samples
        agg_precision_df['mean_boost_prec'] = boost_precision_df.mean(axis=1)
        agg_precision_df['stderr_boost_prec'] = boost_precision_df.std(axis=1) / sqrt_samples
        agg_precision_df['mean_logreg_prec'] = logreg_precision_df.mean(axis=1)
        agg_precision_df['stderr_logreg_prec'] = logreg_precision_df.std(axis=1) / sqrt_samples
        agg_precision_df['mean_gnb_prec'] = gnb_precision_df.mean(axis=1)
        agg_precision_df['stderr_gnb_prec'] = gnb_precision_df.std(axis=1) / sqrt_samples
        agg_precision_df['max_mean_prec'] = agg_precision_df[['mean_rfc_prec', 'mean_boost_prec', 'mean_logreg_prec', 'mean_gnb_prec']].max(axis=1)

        # save the aggregate data frames
        N_save = min(len(test_out), topN_save)
        agg_precision_df = agg_precision_df[:N_save]
        agg_precision_df.to_csv(agg_precision_filename, index=False)
        top_attrs_df = pd.DataFrame(list(top_attrs.items()), columns=['attribute', 'voteProportion'])
        top_attrs_df = top_attrs_df.set_index('attribute')
        top_attrs_df['voteProportion'] /= top_attrs_df['voteProportion'].sum()
        top_attrs_df = top_attrs_df.sort_values(by='voteProportion', ascending=False)
        with open(top_attrs_filename, 'w') as f:
            f.write(str(top_attrs_df))
        num_true_in_test = test_out.sum()
        num_test = len(test_out)

    # plot the nomination precision with +/- 2 standard-error bands
    if save_plot:
        N_plot = min(len(agg_precision_df), topN_plot)
        plt.fill_between(agg_precision_df.index,
                         agg_precision_df['mean_rfc_prec'] - 2 * agg_precision_df['stderr_rfc_prec'],
                         agg_precision_df['mean_rfc_prec'] + 2 * agg_precision_df['stderr_rfc_prec'],
                         color='green', alpha=0.25)
        rfc_plot, = plt.plot(agg_precision_df.index, agg_precision_df['mean_rfc_prec'],
                             color='green', linewidth=2, label='Random Forest')
        plt.fill_between(agg_precision_df.index,
                         agg_precision_df['mean_boost_prec'] - 2 * agg_precision_df['stderr_boost_prec'],
                         agg_precision_df['mean_boost_prec'] + 2 * agg_precision_df['stderr_boost_prec'],
                         color='blue', alpha=0.25)
        boost_plot, = plt.plot(agg_precision_df.index, agg_precision_df['mean_boost_prec'],
                               color='blue', linewidth=2, label='AdaBoost')
        plt.fill_between(agg_precision_df.index,
                         agg_precision_df['mean_logreg_prec'] - 2 * agg_precision_df['stderr_logreg_prec'],
                         agg_precision_df['mean_logreg_prec'] + 2 * agg_precision_df['stderr_logreg_prec'],
                         color='red', alpha=0.25)
        logreg_plot, = plt.plot(agg_precision_df.index, agg_precision_df['mean_logreg_prec'],
                                color='red', linewidth=2, label='Logistic Regression')
        plt.fill_between(agg_precision_df.index,
                         agg_precision_df['mean_gnb_prec'] - 2 * agg_precision_df['stderr_gnb_prec'],
                         agg_precision_df['mean_gnb_prec'] + 2 * agg_precision_df['stderr_gnb_prec'],
                         color='orange', alpha=0.25)
        gnb_plot, = plt.plot(agg_precision_df.index, agg_precision_df['mean_gnb_prec'],
                             color='orange', linewidth=2, label='Naive Bayes')
        guess_rate = num_true_in_test / num_test
        guess, = plt.plot([guess_rate for i in range(N_plot)], linestyle='dashed',
                          linewidth=2, color='black', label='Guess')
        plt.xlabel('rank')
        plt.ylabel('precision')
        plt.xlim((0.0, N_plot))
        plt.ylim((0.0, 1.0))
        plt.title('Vertex Nomination Precision')
        plt.legend(handles=[rfc_plot, boost_plot, logreg_plot, gnb_plot, guess])
        plt.savefig(plot_filename)

    print("\nDone!")
robust_scaler = RobustScaler()
X = robust_scaler.fit_transform(X)
y = df[target_name]

# sweep the test-set fraction from 10% to 54%
for i in range(10, 55):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i/100,
                                                        random_state=123, stratify=y)
    print(i/100)  # current test-set fraction

    # Check accuracy of base rate model
    y_base_rate = base_rate_model(X_test)
    from sklearn.metrics import accuracy_score
    acc_score = accuracy_score(y_test, y_base_rate)
    print("Base rate accuracy is %2.2f" % acc_score)

    # Check accuracy of Logistic Model
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2', C=1)
    model.fit(X_train, y_train)
    print("Logistic accuracy is %2.2f" % accuracy_score(y_test, model.predict(X_test)))

    # Using 10-fold cross-validation to train our Logistic Regression model
    # (shuffle=True is required when passing random_state in recent scikit-learn)
    from sklearn import model_selection
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
    modelCV = LogisticRegression(class_weight="balanced")
    scoring = 'roc_auc'
    results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold,
                                              scoring=scoring)
    print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))
    print("Model Updated")
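# base_rate_model() is defined elsewhere in the original script. A minimal
# sketch of a majority-class baseline consistent with how it is used above
# (an assumption, not the author's code); scikit-learn's
# DummyClassifier(strategy='most_frequent') is the idiomatic equivalent.
import numpy as np

def base_rate_model(X):
    # predict the negative class for every row
    return np.zeros(len(X), dtype=int)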
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Load the iris dataset (1936!) - https://archive.ics.uci.edu/ml/datasets/iris
# 150 samples of 3 different types of irises (Setosa, Versicolour and Virginica).
# The rows are the samples and the columns are: Sepal Length, Sepal Width,
# Petal Length and Petal Width.
dataset = datasets.load_iris()
print(dataset.data.shape)
print(dataset.data[:10])
print(dataset.target.shape)
print(dataset.target[:10])

# Fit a logistic regression model to the data
model = LogisticRegression(solver='liblinear', multi_class='auto')
model.fit(dataset.data, dataset.target)

# Save model for future use (sklearn.externals.joblib was removed in
# scikit-learn 0.23; import joblib directly instead)
import joblib
joblib.dump(model, 'irismodel.pkl')

# Make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# Display metrics
# Precision measures the impact of false positives: TP/(TP+FP)
# Recall measures the impact of false negatives: TP/(TP+FN)
# F1 is the weighted average of precision and recall: (2*Recall*Precision)/(Recall+Precision)
print(metrics.classification_report(expected, predicted))
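# Hedged usage sketch (not part of the original script): reloading the
# persisted model later.
import joblib
loaded = joblib.load('irismodel.pkl')
print(loaded.predict(dataset.data[:5]))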
################# matplotlib Korean font setup ######################
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(
    fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
#####################################################################

import matplotlib.pyplot as plt
import mglearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 2, figsize=(10, 3))
for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    clf = model.fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=False, eps=0.5, ax=ax, alpha=.7)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{}".format(clf.__class__.__name__))
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend()
plt.show()
# knn
model_knn = KNeighborsClassifier(n_neighbors=10)
model_knn.fit(train_vectors, training_task_a_labels_list)
prediction_knn = model_knn.predict(test_vectors)
print("\nClassification report for K Nearest Neighbour")
print(classification_report(test_task_a_labels_list, prediction_knn))
accuracy_knn = round(
    accuracy_score(test_task_a_labels_list, prediction_knn) * 100, 2)
print("Accuracy (KNN) = " + str(accuracy_knn) + " %")
print("\nConfusion Matrix (KNN)")
cf_matrix_knn = confusion_matrix(test_task_a_labels_list, prediction_knn)
print(cf_matrix_knn)

# logistic regression
model_lr = LogisticRegression()
model_lr.fit(train_vectors, training_task_a_labels_list)
prediction_lr = model_lr.predict(test_vectors)
print("\nClassification report for Logistic Regression")
print(classification_report(test_task_a_labels_list, prediction_lr))
accuracy_lr = round(
    accuracy_score(test_task_a_labels_list, prediction_lr) * 100, 2)
print("Accuracy (LR) = " + str(accuracy_lr) + " %")
print("\nConfusion Matrix (LR)")
cf_matrix_lr = confusion_matrix(test_task_a_labels_list, prediction_lr)
print(cf_matrix_lr)

labels = ['NOT', 'OFF']

# graph plot svm
import numpy as np
from time import time
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
# (sklearn.cross_validation was removed in scikit-learn 0.20;
#  train_test_split now lives in sklearn.model_selection)

smote = SMOTE(kind='borderline1', ratio='auto', k=10)  # old imbalanced-learn API

param_grid = [{
    'penalty': ['l2'],
    'C': [0.001, 0.01, 0.1, 1.0, 10, 100],
    'class_weight': ['balanced'],
    'max_iter': [100, 200, 500, 800, 1000],
    'solver': ['liblinear', 'newton-cg', 'lbfgs'],
    'multi_class': ['ovr'],
    'tol': [1e-4, 1e-3, 1e-2]
}]
clf = LogisticRegression()
start = time()
f1_scorer = make_scorer(f1_score)

# Oversampling: split first, then oversample only the training fold
X_tr, X_te, y_tr, y_te = train_test_split(df_reduced_train.values, y_train,
                                          test_size=0.3, stratify=y_train)
X_tr, y_tr = smote.fit_sample(X_tr, y_tr)  # fit_resample in imblearn >= 0.4

gs = GridSearchCV(clf, param_grid, scoring=f1_scorer, n_jobs=-1)
gs.fit(X_tr, y_tr)
y_pred = gs.predict(X_te)
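# The SMOTE call above uses the old imbalanced-learn API (kind=..., k=...,
# fit_sample). A hedged sketch of the modern equivalent, assuming
# imbalanced-learn >= 0.6:
from imblearn.over_sampling import BorderlineSMOTE
smote_new = BorderlineSMOTE(kind='borderline-1', k_neighbors=10)
X_res, y_res = smote_new.fit_resample(X_tr, y_tr)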
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9643563265868098
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    LogisticRegression(C=0.001, dual=False, penalty="l2")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
clf_ext = ExtraTreesClassifier(
    n_estimators=240,
    max_depth=15,
    min_samples_split=0.03,
    min_samples_leaf=5,
    max_features=24,
    class_weight='balanced',
    n_jobs=4,
    bootstrap=True,
    oob_score=True,
)

# meta-classifier as logistic regression
lr_stack = LogisticRegression(class_weight='balanced', solver='sag',
                              max_iter=10000, n_jobs=4, verbose=2)
xgb_stack = xgb.XGBClassifier(learning_rate=0.1, n_estimators=600, max_depth=5,
                              subsample=0.8, colsample_bytree=0.8,
                              scale_pos_weight=1, n_jobs=4)

def train_SVM(estimator, trainX, trainY, method, n_jobs=4, skip=False):
    # SVM
    logger = misc.init_logger(method)
import pandas as pd
from create_dataset import df
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

X = df['data']
y = df['labels']

# X_train, X_test, y_train, y_test = train_test_split(X, pd.get_dummies(y), random_state=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

model_names = ['LogisticRegression()']
pattern = '[A-Za-z0-9]+(?=\\s+)'

for model in model_names:
    """
    TO-DOs
    1) convert labels to numbers using pd.get_dummies() ?
    2) convert text data to numeric features.
    3) Add multiple models
    """
    pl = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('clf', LogisticRegression())
    ])
    pl.fit(X_train, y_train)
    accuracy = pl.score(X_test, y_test)
    print("Accuracy for {} is {}".format(model, accuracy))
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def create_logistic_vectorizer():
    # binary bag-of-words features feeding a logistic regression
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([('vectorizer', vectorizer), ('lr', lr)])
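# Hedged usage sketch for the factory above, on made-up toy data:
texts = ["good movie", "bad movie", "great film", "terrible film"]
labels = [1, 0, 1, 0]
clf = create_logistic_vectorizer()
clf.fit(texts, labels)
print(clf.predict(["good film"]))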
# Encoding the categorical variables
# for SVM classification
train_y_svm = train_y
test_y_svm = test_y

# for other types of classification: one-hot indicator matrices
train_y = pd.get_dummies(train_y)
# train_y_binary = pd.get_dummies(train_y_binary)
# train_y_binary = train_y_svm['benign']
test_y = pd.get_dummies(test_y)
# test_y_binary_num = pd.get_dummies(test_y_binary)

# Applying Logistic Regression for multiclass classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
ovr_clf = OneVsRestClassifier(LogisticRegression())
ovr_clf.fit(train_x, train_y)
ovr_predicted = ovr_clf.predict(test_x)

from sklearn.metrics import confusion_matrix, precision_score, f1_score
ovr_confusion_matrix = confusion_matrix(test_y.values.argmax(axis=1),
                                        ovr_predicted.argmax(axis=1))
precision_ovr = precision_score(test_y, ovr_predicted, average='micro')  # gives 95.76 percent accuracy
precision_ovr_all = precision_score(test_y, ovr_predicted, average=None)  # gives 94.98, 96.52
f1_lgd = f1_score(test_y, ovr_predicted, average='micro')  # gives 95.76 percent accuracy

# Applying SVM for multiclass classification
from sklearn.svm import LinearSVC
# print(len(X_test))
# knnn.fit(X_train, y_train)
# y_pred3 = knnn.predict(X_test)
# accuracy3 = ((y_test == y_pred3).sum() / len(y_test) * 100)
# print('accuracy %.2f' % accuracy3)

# earlier, non-grid-search draft kept for reference:
# print("--------- Logistic Regression ---------")
# logreg = LogisticRegression(multi_class='auto')
# logreg.fit(X_train, y_train)
# y_pred4 = logreg.predict(X_test)
# y_pred_lr_prob = logreg.predict_log_proba(X_test)
# print(y_pred_lr_prob.shape)
# print(y_pred_lr_prob)
# accuracy4 = ((y_test == y_pred4).sum() / len(y_test) * 100)

print("--------- Logistic Regression ---------")
logreg = LogisticRegression(multi_class='auto')
parameter_grid = {
    'tol': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50, 100, 200, 500, 1000, 2000, 5000, 10000]}
gs = GridSearchCV(logreg, param_grid=parameter_grid, cv=3)
gs.fit(X_train, y_train)
clf = gs.best_estimator_
clf.fit(X_train, y_train)
print('Train accuracy: %.3f' % gs.best_score_)
print('Best Parameter: ', gs.best_params_)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))

print("--------- SVM ---------")
svc = svm.SVC(kernel="linear", random_state=1, C=1)
parameter_grid = [{'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 50, 100, 200, 500, 1000],
                   'gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 50, 100, 200, 500, 1000],
                   'kernel': ['rbf']},
                  {'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 50, 100, 200, 500, 1000],
                   'kernel': ['linear']}]
gs = GridSearchCV(svc, param_grid=parameter_grid, cv=3)
gs.fit(X_train, y_train)
"""KNN Classifier"""
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
Accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(Accuracy * 100, '%')

"""Logistic Regression"""
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
Accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(Accuracy * 100, '%')

"""Support Vector Machine"""
from sklearn.svm import SVC
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
Accuracy = accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(Accuracy * 100, '%')
    accuracy = cross_val_score(estimator, X_train, y_train, scoring='accuracy', cv=cv).mean()
    recall = cross_val_score(estimator, X_train, y_train, scoring='recall', cv=cv).mean()
    print("{}: auc:{:f}, recall:{:f}, accuracy:{:f}".format(name, auc, recall, accuracy))
    # skplt.plot_learning_curve(estimator, X_train, y_train)
    # plt.show()
    # estimator.fit(X_train, y_train)
    # y_probas = estimator.predict_proba(X_train)
    # skplt.plot_roc_curve(y_true=y_train, y_probas=y_probas)
    # plt.show()

estimate(XGBClassifier(learning_rate=0.1, n_estimators=20, objective='binary:logistic'), 'XGBClassifier')
estimate(RidgeClassifier(), 'RidgeClassifier')
estimate(LogisticRegression(), 'LogisticRegression')
# estimate(RandomForestClassifier(), 'RandomForestClassifier')
estimate(AdaBoostClassifier(), 'AdaBoostClassifier')
# estimate(SVC(), 'SVC')  # too long to wait
# estimate(LinearSVC(), 'LinearSVC')

# XGBClassifier:      auc:0.747668, recall:0.000000, accuracy:0.944575
# RidgeClassifier:    auc:0.754218, recall:0.000000, accuracy:0.944433
# LogisticRegression: auc:0.758454, recall:0.015424, accuracy:0.942010
# AdaBoostClassifier: auc:0.784086, recall:0.013495, accuracy:0.943791

from sklearn.ensemble import VotingClassifier
estimators = []
# estimators.append(('RidgeClassifier', RidgeClassifier()))
estimators.append(('LogisticRegression', LogisticRegression()))
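# A hedged completion of the ensemble being assembled above (the original
# snippet is cut off here); soft voting averages the estimators' predicted
# probabilities, so every member needs predict_proba:
estimators.append(('AdaBoostClassifier', AdaBoostClassifier()))
voting = VotingClassifier(estimators=estimators, voting='soft')
estimate(voting, 'VotingClassifier')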
# To try: XGBoost or other boosting tree techniques

# Compare Algorithms
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# prepare configuration for cross validation test harness
seed = 7

# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# rename data
X = attribute
Y = drugs.iloc[:, :17]

# evaluate each model in turn (see the sketch below for the evaluation loop)
results = []
names = []
scoring = 'accuracy'
count = 0
# n = 0
# val = []
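# Hedged sketch of the standard "compare algorithms" loop that the lists above
# set up (the original is truncated here); note Y must be a 1-D label vector
# for these classifiers:
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

# boxplot algorithm comparison
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()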
        # c40 += 1
        x.append(list(dict['BlogVector']))
        y.append(age)

# print(c13, c20, c30, c40)
x_train = x[:16000]
y_train = y[:16000]
x_test = x[16000:]
y_test = y[16000:]
# print(len(x))
# print(len(y))

lr1_age_clf = LogisticRegression()
lr1_age_clf.fit(x_train, y_train)

blog = ""
print(lr1_age_clf.predict([blog]))  # note: the model expects a feature vector, not raw text

# y_pred is needed for the metrics below (it was commented out in the original)
y_pred = lr1_age_clf.predict(x_test)

from sklearn import metrics
print("\nLogistic Regression Accuracy: ", lr1_age_clf.score(x_test, y_test))
print("\nConfusion Matrix:\n", metrics.confusion_matrix(
    y_test, y_pred, labels=['teens', 'twenties', 'thirties', 'forties']))
print("\nClassification Report:\n", metrics.classification_report(
    y_test, y_pred, labels=['teens', 'twenties', 'thirties', 'forties']))
def main():
    parser = argparse.ArgumentParser()

    ## required parameters
    parser.add_argument("--memo", default='running Li model with data using adv training',
                        type=str, required=False)
    parser.add_argument("--model_name", type=str, default='bert_Li', required=False)
    parser.add_argument("--data_dir", type=str, default="./data/data_set", required=False)
    parser.add_argument("--out_dir", type=str, default="./Limodel_roberta_adv", required=False)
    parser.add_argument("--pretrained_model_path", type=str,
                        default="./chinese_roberta_small", required=False)
    # note: argparse type=bool does not parse the string "False" as False;
    # a store_true flag is the more robust choice for these options
    parser.add_argument("--to_resume_model", type=bool, default=False, required=False)
    parser.add_argument("--resume_model_path", type=str,
                        default="./Limodel_roberta_adv/pytorch_model.bin", required=False)
    parser.add_argument("--num_labels", type=int, default=3, required=False)

    ## other parameters
    parser.add_argument("--output_hidden_states", type=bool, default=True, required=False)
    parser.add_argument("--do_kfold", action="store_true", default=False, required=False)
    parser.add_argument("--do_ensemble", action="store_true", default=False, required=False)
    parser.add_argument("--do_train", action="store_true", default=True, required=False)
    parser.add_argument("--do_eval", action="store_true", default=True, required=False)
    parser.add_argument("--test", action="store_true", default=True, required=False)
    parser.add_argument("--folds", type=int, default=5, required=False)
    parser.add_argument("--epochs", type=int, default=5, required=False)
    parser.add_argument("--weight", default=[1.0, 1.0, 1.0], type=list, required=False,
                        help='the weight for cross-entropy')
    parser.add_argument("--ensemble_models",
                        default=["bert_SPRNN", "bert_RCNN", "bert_RNN", "bert_CNN", "bert_Li"],
                        required=False, help='the ensemble model names')
    parser.add_argument("--weight_list",
                        default=[[1.0, 1.0, 1.0], [2.0, 1.0, 1.0], [2.0, 2.0, 1.0],
                                 [4.0, 2.0, 1.0], [1.0, 2.0, 1.0]],
                        type=list, required=False,
                        help='weight list used in the ensemble mode')
    parser.add_argument("--batch_size", type=int, default=32, required=False)
    parser.add_argument("--eval_batch_size", default=64, type=int, required=False)
    parser.add_argument("--max_seq_len", type=int, default=512, required=False)
    parser.add_argument("--title_seq_len", type=int, default=32, required=False)
    parser.add_argument("--content_seq_len", type=int, default=512, required=False)
    parser.add_argument("--no_cuda", default=False, action="store_true", required=False)
    parser.add_argument("--log_dir", default=None, type=str, required=False)
    parser.add_argument("--dev_loss", default=0, type=float, required=False)
    parser.add_argument("--seed", default=42, type=int, required=False)
    parser.add_argument("--do_lower_case", action="store_true", default=False, required=False)
    parser.add_argument("--optimize_steps", type=int, default=20000, required=False)
    parser.add_argument("--learning_rate", default=5e-6, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--weight_decay", default=1e-3, type=float,
                        help="L2 regularization.")
    # parser.add_argument("--max_grad_norm", default=1.0, type=float,
    #                     help="Max gradient norm.")
    # parser.add_argument("--num_train_epochs", default=12.0, type=float,
    #                     help="Total number of training epochs to perform.")
Override num_train_epochs.") parser.add_argument("--eval_steps", default=200, type=int, required=False, help="") parser.add_argument("--lstm_hidden_size", default=512, type=int, help="") parser.add_argument("--lstm_layers", default=1, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--linear_hidden_size", default=1024, type=float, help="") parser.add_argument("--kernel_sizes", default=[2, 3, 4, 5], type=list, help="set the kernel sizes for cnn model") parser.add_argument("--out_channels", default=256, type=int, help="set the out channel for cnn model") parser.add_argument("--kmax", default=2, type=float, help="set the features from kmax") parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list:" ) # parser.add_argument("--report_steps", default=-1, type=int, # help="") # parser.add_argument("--warmup_steps", default=0, type=int, # help="Linear warmup over warmup_steps.") # parser.add_argument("--split_num", default=3, type=int, # help="text split") # parser.add_argument('--logging_steps', type=int, default=50, # help="Log every X updates steps.") args = parser.parse_args() args.n_gpus = torch.cuda.device_count() if not os.path.isdir(args.out_dir): os.mkdir(args.out_dir) #prepare output directory if args.do_ensemble: stacker = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', class_weight={ 0: 5.0, 1: 1.0, 2: 1.0 }) stacking_models(args, stacker) return if args.do_kfold: kfold_train(args) return if args.do_train: train(args) args.do_train = False ### if args.do_eval: predict(args, is_eval=True) ##output the dev set result if args.test: predict(args)
previsores[:, 8] = labelencoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelencoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelencoder_previsores.fit_transform(previsores[:, 13])

onehotencoder = ColumnTransformer(
    # the column numbers to be transformed (here [0], but it can be e.g. [0, 1, 3])
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough'  # leave the rest of the columns untouched
)  # replaces the removed OneHotEncoder(categorical_features=[1,3,5,6,7,8,9,13]) API
previsores = onehotencoder.fit_transform(previsores).toarray()

labelencoder_classe = LabelEncoder()
classe = labelencoder_classe.fit_transform(classe)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = \
    train_test_split(previsores, classe, test_size=0.15, random_state=0)

from sklearn.linear_model import LogisticRegression
classificador = LogisticRegression()
classificador.fit(previsores_treinamento, classe_treinamento)
previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
precisao = accuracy_score(classe_teste, previsoes)
matriz = confusion_matrix(classe_teste, previsoes)

import collections
collections.Counter(classe_teste)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# the first 12500 reviews are positive, the rest negative (IMDB layout)
target = [1 if i < 12500 else 0 for i in range(25000)]

# split the train set into training data and validation data
X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75)

accuracy = []
regularization = [0.01, 0.05, 0.25, 0.5, 1]
for c in regularization:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    accuracy.append(accuracy_score(y_val, lr.predict(X_val)))
    print("Accuracy for C=%s: %s" % (c, accuracy[-1]))

# train the final model with the best-scoring C
highestC = [regularization[i] for i in range(len(regularization))
            if accuracy[i] == max(accuracy)][0]
final_model = LogisticRegression(C=highestC)
final_model.fit(X, target)
# the test set follows the same 12500/12500 label layout, so target doubles
# as the test labels here
y_pred = final_model.predict(X_test)
print("Final Accuracy: %s" % accuracy_score(target, y_pred))
print(str(highestC))
accuracy.append(accuracy_score(target, final_model.predict(X_test)))
regularization.append('testError C:' + str(highestC))
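# Hedged alternative to the manual C loop above: LogisticRegressionCV runs the
# cross-validated search over C internally.
from sklearn.linear_model import LogisticRegressionCV
lr_cv = LogisticRegressionCV(Cs=[0.01, 0.05, 0.25, 0.5, 1], cv=5)
lr_cv.fit(X_train, y_train)
print("Best C:", lr_cv.C_[0])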
def return_model(mode, **kwargs):
    if inspect.isclass(mode):
        assert getattr(mode, 'fit', None) is not None, 'Custom model family should have a fit() method'
        model = mode(**kwargs)
    elif mode == 'logistic':
        solver = kwargs.get('solver', 'liblinear')
        n_jobs = kwargs.get('n_jobs', None)
        max_iter = kwargs.get('max_iter', 5000)
        model = LogisticRegression(solver=solver, n_jobs=n_jobs,
                                   max_iter=max_iter, random_state=666)
    elif mode == 'Tree':
        model = DecisionTreeClassifier(random_state=666)
    elif mode == 'RandomForest':
        n_estimators = kwargs.get('n_estimators', 50)
        model = RandomForestClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'GB':
        n_estimators = kwargs.get('n_estimators', 50)
        model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'AdaBoost':
        n_estimators = kwargs.get('n_estimators', 50)
        model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'SVC':
        kernel = kwargs.get('kernel', 'rbf')
        model = SVC(kernel=kernel, random_state=666)
    elif mode == 'LinearSVC':
        model = LinearSVC(loss='hinge', random_state=666)
    elif mode == 'GP':
        model = GaussianProcessClassifier(random_state=666)
    elif mode == 'KNN':
        n_neighbors = kwargs.get('n_neighbors', 5)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif mode == 'NB':
        model = MultinomialNB()
    elif mode == 'linear':
        # LinearRegression has no random_state parameter (the original passed one)
        model = LinearRegression()
    elif mode == 'ridge':
        alpha = kwargs.get('alpha', 1.0)
        model = Ridge(alpha=alpha, random_state=666)
    elif 'conv' in mode:
        tf.reset_default_graph()
        address = kwargs.get('address', 'weights/conv')
        hidden_units = kwargs.get('hidden_layer_sizes', [20])
        activation = kwargs.get('activation', 'relu')
        weight_decay = kwargs.get('weight_decay', 1e-4)
        learning_rate = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 1000)
        early_stopping = kwargs.get('early_stopping', 10)
        warm_start = kwargs.get('warm_start', False)
        batch_size = kwargs.get('batch_size', 256)
        kernel_sizes = kwargs.get('kernel_sizes', [5])
        strides = kwargs.get('strides', [5])
        channels = kwargs.get('channels', [1])
        validation_fraction = kwargs.get('validation_fraction', 0.)
        global_averaging = kwargs.get('global_averaging', 0.)
        optimizer = kwargs.get('optimizer', 'sgd')
        if mode == 'conv':
            model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter,
                            learning_rate=learning_rate, weight_decay=weight_decay,
                            validation_fraction=validation_fraction,
                            early_stopping=early_stopping, optimizer=optimizer,
                            warm_start=warm_start, address=address,
                            hidden_units=hidden_units, strides=strides,
                            global_averaging=global_averaging,
                            kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
        elif mode == 'conv_reg':
            model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter,
                            learning_rate=learning_rate, weight_decay=weight_decay,
                            validation_fraction=validation_fraction,
                            early_stopping=early_stopping, optimizer=optimizer,
                            warm_start=warm_start, address=address,
                            hidden_units=hidden_units, strides=strides,
                            global_averaging=global_averaging,
                            kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
    elif 'NN' in mode:
        solver = kwargs.get('solver', 'adam')
        hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,))
        if isinstance(hidden_layer_sizes, list):
            hidden_layer_sizes = list(hidden_layer_sizes)
        activation = kwargs.get('activation', 'relu')
        learning_rate_init = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 5000)
        early_stopping = kwargs.get('early_stopping', False)
        warm_start = kwargs.get('warm_start', False)
        if mode == 'NN':
            model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                  activation=activation, learning_rate_init=learning_rate_init,
                                  warm_start=warm_start, max_iter=max_iter,
                                  early_stopping=early_stopping)
        elif mode == 'NN_reg':
            # elif (not a second bare if) so that mode == 'NN' does not fall
            # through to the "Invalid mode" error, as it did in the original
            model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                 activation=activation, learning_rate_init=learning_rate_init,
                                 warm_start=warm_start, max_iter=max_iter,
                                 early_stopping=early_stopping)
    else:
        raise ValueError("Invalid mode!")
    return model
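# Hedged usage sketch for return_model(): 'logistic' maps to the
# LogisticRegression branch above; X_train/y_train/X_test/y_test are assumed
# to exist in the calling scope.
model = return_model('logistic', solver='liblinear', max_iter=1000)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))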
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score

# Classifiers
# 1 DecisionTreeClassifier
clf_tree = tree.DecisionTreeClassifier()
# 2 KNeighborsClassifier
clf_neigh = KNeighborsClassifier()
# 3 LogisticRegression
clf_logReg = LogisticRegression()
# 4 NaiveBayes
clf_gnb = GaussianNB()
# 5 SupportVectorMachine (SVM)
clf_svm = svm.SVC()

# Data set [height, weight, shoe size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42],
     [181, 85, 43]]
Y = ['male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
     'female', 'male', 'male']
# feature matrix: columns 2 and 3 of the dataset
X = dataset.iloc[:, [2, 3]]
# dependent-variable array
Y = dataset.iloc[:, 4]

# split the dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

# feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# logistic regression on the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, Y_train)

# predict the test-set results
y_pred = classifier.predict(X_test)

# build the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)

# visualize the decision regions on the test set
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, Y_test
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
# the original snippet is cut off here; the call is closed with the usual
# two-color template arguments
plt.contourf(X1, X2,
             classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
X, Y = data[list(range(4))], data[4]
# encode the text labels, e.g. a b c become 0 1 2; the original values can be
# recovered via pd.Categorical(y).categories
Y = pd.Categorical(Y).codes
X = data[[0, 1]]  # keep only the first and second columns

# 3. data split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=0)

# 4. model construction and training
# 4.1: SVM classifier
svm = SVC(C=1, kernel='linear')
svm.fit(X_train, Y_train)
# 4.2: LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, Y_train)
# 4.3: RidgeClassifier
rc = RidgeClassifier()
rc.fit(X_train, Y_train)
# 4.4: KNN
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)

# 5. model evaluation
svm_score1 = accuracy_score(Y_train, svm.predict(X_train))
svm_score2 = accuracy_score(Y_test, svm.predict(X_test))
lr_score1 = accuracy_score(Y_train, lr.predict(X_train))
lr_score2 = accuracy_score(Y_test, lr.predict(X_test))
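# Hedged completion (the original snippet stops mid-evaluation): score the
# remaining two models the same way and report all four.
rc_score1 = accuracy_score(Y_train, rc.predict(X_train))
rc_score2 = accuracy_score(Y_test, rc.predict(X_test))
knn_score1 = accuracy_score(Y_train, knn.predict(X_train))
knn_score2 = accuracy_score(Y_test, knn.predict(X_test))
print('SVM   train/test accuracy: %.3f / %.3f' % (svm_score1, svm_score2))
print('LR    train/test accuracy: %.3f / %.3f' % (lr_score1, lr_score2))
print('Ridge train/test accuracy: %.3f / %.3f' % (rc_score1, rc_score2))
print('KNN   train/test accuracy: %.3f / %.3f' % (knn_score1, knn_score2))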
text.apply(lambda x: len(x.split(' '))).sum()

#####
x_train, x_test, y_train, y_test = train_test_split(text, labels, test_size=0.2,
                                                    random_state=42)
# note: x_train is still raw text at this point; .todense() (as in the original)
# only applies after vectorization produces a sparse matrix, so it is dropped here

####
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                   ])
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=["not CADR", "CADR"]))

# accuracy 0.7647058823529411
#               precision    recall  f1-score   support
#     not CADR       0.66      0.81      0.72        26
#         CADR       0.86      0.74      0.79        42