def main(argv): X = np.load('trainingdata2.npy') y = np.load('trainingdatalabel2.npy') labels = np.unique(y) logreg = LogisticRegression(C=1e5) logreg.fit(X,y) score1 = logreg.score(X,y) newval = createceps(sys.argv[1]) outp = logreg.predict(newval) clf = svm.SVC(kernel='rbf', C = 1.0) clf.fit(X,y) score2 = clf.score(X,y) confidence1 = logreg.decision_function(newval) confidence2 = clf.decision_function(newval) outp2 = clf.predict(newval) if outp[0]==1 : print "Speaker is Angry" image = Image.open('angry.png') image.show() elif outp[0] ==2 : print "Speaker is scared" image = Image.open('scared.png') image.show() elif outp[0] ==3 : print "Speaker is happy" image = Image.open('happy.jpg') image.show() elif outp[0] ==4 : print "Speaker is neutral" image = Image.open('neutral.png') image.show() elif outp[0] ==5 : print "Speaker is sad" image = Image.open('sad.jpg') image.show() if outp2[0]==1 : print "Speaker is Angry" image = Image.open('angry.png') image.show() elif outp2[0] ==2 : print "Speaker is scared" image = Image.open('scared.png') image.show() elif outp2[0] ==3 : print "Speaker is happy" image = Image.open('happy.jpg') image.show() elif outp2[0] ==4 : print "Speaker is neutral" image = Image.open('neutral.png') image.show() elif outp2[0] ==5 : print "Speaker is sad" image = Image.open('sad.jpg') image.show() print "Accuracy of logistic regression %f"%score1 print confidence1[0] print "Accuracy of SVM Classifier %f"%score2 print confidence2[0]
def test_thresholded_scorers():
    """Test scorers that take thresholds."""
    # Binary problem: two blob centers.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    # The scorer, decision_function and predict_proba routes must all
    # produce the same ROC-AUC value.
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)
    # The log_loss scorer is negated (greater is better), hence -logscore.
    logscore = SCORERS['log_loss'](clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)
    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = SCORERS['roc_auc'](clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, SCORERS['roc_auc'], clf, X_test, y_test)
class LogisticRegressionAdaptive(LogisticRegression):
    # LogisticRegression variant that re-tunes C by 5-fold cross-validated
    # grid search on every fit() call and delegates all prediction methods
    # to the tuned inner estimator.
    # NOTE(review): uses the old sklearn API (StratifiedKFold(y=..., n_folds=...));
    # presumably written for scikit-learn < 0.18 -- confirm the pinned version.
    def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None):
        super(LogisticRegressionAdaptive, self).__init__(
            penalty=penalty, dual=dual, tol=tol, C=C,
            fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
            class_weight=class_weight, random_state=random_state)
        # Inner estimator that fit() re-tunes; the C=10 here is only a
        # placeholder overwritten by the grid search.
        self.clf = LogisticRegression(
            penalty=penalty, dual=dual, tol=tol, C=10,
            fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
            class_weight=class_weight, random_state=random_state)
        # History of the C value chosen by each fit() call.
        self.c_average = []

    def fit(self, X, y):
        # Stratified 5-fold CV over a log-spaced C grid (10**-2 .. 10**2).
        kcv = StratifiedKFold(y=y, n_folds=5, shuffle=False, random_state=None)
        # Set the parameters by cross-validation
        tuned_parameters = [{'C': [pow(10, x) for x in range(-2, 3)]}]  # [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        score = 'accuracy'
        clf = GridSearchCV(self.clf, tuned_parameters, scoring=score, cv=kcv)
        clf.fit(X, y)
        # Refit the inner estimator on the full data with the best C found.
        self.clf.C = clf.best_estimator_.C
        self.clf.fit(X, y)
        self.C = clf.best_estimator_.C
        self.c_average.append(self.C)
        # print "best:",
        # Also fit the parent so base-class attributes stay consistent.
        super(LogisticRegressionAdaptive, self).fit(X, y)
        return self

    def get_c_ave(self):
        # Return the list of C values chosen across fit() calls.
        # import numpy as np
        return self.c_average

    def predict(self, X):
        # Delegate to the tuned inner estimator.
        return self.clf.predict(X)

    def predict_proba(self, X):
        # Delegate to the tuned inner estimator.
        return self.clf.predict_proba(X)

    def decision_function(self, X):
        # Delegate to the tuned inner estimator.
        return self.clf.decision_function(X)

    def transform(self, X, threshold=None):
        # Delegate to the tuned inner estimator.
        return self.clf.transform(X, threshold=threshold)

    def __repr__(self):
        return "%s - %s" % (self.__class__.__name__, self.clf)
def lr_classify(self):
    # Fit a logistic-regression model on self.descr/self.target and dump
    # its diagnostics: test accuracy, coefficients, intercept, decision
    # scores, probabilities and the transformed training matrix.
    print "Logistic Regression"
    clf = LogisticRegression()
    clf.fit(self.descr, self.target)
    # Mean accuracy on the held-out test split.
    # NOTE(review): "%3f" is likely a typo for "%.3f" (field width, not precision).
    mean = clf.score(self.test_descr, self.test_target)
    print "Mean : %3f" % mean
    print "Coefficients ", clf.coef_
    print "Intercept ", clf.intercept_
    print "Confidence Score ", clf.decision_function(self.descr)
    print "Predict Probability ", clf.predict_proba(self.descr)
    print "Transform ", clf.transform(self.descr)
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    # Scorer, decision_function and predict_proba routes must agree on AUC.
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)
    # neg_log_loss is the negated log loss (greater is better).
    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)
    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)
    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)
    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)
    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
def train_custom_one_vs_all(X_train,X_test,Y_train,topk): #convert matrix to row for efficient splicing Y_train = Y_train.tocsc() tag_classifiers = [] num_training,numclasses = Y_train.shape num_test_examples = X_test.shape[0] # hold a vector mxk, containing top k prediction classes for each example, maintain m heaps for that num_examples = X_test.shape[0] num_classes = len(tag_classifiers) topk_class_distances = [] for i in xrange(num_examples): heap = [] topk_class_distances += [heap] for j in xrange(numclasses): # train on each class label for all the training examples y = numpy.ravel(Y_train.getcol(j).todense()); clf = LogisticRegression(penalty='l2',dual=False,tol=0.0001,C=0.8,fit_intercept=True,intercept_scaling=1) clf.fit(X_train,y); print "Trained for class",j # get the decision for all test examples decision = clf.decision_function(X_test) # for each test example add its decision value to the heap of top k decision values for i in xrange(num_test_examples): h = topk_class_distances[i] if len(h) < topk: heapq.heappush(h,(decision[i],j)) else: heapq.heappushpop(h,(decision[i],j)) print "Predicted for class",j #clean the decision values and store the class labels class_label_indices = [] for i in xrange(num_examples): topk_labels = [label for dist,label in topk_class_distances[i]] class_label_indices += [topk_labels] return class_label_indices
def main(): fpath_train = "/Users/archana/Desktop/PhD/Code/PrivacyAlert/data/CurrentProcessingFiles/FinalProductionFiles/TestTrainData/MaritalTrainData.txt" bunch_train = bunchcreator.LoadFileAsBunch(fpath_train, ["NoMarital", "Marital"]) fpath_test = "/Users/archana/Desktop/PhD/Code/PrivacyAlert/data/CurrentProcessingFiles/FinalProductionFiles/TestTrainData/MaritalTestData.txt" bunch_test = bunchcreator.LoadFileAsBunch(fpath_test, ["NoMarital", "Marital"]) print("Done with Bunching"); count_vect = CountVectorizer() X_train_counts = count_vect.fit_transform(bunch_train.data) tfidf_transformer = TfidfTransformer() X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) X_test_counts = count_vect.transform(bunch_test.data) X_test_tfidf = tfidf_transformer.transform(X_test_counts) print "Done with TFIDF" clf = LogisticRegression() clf.fit(X_train_tfidf, bunch_train.target) preds_int = clf.predict(X_test_tfidf) preds_float = clf.decision_function(X_test_tfidf) y_true = np.array(bunch_test.target) filepath = "/Users/archana/Desktop/PhD/Code/PrivacyAlert/data/CurrentProcessingFiles/FinalProductionFiles/OutputFiles/GT_Pred.txt" fw = open(filepath, 'w') for i in range(len(bunch_test.target)): fw.write(str(bunch_test.target[i])+":"+str(preds_int[i])+":"+str(preds_float[i])+"\n") fpr, tpr, _ = metrics.roc_curve(y_true, preds_float) plt.figure() plt.plot(fpr, tpr, label='ROC curve ' ) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic example') plt.legend(loc="lower right") plt.show()
(y_p_cv_train, y_purturb_cv_train), axis=0) scaler = StandardScaler().fit(X_cv) X_cv_transformed = scaler.transform(X_cv) X_pu_cv_test_transformed = scaler.transform( X_pu_cv_test) clf = LogisticRegression(penalty="l2", C=c, class_weight={ -1: 1, 1: r }, random_state=i) clf.fit(X_cv_transformed, y_cv) scores = clf.decision_function( X_pu_cv_test_transformed) #print("scores.shape:", scores.shape) #next: accumulate the score and count properly, that's why we use ShuffleSplit accScores[test_bstrp_index] += scores timesClassified[test_bstrp_index] += 1 #print("Log: finished %d/%d, time elapsed: %.2f" %(i, T, elapsed_time) ) nUnclassified = np.sum(timesClassified == 0) avgScores = accScores / timesClassified orderAvgScores = np.argsort( -avgScores) #sort in descent order topNIndex = orderAvgScores[:topN] truePosIndex = np.array( range(y_p_cv_val.shape[0]) ) #they are the firstN rows in the concatenated validation set truePosRecall = np.intersect1d(topNIndex, truePosIndex,
# In order to do this, we can lower the threshold for predicting class 1. # This will reduce our false negative rate to 0, but at the expense of our false positive rate. Y_pp['pred_class_thresh10'] = [1 if x >= 0.10 else 0 for x in Y_pp.class_1_pp.values] print(Y_pp.iloc[0:10]) from sklearn.metrics import roc_curve, auc import matplotlib.pyplot as plt # plt.style.use('seaborn-white') %matplotlib inline Y_score = logreg.decision_function(X_test) FPR = dict() TPR = dict() ROC_AUC = dict() # For class 1, find the area under the curve FPR[1], TPR[1], _ = roc_curve(Y_test, Y_score) ROC_AUC[1] = auc(FPR[1], TPR[1]) # Plot of a ROC curve for class 1 (has_cancer) plt.figure(figsize=[11,9]) plt.plot(FPR[1], TPR[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1], linewidth=4) plt.plot([0, 1], [0, 1], 'k--', linewidth=4) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05])
# NOTE(review): script fragment -- `products` is defined earlier in the file.
train_data, test_data = products.random_split(.8, seed=1)
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
print(test_matrix[0])
model = LogisticRegression()
model.fit(train_matrix, train_data['sentiment'])
sample_test_matrix = vectorizer.transform(['ammazing wow wow'])
print(sample_test_matrix)
model.decision_function(sample_test_matrix)
from sframe import SArray

def my_predictions(model, test_matrix):
    # Sign of the decision value: >= 0 maps to +1, otherwise -1.
    return SArray([+1 if s >= 0 else -1 for s in model.decision_function(test_matrix)])

print (my_predictions(model, sample_test_matrix))
print (SArray(model.predict(sample_test_matrix)))
#import pickle
#pickle.dumps(model)
# Persist the fitted model and vocabulary for later reuse.
from sklearn.externals import joblib
joblib.dump(model, 'yelp_model.pkl')
joblib.dump(vectorizer.vocabulary_, 'yelp_vocabulary_.pkl')
from sklearn import metrics from itertools import cycle if __name__ == '__main__': np.random.seed(0) pd.set_option('display.width', 300) np.set_printoptions(suppress=True, linewidth=200) n = 300 x = np.random.randn(n, 50) y = np.array([0] * 100 + [1] * 100 + [2] * 100) n_class = 3 print('Before = \n', y) clf = LogisticRegression(penalty='l2', C=1) clf.fit(x, y) y_score = clf.decision_function(x) y = label_binarize(y, classes=np.arange(n_class)) print('After = \n', y) colors = cycle('gbc') fpr = dict() tpr = dict() auc = np.empty(n_class + 2) mpl.rcParams['font.sans-serif'] = 'SimHei' mpl.rcParams['axes.unicode_minus'] = False plt.figure(figsize=(7, 6), facecolor='w') for i, color in zip(np.arange(n_class), colors): fpr[i], tpr[i], thresholds = metrics.roc_curve(y[:, i], y_score[:, i]) auc[i] = metrics.auc(fpr[i], tpr[i]) plt.plot(fpr[i], tpr[i], c=color,
# NOTE(review): fragment -- coefs, test_data, vectorizer, sentiment_model,
# test_matrix and operator are defined earlier in the file.
count = 0
for co in coefs:
    if co >= 0:
        count += 1
print "Number of non negative coeffs ", count

sample_test_data = test_data[10:13]
print sample_test_data

def probability(score):
    # Logistic sigmoid: maps a decision score to a probability in (0, 1).
    return (1 / (1 + numpy.exp(-score)))

sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print scores
print sentiment_model.predict(sample_test_matrix)

# Rank all test reviews by decision score: highest = most positive.
test_set_scores = sentiment_model.decision_function(test_matrix)
names = test_data["name"]
name_predictions = dict(zip(names, test_set_scores))
sorted_reviews = sorted(name_predictions.items(), key=operator.itemgetter(1), reverse=True)
most_positive_reviews = sorted_reviews[:20]
print most_positive_reviews
# Last 21 entries in reverse order, i.e. the most negative reviews.
most_negative_reviews = sorted_reviews[-1:-22:-1]
print most_negative_reviews
def runLogis(label, Xdata, ydata, XNoLabel, testcase, debug):
    # Fit a LogisticRegression on (Xdata, ydata), print a battery of
    # diagnostics, and -- when testcase == 1 -- predict labels for XNoLabel.
    #
    # label    -- banner text printed at the top
    # Xdata    -- training features
    # ydata    -- training labels
    # XNoLabel -- unlabeled samples to score when testcase == 1
    # testcase -- 1 to run prediction on XNoLabel, anything else to skip
    # debug    -- 1 to dump the raw X/y arrays as well
    #
    # NOTE(review): predY (the return value) is only defined inside the
    # testcase == 1 branch; the return is placed there accordingly.
    print("-------------- ")
    print(label)
    print("---------------")
    print("X(shape): ", Xdata.shape)
    print("y(shape): ", ydata.shape)
    if debug == 1:
        print("type(X): ", type(Xdata))
        print("type(y): ", type(ydata))
        print("-------------------------")
    if debug == 1:
        print("X:")
        print(Xdata)
        print("-------------------------")
        print("y:")
        print(ydata)
        print("-------------------------")
    print("-------------------------")
    lr = LogisticRegression(C = 1.0)
    lr.fit(Xdata, ydata)
    print("\n")
    print(lr.fit(Xdata, ydata))
    print("-------------------------")
    print("prediction probabilities of X:")
    print("The returned estimates for all classes are ordered by the label of classes")
    print("number of samples x number of classes (2 if 0-1)")
    lr.predict_proba(Xdata)
    print(lr.predict_proba(Xdata))
    print("-------------------------")
    print("Predict confidence scores for samples.")
    print("The confidence score for a sample is the signed distance of that sample to the hyperplane")
    print("Confidence scores per (sample, class) combination. In the binary case, ")
    print("confidence score for self.classes_[1] where >0 means this class would be predicted.")
    print(" ")
    print(lr.decision_function(Xdata))
    print("-------------------------")
    print("regression coefficients shape[n_classes-1, n_features]")
    #print(lr.coef_)
    print("-------------------------")
    print("params: ")
    print(lr.get_params(deep=True))
    print("-------------------------")
    print("fit_transform: ")
    print("Fits transformer to X and y with optional parameters fit_params and returns a transformed version of X")
    print(lr.fit_transform(Xdata, ydata))
    print("-------------------------")
    print("scores, Returns the mean accuracy on the given data and labels: ")
    print(lr.score(Xdata, ydata))
    print("-------------------------")
    if testcase == 1:
        print("XNoLabel:")
        #print("shape(XNoLabel)", shape.XNoLabel )
        print(XNoLabel)
        predY = lr.predict(XNoLabel)
        print("-------------------------")
        print("Predict class labels for samples in X ")
        print("len(lr.predict(XNoLabel))", len(predY) )
        print("type(lr.predict(XNoLabel))", type(predY) )
        print(" ")
        print("predY:")
        print(predY[0:20])
        return predY
def predictiveModeling():
    # Train a logistic-regression model on a chronological 80/20 split,
    # save "notable" dress images by decision score, and print
    # precision/recall sweeps across the score range.
    #
    # NOTE(review): relies on module-level data, X, y, raw_data,
    # numComponents, N_COMPONENTS_TO_SHOW and makeFolder.  The bare
    # `except:` at the end hides every error (including bugs) behind one
    # message -- consider narrowing it and logging the exception.
    # Python 2 only: uses print statements and tuple-unpacking lambdas.
    print ("training a predictive model...")
    try:
        # split the data into a training set and a test set
        train_split = int(len(data) * 4.0 / 5.0)
        X_train = X[:train_split]
        X_test = X[train_split:]
        y_train = y[:train_split]
        y_test = y[train_split:]
        # if you wanted to use a different model, you'd specify that here
        clf = LogisticRegression(penalty="l2")
        clf.fit(X_train, y_train)
        print "score", clf.score(X_test, y_test)
        # first, let's find the model score for every dress in our dataset
        # raw_data entries look like (cd, grade, filename); sort keys put the
        # requested grade first, then order by decision score p.
        probs = zip(clf.decision_function(X), raw_data)
        prettiest_liked_things = sorted(probs, key=lambda (p, (cd, g, f)): (0 if g == "like" else 1, p))
        prettiest_disliked_things = sorted(probs, key=lambda (p, (cd, g, f)): (0 if g == "dislike" else 1, p))
        ugliest_liked_things = sorted(probs, key=lambda (p, (cd, g, f)): (0 if g == "like" else 1, -p))
        ugliest_disliked_things = sorted(probs, key=lambda (p, (cd, g, f)): (0 if g == "dislike" else 1, -p))
        in_between_things = sorted(probs, key=lambda (p, (cd, g, f)): abs(p))
        # and let's look at the most and least extreme dresses
        cd = zip(X, raw_data)
        least_extreme_things = sorted(cd, key=lambda (x, (d, g, f)): sum([abs(c) for c in x]))
        most_extreme_things = sorted(cd, key=lambda (x, (d, g, f)): sum([abs(c) for c in x]), reverse=True)
        least_interesting_things = sorted(cd, key=lambda (x, (d, g, f)): max([abs(c) for c in x]))
        most_interesting_things = sorted(cd, key=lambda (x, (d, g, f)): min([abs(c) for c in x]), reverse=True)
        directory = "results/notableDresses/"
        makeFolder(directory)
        for i in range(min(N_COMPONENTS_TO_SHOW, numComponents)):
            # Each entry is (score_or_x, (cd, grade, filename)); [1][2] is the filename.
            Image.open(prettiest_liked_things[i][1][2]).save(directory + "prettiest_pretty_" + str(i) + ".png")
            Image.open(prettiest_disliked_things[i][1][2]).save(directory + "prettiest_ugly_" + str(i) + ".png")
            Image.open(ugliest_liked_things[i][1][2]).save(directory + "ugliest_pretty_" + str(i) + ".png")
            Image.open(ugliest_disliked_things[i][1][2]).save(directory + "directoryugliest_ugly_" + str(i) + ".png")
            Image.open(in_between_things[i][1][2]).save(directory + "neither_pretty_nor_ugly_" + str(i) + ".png")
            Image.open(least_extreme_things[i][1][2]).save(directory + "least_extreme_" + str(i) + ".png")
            Image.open(most_extreme_things[i][1][2]).save(directory + "most_extreme_" + str(i) + ".png")
            Image.open(least_interesting_things[i][1][2]).save(directory + "least_interesting_" + str(i) + ".png")
            Image.open(most_interesting_things[i][1][2]).save(directory + "most_interesting_" + str(i) + ".png")
        # and now let's look at precision-recall
        probs = zip(clf.decision_function(X_test), raw_data[train_split:])
        num_dislikes = len([c for c in y_test if c == 1])
        num_likes = len([c for c in y_test if c == 0])
        lowest_score = round(min([p[0] for p in probs]), 1) - 0.1
        highest_score = round(max([p[0] for p in probs]), 1) + 0.1
        INTERVAL = 0.1
        # first do the likes
        score = lowest_score
        while score <= highest_score:
            true_positives = len([p for p in probs if p[0] <= score and p[1][1] == "like"])
            false_positives = len([p for p in probs if p[0] <= score and p[1][1] == "dislike"])
            positives = true_positives + false_positives
            # NOTE(review): divides by `positives` without a zero guard;
            # np.float64 turns 0/0 into nan rather than raising.
            precision = np.float64(1.0 * true_positives) / positives
            recall = np.float64(1.0 * true_positives) / num_likes
            print "likes", score, precision, recall
            score += INTERVAL
        # then do the dislikes
        score = highest_score
        while score >= lowest_score:
            true_positives = len([p for p in probs if p[0] >= score and p[1][1] == "dislike"])
            false_positives = len([p for p in probs if p[0] >= score and p[1][1] == "like"])
            positives = true_positives + false_positives
            precision = np.float64(1.0 * true_positives) / positives
            recall = np.float64(1.0 * true_positives) / num_dislikes
            print "dislikes", score, precision, recall
            score -= INTERVAL
        # now do both
        score = lowest_score
        while score <= highest_score:
            likes = len([p for p in probs if p[0] <= score and p[1][1] == "like"])
            dislikes = len([p for p in probs if p[0] <= score and p[1][1] == "dislike"])
            print score, likes, dislikes
            score += INTERVAL
    except:
        print ("the model could not be trained.")
# NOTE(review): fragment -- train_data, test_data, sframe and sigmoid are
# defined earlier in the file.
vectorizer = CountVectorizer(token_pattern=r"\b\w+\b")
train_matrix = vectorizer.fit_transform(train_data["review_clean"])
test_matrix = vectorizer.transform(test_data["review_clean"])
words = vectorizer.get_feature_names()
# Create a logistic regression model
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data["sentiment"])
# Create a SFrame with words and their corresponding coefficient
sentiment_model_coef_table = sframe.SFrame({"word": words, "coefficient": sentiment_model.coef_.flatten()})
# Sanity check using some sample data
sample_test_data = test_data[10:13]
sample_test_matrix = vectorizer.transform(sample_test_data["review_clean"])
sample_test_scores = sentiment_model.decision_function(sample_test_matrix)
sample_test_probabilities = sigmoid(sample_test_scores)
# Apply the logistic regression model on the test matrix
# Compute scores, compute probabilities, compute predicted sentiment
test_scores = sentiment_model.decision_function(test_matrix)
test_probabilities = sigmoid(test_scores)
test_data["probability"] = test_probabilities
test_data["predicted_score"] = test_scores
test_data["predicted_sentiment"] = test_data["predicted_score"].apply(lambda score: +1 if score > 0.0 else -1)
# Sort the test data on the predicted probability
# Get the likely products for the most positive and most negative reviews
# NOTE(review): these two sort results are computed but not assigned --
# presumably intended for interactive inspection.
test_data.sort("probability", ascending=False)["name"][0:20]
test_data.sort("probability", ascending=True)["name"][0:20]
from itertools import cycle

if __name__ == '__main__':
    # Per-class ROC curves for a 3-class logistic regression on random data.
    # Python 2 script; mpl/plt/np/pd/metrics/label_binarize are imported
    # earlier in the file.
    np.random.seed(0)
    pd.set_option('display.width', 300)
    np.set_printoptions(suppress=True, linewidth=200)
    n = 300
    x = np.random.randn(n, 50)
    y = np.array([0]*100+[1]*100+[2]*100)
    n_class = 3
    print 'Before = \n', y
    clf = LogisticRegression(penalty='l2', C=1)
    clf.fit(x, y)
    # One decision score per class for every sample.
    y_score = clf.decision_function(x)
    # One-hot encode the labels so each class gets its own ROC curve.
    y = label_binarize(y, classes=np.arange(n_class))
    print 'After = \n', y
    colors = cycle('gbc')
    fpr = dict()
    tpr = dict()
    auc = np.empty(n_class+2)
    mpl.rcParams['font.sans-serif'] = u'SimHei'
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(7, 6), facecolor='w')
    for i, color in zip(np.arange(n_class), colors):
        fpr[i], tpr[i], thresholds = metrics.roc_curve(y[:, i], y_score[:, i])
        auc[i] = metrics.auc(fpr[i], tpr[i])
        plt.plot(fpr[i], tpr[i], c=color, lw=1.5, alpha=0.7, label=u'AUC=%.3f' % auc[i])
    # micro
    fpr['micro'], tpr['micro'], thresholds = metrics.roc_curve(y.ravel(), y_score.ravel())
def predictive_modeling(raw_data, y):
    # Train a logistic-regression model on a chronological 80/20 split,
    # save "notable" dress images by decision score, and print
    # precision/recall sweeps across the score range.
    #
    # raw_data -- sequence of (cd, grade, image_url) records
    # y        -- labels aligned with raw_data (0 = like, 1 = dislike,
    #             judging by the num_likes/num_dislikes counts below)
    #
    # NOTE(review): X, LIKE, DISLIKE, open_image_from_url and make_folder
    # come from elsewhere in the file.  Python 2 only (print statements,
    # tuple-unpacking lambdas).
    print("logistic regression...")
    directory = "results/notableDresses/"
    make_folder(directory)
    # split the data into a training set and a test set
    train_split = int(len(raw_data) * 4.0 / 5.0)
    x_train = X[:train_split]
    x_test = X[train_split:]
    y_train = y[:train_split]
    y_test = y[train_split:]
    # if you wanted to use a different model, you'd specify that here
    clf = LogisticRegression(penalty='l2')
    clf.fit(x_train, y_train)
    print "score", clf.score(x_test, y_test)
    # first, let's find the model score for every dress in our dataset
    probs = zip(clf.decision_function(X), raw_data)
    prettiest_liked_things = sorted(probs, key=lambda (p, (cd, g, f)): (0 if g == LIKE else 1, p))
    prettiest_disliked_things = sorted(probs, key=lambda (p, (cd, g, f)): (0 if g == DISLIKE else 1, p))
    ugliest_liked_things = sorted(probs, key=lambda (p, (cd, g, f)): (0 if g == LIKE else 1, -p))
    ugliest_disliked_things = sorted(probs, key=lambda (p, (cd, g, f)): (0 if g == DISLIKE else 1, -p))
    in_between_things = sorted(probs, key=lambda (p, (cd, g, f)): abs(p))
    # and let's look at the most and least extreme dresses
    cd = zip(X, raw_data)
    least_extreme_things = sorted(cd, key=lambda (x, (d, g, f)): sum([abs(c) for c in x]))
    most_extreme_things = sorted(cd, key=lambda (x, (d, g, f)): sum([abs(c) for c in x]), reverse=True)
    least_interesting_things = sorted(cd, key=lambda (x, (d, g, f)): max([abs(c) for c in x]))
    most_interesting_things = sorted(cd, key=lambda (x, (d, g, f)): min([abs(c) for c in x]), reverse=True)
    for i in range(10):
        # Each entry is (score_or_x, (cd, grade, url)); [1][2] is the image URL.
        open_image_from_url(prettiest_liked_things[i][1][2]).save(directory + "prettiest_pretty_" + str(i) + ".png")
        open_image_from_url(prettiest_disliked_things[i][1][2]).save(directory + "prettiest_ugly_" + str(i) + ".png")
        open_image_from_url(ugliest_liked_things[i][1][2]).save(directory + "ugliest_pretty_" + str(i) + ".png")
        open_image_from_url(ugliest_disliked_things[i][1][2]).save(
            directory + "directoryugliest_ugly_" + str(i) + ".png")
        open_image_from_url(in_between_things[i][1][2]).save(directory + "neither_pretty_nor_ugly_" + str(i) + ".png")
        open_image_from_url(least_extreme_things[i][1][2]).save(directory + "least_extreme_" + str(i) + ".png")
        open_image_from_url(most_extreme_things[i][1][2]).save(directory + "most_extreme_" + str(i) + ".png")
        open_image_from_url(least_interesting_things[i][1][2]).save(directory + "least_interesting_" + str(i) + ".png")
        open_image_from_url(most_interesting_things[i][1][2]).save(directory + "most_interesting_" + str(i) + ".png")
    # and now let's look at precision-recall
    probs = zip(clf.decision_function(x_test), raw_data[train_split:])
    num_dislikes = len([c for c in y_test if c == 1])
    num_likes = len([c for c in y_test if c == 0])
    lowest_score = round(min([p[0] for p in probs]), 1) - 0.1
    highest_score = round(max([p[0] for p in probs]), 1) + 0.1
    INTERVAL = 0.1
    # first do the likes
    score = lowest_score
    while score <= highest_score:
        true_positives = len([p for p in probs if p[0] <= score and p[1][1] == LIKE])
        false_positives = len([p for p in probs if p[0] <= score and p[1][1] == DISLIKE])
        positives = true_positives + false_positives
        if positives > 0:
            precision = 1.0 * true_positives / positives
            recall = 1.0 * true_positives / num_likes
            print "likes", score, precision, recall
        score += INTERVAL
    # then do the dislikes
    score = highest_score
    while score >= lowest_score:
        true_positives = len([p for p in probs if p[0] >= score and p[1][1] == DISLIKE])
        false_positives = len([p for p in probs if p[0] >= score and p[1][1] == LIKE])
        positives = true_positives + false_positives
        if positives > 0:
            precision = 1.0 * true_positives / positives
            recall = 1.0 * true_positives / num_dislikes
            print "dislikes", score, precision, recall
        score -= INTERVAL
    # now do both
    score = lowest_score
    while score <= highest_score:
        likes = len([p for p in probs if p[0] <= score and p[1][1] == LIKE])
        dislikes = len([p for p in probs if p[0] <= score and p[1][1] == DISLIKE])
        print score, likes, dislikes
        score += INTERVAL
### text feature clf_t = LinearSVC(C=0.04) clf_t.fit(Xtr_t[:,:,thres-1],ytrain) ### audio feature clf_a = LogisticRegression(C=0.001) clf_a.fit(Xtr_a[:,:,thres-1],ytrain) ### video feature clf_v = SVC(gamma=0.001,C=10) clf_v.fit(Xtr_v[:,:,thres-1],ytrain) ypr_a = clf_a.predict(Xts_a[:,:,thres-1]) yscore_a = clf_a.decision_function(Xts_a[:,:,thres-1]) ypr_v = clf_v.predict(Xts_v[:,:,thres-1]) yscore_v = clf_v.decision_function(Xts_v[:,:,thres-1]) ypr_t = clf_t.predict(Xts_t[:,:,thres-1]) """ ### feature for fusion ### audio RL feature clf_a = LogisticRegression(C=0.001) ypreds_a, yscores_a, ytests_a = UncertaintyStats(Xtr_a,ytrain,pred_sec_lst,clf_a) ### video RL feature clf_v = SVC(gamma=0.001,C=10) ypreds_v, yscores_v, ytests_v = UncertaintyStats(Xtr_v,ytrain,pred_sec_lst,clf_v) Xfuse = np.c_[ypreds_a,yscores_a,ypreds_v,yscores_v] yfuse = ytests_v clf_fuse = LinearSVC(C=1)
# NOTE(review): fragment -- vectorizer, train_data/test_data,
# prob_prediction, test_index and products come from earlier in the file.
train_matrix = vectorizer.fit_transform(train_data["review_clean"])
test_matrix = vectorizer.transform(test_data["review_clean"])
#train LogisticRegression model
train_y = train_data["sentiment"]
model = LogisticRegression()
model.fit(train_matrix, train_y)
coefficients = model.coef_
#Quiz question: How many weights are >= 0?
num = len([weight for weight in coefficients[0] if weight >= 0])
print "Number of positive weights: %d"%num, "\n"
#test prediction for 3 sample dataset
sample_test_data = test_data[10:13]
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = model.decision_function(sample_test_matrix)
#Quiz question: Of the three data points in sample_test_data, which one
#(first, second, or third) has the lowest probability of being classified
#as a positive review?
probs = prob_prediction(scores)
print "The probabilities: ", probs, "\n"
#find the 20 reviews in the entire test_data with the highest probability
#of being classified as a positive review.
scores = model.decision_function(test_matrix)
probs = prob_prediction(scores)
# Pair each test index with its probability, sort ascending, keep the top 20.
index_probs = zip(test_index, probs)
index_probs.sort(key = lambda x: x[1])
most_positive_reviews_index = [a for (a, b) in index_probs[-20:]]
most_positive_reviews = products.iloc[most_positive_reviews_index, 0]
print "Most positive reviews: ", most_positive_reviews, "\n"
# Toy SVC example: three 3-D points, two classes.
from sklearn import svm
X = [[0, 0,0.5], [1, 1,1.5],[2,3,4]]
y = [0,1,1]
clf = svm.SVC()
clf.fit(X,y)

#_*_coding:utf-8-*
# Second SVC example with three classes; print decision values for one sample.
from sklearn import svm
X = [[0, 0,0.5], [1,1,1.5],[2,3,4],[0,0,0.4]]
y = [0,1,2,0]
clf = svm.SVC()
clf.fit(X,y)
dec = clf.decision_function([[1,1,1.5]])
print dec

# LDA example on 2-D points with three classes.
import numpy as np
from sklearn.lda import LDA
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2],[4,3]])
y = np.array([1, 1, 1, 2, 2, 3,3])
clf = LDA()
clf.fit(X, y)
# NOTE(review): this expression constructs an unused LDA instance -- likely
# leftover from a docs example.
LDA(n_components=None, priors=None)
print(clf.predict([[-0.8, -1],[3,4]]))

# SIFT algorithm
import numpy as np
import cv2
from matplotlib import pyplot as plt
# In[34]: predict_y=clf.predict(test_x) # In[35]: #预测样本的置信分数 # In[36]: score_y=clf.decision_function(test_x) # In[37]: #计算混淆矩阵,并显示 # In[38]: cm=confusion_matrix(test_y,predict_y) # In[39]:
# 분류 연습용 샘플 데이터 작성 x, y = make_classification(n_samples=16, n_features=2, n_informative=2, n_redundant=0, random_state=0) # n_samples 표본데이터수 n_features 독립변수의 수 n_informative 독립 변수 중 종속 변수와 상관 관계가 있는 성분의 수 n_redundant : 독립 변수 중 다른 독립 변수의 선형 조합으로 나타나는 성분의 수 random_state 난수고정 # print(x) #[[ 2.03418291 -0.38437236] [ 4.06377686 0.17863836] ... # print(y) #[0 1 0 1 1 0 0 0 1 0 1 0 1 1 0 1] 실제값 model = LogisticRegression().fit(x, y) y_hat = model.predict(x) print('y_hat :', y_hat) #예측값 f_value = model.decision_function(x) # 결정(판별)함수, 불확실성 측정 함수 print(f_value) #[ 0.37829565 1.6336573 -1.42938156 1.21967832 .... print() df = pd.DataFrame(np.vstack([f_value, y_hat, y]).T, columns=['f', 'yhat', 'y']) print(df) # 0보다 작으면 0, 0보다 크면 1로 예측한 것 yhat print() # ROC 커브 from sklearn.metrics import confusion_matrix print(confusion_matrix(y, y_hat, labels=[1, 0])) recall = 7 / ( 7 + 1 ) # 민감도=재현율=참 양성 비율(TPR) = TP / TP+FN :정답이 Positive인 것들 중에서 정말로 정답을 맞춘 수의 비율 -> y축이된다. fallout = 1 / (
#print(X_train,y_train) from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score log = LogisticRegression() log.fit(X_train, y_train) predict_y_test = log.predict(X_test) print(predict_y_test) print(accuracy_score(y_test, predict_y_test)) #confussion matrix from sklearn.metrics import confusion_matrix con = confusion_matrix(y_test, predict_y_test) print(con) predict_prob_y_test = log.decision_function(X_test) # keep probabilities for the positive outcome only (or) both can be used same resukts #predict_prob_y_test = log.predict_proba(X_test) #predict_prob_y_test =predict_prob_y_test[:, 1] #data for the roc curve from sklearn.metrics import roc_curve from sklearn.metrics import roc_auc_score false_positive_rate, true_positive_rate, threshold = roc_curve( y_test, predict_prob_y_test) #plot the roc-auc curve plt.plot(false_positive_rate, true_positive_rate) plt.plot([0, 1], [0, 1]) plt.xlabel('Flase positive rate') plt.ylabel('True pasitive rate') plt.xlim(0.0, 1.0)
# --- Confusion-matrix labeling + ROC curves for a logistic-regression run ---
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')
# NOTE(review): the line below is a bare expression with no effect — it was
# probably meant to be appended to the title string on the next line.
'Accuracy Score: {0}'.format(lrc_acc_score_test)
ax.set_title('CC Testing Data CM Using Logistic Regression', size=15)
plt.show()
print('LR Metrics Class Wise')
report = classification_report(y_test, lbl_predictions_test)
print(report)
print("Logistic Regression Training Data accuracy {0:.2f}".format( lrc_acc_score_train))
print("Logistic Regression Testing Data accuracy {0:.2f}".format( lrc_acc_score_test))
# ROC inputs: decision scores for both train and test splits.
y_score_test = clf.decision_function(X_test)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_score_test)
y_score_train = clf.decision_function(X_train)
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_score_train)
#Plotting ROC Curve
plt.figure()
lw = 2
plt.plot(fpr_train, tpr_train, color='darkorange', lw=lw, label='train')
plt.plot(fpr_test, tpr_test, color='navy', lw=lw, linestyle='--', label='test')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve LR')
# --- HTML bar-chart row + qualitative shirt ranking (Python 2 fragment) ---
# Positive pct draws a bar to the right of center; negative to the left.
html += '<tr>'
if pct >= 0:
    html += '<td width="50%"></td><td width="50%"><div style="width:' + width + ';background-color:rgb(' + str(r) + "," + str(g) + "," + str(b) + ')">' + str(pct) + "</td></div>"
else:
    html += '<td width="50%"><div style="text-align:right;float:right;width:' + width + ';background-color:rgb(' + str(r) + "," + str(g) + "," + str(b) + ')">' + str(pct) + '</td></div><td width="50%"></td>'
html += '</tr>'
html += "</table></body></html>"
f = open("html.html","w")
f.write(html)
f.close()
# and now some qualitative results
# first, let's find the model score for every shirt in our dataset
# NOTE(review): the `lambda (p,(cd,g,f)):` tuple-parameter syntax below is
# Python 2 only (removed in Python 3).  Each entry of probs pairs a decision
# score p with a data record (cd, g, f) — presumably (color data, gender
# label, feature); verify against how `data` is built.
probs = zip(clf.decision_function(X),data)
girliest_girl_shirt = sorted(probs,key=lambda (p,(cd,g,f)): (g,p))[0]
girliest_boy_shirt = sorted(probs,key=lambda (p,(cd,g,f)): (-g,p))[0]
boyiest_girl_shirt = sorted(probs,key=lambda (p,(cd,g,f)): (g,-p))[0]
boyiest_boy_shirt = sorted(probs,key=lambda (p,(cd,g,f)): (-g,-p))[0]
most_androgynous_shirt = sorted(probs,key=lambda (p,(cd,g,f)): abs(p))[0]
blandest = sorted(probs,key=lambda (p,(cd,g,f)): sum(cd))[0]
coloriest = sorted(probs,key=lambda (p,(cd,g,f)): -sum(cd))[0]
# and now let's look at precision-recall
probs = zip(clf.decision_function(X_test),data[train_split:])
num_boys = len([c for c in y_test if c == 1])
num_girls = len([c for c in y_test if c == 0])
# Score range padded by 0.1 on each side for threshold sweeping.
lowest_score = round(min([p[0] for p in probs]),1) - 0.1
highest_score = round(max([p[0] for p in probs]),1) + 0.1
# %% print(accuracy_score(y_test, y_lr_pred)) print(precision_score(y_test, y_lr_pred)) print(recall_score(y_test, y_lr_pred)) print(f1_score(y_test, y_lr_pred)) # %% # %% compute_plot_grid_coords(x, 2) # %% lr_clf.predict(compute_plot_grid_coords(x, 2)) # %% lr_clf.decision_function(compute_plot_grid_coords(x, 2)) # %% lr_clf.decision_function(compute_plot_grid_coords(x, 2)) > -3 # noqa # %% lr_clf.predict_proba(compute_plot_grid_coords(x, 2)) # %% lr_clf.predict_proba(compute_plot_grid_coords(x, 2))[:, 1] # %% grid_x, grid_y = compute_plot_grid(x, 0.02) grid_x.shape, grid_y.shape # %%
Created on May 27, 2012

@author: sijin
'''
import numpy as np
# NOTE(review): Scaler was renamed StandardScaler in later scikit-learn
# releases — this import only works on old versions; confirm the pin.
from sklearn.preprocessing import Scaler
from sklearn.linear_model import LogisticRegression

# Parse the training file: first line is "N M", then one sample per line as
# "<id> <+1/-1> <idx>:<value> ...". Labels are stored as +1 / -1.
with open('../../data/quora/input00.txt') as f:
    mn = f.readline().split(' ')
    N, M = int(mn[0]), int(mn[1])
    print 'M, N = {}, {}'.format(M, N)
    X = np.zeros((N, M))
    Y = np.zeros(N, np.int)
    for row in range(N):
        training_data = f.readline().strip().split(' ')
        Y[row] = 1 if training_data[1] == '+1' else -1
        for col in range(2, 2+M):
            # Each feature token looks like "idx:value"; keep the value.
            X[row, col-2] = training_data[col].split(':')[1]

# Standardize, then fit an L1-regularized logistic regression.
X = Scaler().fit_transform(X)
clf_l1_LR = LogisticRegression(C=0.01, penalty='l1', tol=0.01)
clf_l1_LR.fit(X, Y)
print clf_l1_LR.decision_function(X)

if __name__ == '__main__':
    pass
fillstyle="none", c='k', mew=2)  # tail of a plot call that begins above this chunk
# --- Precision-recall curve, then ROC with the zero-threshold point marked ---
ax.plot(precision, recall, label='precision recall curve')
ax.set_xlabel('Precision')
ax.set_ylabel('Recall')
#ax.set_aspect(1)
#ax.axis([0,1,0,1])
'''
ROC - Receiver operating characteristic
'''
from sklearn.metrics import roc_curve
# drop_intermediate=False keeps every threshold point on the curve.
fpr, tpr, threshold = roc_curve(y_test, logreg.decision_function(X_test), drop_intermediate=False)
plt.plot(fpr, tpr, label='ROC Curve')
plt.xlabel('FPR')
plt.ylabel('TPR')
# Mark the point whose threshold is closest to zero — the default decision
# boundary of decision_function.
close_zero = np.argmin(np.abs(threshold))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10, label='threshold', fillstyle='none', c='k', mew=2)
#Calculate the number of positive (>= 0, nonnegative) coeffs cntnonneg=np.sum(sentiment_model.coef_>=0)+np.sum(sentiment_model.intercept_ >0) #Making predictions with logistic regression #Take the 11th, 12th, and 13th data points in the test data and save them to #sample_test_data sample_test_data = test_data.iloc[10:13] sample_test_data.iloc[0]['review'] sample_test_data.iloc[1]['review'] #The sentiment_model should predict +1 if the sentiment is positive #-1 if the sentiment is negative sample_test_matrix = vectorizer.transform(sample_test_data['review_clean']) #calculate the score of each data point with decision_function() scores = sentiment_model.decision_function(sample_test_matrix) #WTransh(X) print (scores) #Prediciting Sentiment #make class predictions from scores def predictions(scores): """ make class predictions """ preds = [] for score in scores: if score > 0: pred = 1 else: pred = -1 preds.append(pred) return preds
def regressionModel(df_sig_train, df_bkg_train, df_sig_test, df_bkg_test):
    """Train and evaluate a logistic-regression signal/background classifier.

    Signal rows are labeled 1, background rows 0.  Prints training/testing
    metrics, scores a hand-coded user input, and hands decision scores to
    visualSigBkg for plotting.  Python 2 (print statements).

    Returns (model, X, y, result, train_score, error_test, h_test_s, h_test_b).
    """
    # Reminder:
    # LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
    # intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
    #df_sig_train['X1X2'] = df_sig_train['PTS']*df_sig_train['AST']
    #df_sig_train['X1X1'] = df_sig_train['PTS']*df_sig_train['PTS']
    #df_sig_train['X2X2'] = df_sig_train['AST']*df_sig_train['AST']
    #df_bkg_train['X1X2'] = df_bkg_train['PTS']*df_bkg_train['AST']
    #df_bkg_train['X1X1'] = df_bkg_train['PTS']*df_bkg_train['PTS']
    #df_bkg_train['X2X2'] = df_bkg_train['AST']*df_bkg_train['AST']
    # '---------- Prepare Training ----------'
    # Stack signal (label 1) on top of background (label 0).
    X_sig = np.array(df_sig_train)
    y_sig = np.array(X_sig.shape[0] * [1])
    X_bkg = np.array(df_bkg_train)
    y_bkg = np.array(X_bkg.shape[0] * [0])
    X = np.concatenate((X_sig, X_bkg))
    y = np.concatenate((y_sig, y_bkg))
    print 'X_sig.shape: ', X_sig.shape
    print 'y_sig.shape: ', y_sig.shape
    print 'X_bkg.shape: ', X_bkg.shape
    print 'y_bkg.shape: ', y_bkg.shape
    print 'X.shape: ', X.shape
    print 'y.shape: ', y.shape
    # '---------- Prepare Testing ----------'
    X_sig_test = np.array(df_sig_test)
    y_sig_test = np.array(X_sig_test.shape[0] * [1])
    X_bkg_test = np.array(df_bkg_test)
    y_bkg_test = np.array(X_bkg_test.shape[0] * [0])
    X_test = np.concatenate((X_sig_test, X_bkg_test))
    y_test = np.concatenate((y_sig_test, y_bkg_test))
    print 'X_sig_test.shape: ', X_sig_test.shape
    print 'y_sig_test.shape: ', y_sig_test.shape
    print 'X_bkg_test.shape: ', X_bkg_test.shape
    print 'y_bkg_test.shape: ', y_bkg_test.shape
    print 'X_test.shape: ', X_test.shape
    print 'y_test.shape: ', y_test.shape
    #C = 10.0 ** np.arange(-10, 10)
    #for c in C:
    # print c
    # '---------- Model ----------'
    # first way of doing preprocessing
    #X = preprocessing.scale(X)
    # second way of doing preprocessing
    # Fit the scaler on training data only; reuse it for test/user input.
    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    model = LogisticRegression(C=1000, penalty='l1')
    model.fit(X, y)
    print '---------- Training/Testing info ----------'
    print 'Accuracy (training): ', model.score(X, y)
    print 'Null Error Rate (training): ', y.mean()
    X_test = scaler.transform(X_test)
    predicted_test = model.predict(X_test)
    # predicted + actual == 1 exactly on mismatches (0+1 or 1+0), so
    # counting 1s gives the misclassification rate.
    predicted_test_clever = (predicted_test + y_test).tolist()
    error_test = float(predicted_test_clever.count(1)) / float(len(predicted_test_clever))
    print "Error: ", error_test
    print "Accuracy (testing): ", metrics.accuracy_score(y_test, predicted_test)
    print "Recall (testing): ", metrics.recall_score(y_test, predicted_test)
    print "F1 score (testing): ", metrics.f1_score(y_test, predicted_test)
    print "ROC area under curve (testing): ", metrics.roc_auc_score(y_test, predicted_test)
    #'PTS','AST','REB','STL','BLK','FG_PCT','FG3_PCT','FT_PCT','MIN','EFF','WL'
    # NOTE(review): a 1-D array is passed to scaler.transform — newer
    # scikit-learn requires a 2-D (1, n_features) array; confirm the version.
    user_input = scaler.transform(np.array([10, 1, 2, 0, 2, 0.3, 0.3, 0.3, 10, 5, 1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2,2,2,2,2,2,2,2,1], dtype=float))
    #user_input = scaler.transform(np.array([10,1,2], dtype=float))
    print 'Score (user input): ', model.decision_function(user_input)
    result = model.predict_proba(user_input)
    print 'Probability of 1 (user input): ', result
    # 3A. Examine the coefficients
    #print "Coefficients: ", pd.DataFrame(zip(X, np.transpose(model.coef_)))
    # 3B. Calculating Error
    #predicted_train = model.predict(X)
    #print predicted_train
    #predicted_train_clever = (predicted_train + y).tolist()
    #error = float(predicted_train_clever.count(1)) / float(len(predicted_train_clever))
    #print "Error: ", error_train
    # 4. Cross-validation
    #scores = cross_val_score(LogisticRegression(), X , y, 'accuracy', 4)
    #print "Cross-validation: ", scores
    #print "Cross-validation mean: ", scores.mean()
    # '--------- Visualization -----------'
    # Split decision scores by true class for the signal/background plots.
    Classifier_training_S = model.decision_function(X[y>0.5]).ravel()
    Classifier_training_B = model.decision_function(X[y<0.5]).ravel()
    Classifier_testing_S = model.decision_function(X_test[y_test>0.5]).ravel()
    Classifier_testing_B = model.decision_function(X_test[y_test<0.5]).ravel()
    (h_test_s, h_test_b) = visualSigBkg("Logistic Regression", Classifier_training_S, Classifier_training_B, Classifier_testing_S, Classifier_testing_B)
    return (model, X, y, result, model.score(X, y), error_test, h_test_s, h_test_b)
# --- Precision/recall threshold sweep over predict_proba outputs ---
recall = recall_score(y_true=test_data['sentiment'].to_numpy(), y_pred=model.predict(test_matrix))
print "Recall on test data: %s" % recall
print model.classes_
# column ordering of output matrix from predict_proba() is the same as output from model.classes_
score_after_sigmoid = pd.DataFrame(model.predict_proba(test_matrix))
# Sweep probability thresholds in [0.5, 1] and record precision/recall.
threshold_values = np.linspace(0.5, 1, num=100)
precision_all = []
recall_all = []
# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0 — these
# calls require an old pandas; .to_numpy() (already used above) is current.
for threshold in threshold_values:
    prediction = apply_threshold(pd.DataFrame(model.predict_proba(test_matrix)[:,1]), threshold)
    precision_all.append(precision_score(y_true=test_data['sentiment'].to_numpy(), y_pred = prediction.as_matrix()[:,0]))
    recall_all.append(recall_score(y_true=test_data['sentiment'].to_numpy(), y_pred = prediction.as_matrix()[:,0]))
# Confusion matrix at a high-precision 0.98 threshold.
prediction_98 = apply_threshold(pd.DataFrame(model.predict_proba(test_matrix)[:,1]), 0.98)
print_confusion_matrix(test_data['sentiment'].to_numpy(), prediction_98.as_matrix()[:,0], model)
# Restrict to baby-product reviews and score them.
baby_reviews = test_data[test_data['name'].apply(lambda x: 'baby' in x.lower())]
baby_matrix = vectorizer.transform(baby_reviews['review_clean'])
# NOTE(review): despite the name, decision_function returns raw margins,
# not probabilities.
probabilities = model.decision_function(baby_matrix)
# --- Binary logistic regression walk-through: Bream vs Smelt (tutorial fragment) ---
# (The ''' blocks below are Korean narrative notes; they are bare string
# expressions with no runtime effect and are left untranslated.)
print(lr.predict(train_bream_smelt[:5]))
print(lr.predict_proba(train_bream_smelt[:5]))
print(lr.classes_) # ['Bream', 'Smelt']  Bream: negative class (0), Smelt: positive class (1)
# Inspect the learned coefficients (weights).
'''
로지스틱 회귀가 학습한 계수(가중치)를 확인해보자
'''
print('계수: ', lr.coef_ ) # [[-0.4037798 -0.57620209 -0.66280298 -1.01290277 -0.73168947]]
print('절편: ', lr.intercept_) # [-2.16155132]
# The learned linear equation producing the margin z; decision_function()
# returns z for the positive class (Smelt).
'''
로지스틱 회귀가 학습한 방정식
-0.404 * weight -0.576 * length -0.633 * diagonal -1.013 * height -0.732 * width -2.162 ==> z
lr의 decision_function()함수를 사용하면 양성클래스(빙어)에 대한 z 값을 계산한다
'''
z = lr.decision_function(train_bream_smelt[:5])
print('z값: ', z)
# Passing z through the sigmoid gives the positive-class probability.
'''
이 z값을 시그모이드 함수에 통과시키면 확률을 얻을 수 있다
'''
s = 1 / (1 + np.exp(-z))
print(s)
# SciPy provides the sigmoid as expit().
'''
파이썬 사이피이 라이브러리에 시그모이드 함수가 있음 => expit()
'''
from scipy.special import expit
print(expit(z))
# Next: multiclass (7 fish species); C controls L2 regularization strength
# (smaller C = stronger regularization).  The string below is unterminated
# because this chunk is truncated.
'''
로지스틱 회귀로 다중 분류하기 (7개의 생선을 분류)
LogisticRegression : L2규제를 함
C 매게변수(기본값:1)를 이용해서 규제함 ==> C값이 작을수록 규제강도기 높아짐
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, f1_score, auc
# --- Binary task: digit 9 vs the rest, tuning the decision threshold ---
digits = datasets.load_digits()
X = digits.data
y = digits.target.copy()
# Relabel: digit 9 -> 1, everything else -> 0 (masks use the original
# digits.target, so the two writes are independent).
y[digits.target == 9] = 1
y[digits.target != 9] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_predict = log_reg.predict(X_test)
print("old f1_score =", f1_score(y_test, y_predict))
decision_score = log_reg.decision_function(X_test)  # raw scores fed into the sigmoid
precisions, recalls, thresholds = precision_recall_curve(y_test, decision_score)
# Find the threshold where precision equals recall and report F1 there.
# NOTE(review): exact float equality may never fire; np.isclose would be a
# safer comparison.
for i in range(thresholds.shape[0]):
    if precisions[i] == recalls[i]:
        y_predict = np.array(decision_score > thresholds[i], dtype=int)
        print("thresholds =", thresholds[i])
        print("new f1_score =", f1_score(y_test, y_predict))
plt.figure("Threshold Precision Recall")
plt.plot(thresholds, precisions[:-1], label="precision") # last thresholds, precisions=1, recalls=0
plt.plot(thresholds, recalls[:-1], label="recall")
plt.legend()
plt.figure("PR")
plt.plot(recalls, precisions)
# --- Hash Alice's words to feature strings, write them out, then evaluate ---
alice_list = []
# NOTE(review): alice_list is never used below; `count` merely duplicates
# len(alice_mapped_num_list).
alice_text = (val_df.iloc[0][6]).split()
count = 0
alice_mapped_num_list = []
for word in alice_text:
    alice_mapped_num = create_bitstring_sha224(word)
    alice_mapped_num_list.append(alice_mapped_num)
    count = count + 1
print("Alice's total features: ", count)
# Write the hashed features space-separated to the Input-P0-0 file.
file1_alice = open(DATA61_ROOT + "Input-P0-0", "w")
for feature in alice_mapped_num_list:
    file1_alice.write(feature + " ")
file1_alice.close()
# Predict the label and get the accuracy using 5-fold cross validation
# NOTE(review): the comment above says 5-fold but cv=10 is used below.
y_predicted = LR_model.predict(X_val)
print("Predicted_label: ", y_predicted)
predicted_dist = LR_model.decision_function(X_val)
print("Predicted_distance: ", predicted_dist)
mean_accuracy = cross_val_score(LR_model, X_train, Y_train, scoring='accuracy', cv=10).mean()
print("mean_cross_val", mean_accuracy)
print("Accuracy: %.2f" % accuracy_score(Y_val, y_predicted))
# --- Relationship between decision_function / predict_proba / predict ---
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42)
gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
gbrt.fit(X_train, y_train)
# decision_function: one column per class in the multiclass case.
print(gbrt.decision_function(X_test).shape)
print(gbrt.decision_function(X_test)[:6, :])
# predict_proba
print(gbrt.predict_proba(X_test)[:6, :])
# Reproduce predict() by taking the argmax over classes.
print(np.argmax(gbrt.decision_function(X_test), axis=1))
print(np.argmax(gbrt.predict_proba(X_test), axis=1))
print(gbrt.predict(X_test))
# Classification with logistic regression on string class names.
logreg = LogisticRegression()
named_target = iris.target_names[y_train]
logreg.fit(X_train, named_target)
print(logreg.classes_)
print(logreg.predict(X_test)[:10])
# argmax over decision scores indexes into classes_ to recover the string
# labels, matching predict().
argmax_dec_func = np.argmax(logreg.decision_function(X_test), axis=1)
print(argmax_dec_func[:10])
print(logreg.classes_[argmax_dec_func][:10])
print( np.all( logreg.classes_[argmax_dec_func][:10] == logreg.predict(X_test)[:10]))
# --- Unregularized logistic regression + formatted confusion matrix ---
# NOTE(review): penalty='none' was removed in scikit-learn 1.2 (use
# penalty=None there) — confirm the version in use.
clr = LogisticRegression(solver="lbfgs", penalty='none', random_state=42)
clr.fit(X, y)
# Output coefficients
# In[6]:
print("[Intercept] ", X.columns)
print(clr.intercept_, clr.coef_)
# Prediction and scoring
# In[7]:
yp = clr.predict(X)
y_score = clr.decision_function(X)
print(y_score)
# ### Performance Metrics
# In[8]:
# ravel() order for a 2x2 confusion matrix is (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y, yp).ravel()
# In[9]:
# Fixed-width table with UNC as the positive class and Duke as negative.
print("Confusion Matrix:")
print("%32s" % "Predicted")
print("%17s" % " ", "%8s" % "UNC", "%8s" % "Duke")
print("%8s" % "Actual", "%8s" % "UNC", "%8i" % tp, "%8i" % fn)
print("%8s" % " ", "%8s" % "Duke", "%8i" % fp, "%8i" % tn)
ax.set_title('calibration curve for '+str(num_hrs)+'hours prediction')
print('\n')
print("logistic regression:")
# Confusion-matrix layout: rows = actual, cols = predicted; [1,1] is TP.
confusion = confusion_matrix(Y_test, final_predicted_logit)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print("accuracy of logistic regression ",(TP + TN) / float(TP + TN + FP + FN))
print(confusion_matrix(Y_test, final_predicted_logit))
print(classification_report(Y_test, final_predicted_logit))
sensitivity = TP / float(FN + TP)
print("sensitivity of logistic regression ", sensitivity)
specificity = TN / float(TN + FP)
print("specificity of logistic regression ",specificity)
# Min-max scale decision scores into [0, 1] so they can serve as
# pseudo-probabilities for the calibration curve.
prob_pos = clf_logi.decision_function(X_test)
prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
fraction_of_positives, mean_predicted_value = calibration_curve(Y_test, prob_pos, n_bins=10, normalize=True)
ax.plot([0, 1], [0, 1], linestyle='--', color='black', label='BASE')
# NOTE(review): calibration plots conventionally put mean_predicted_value
# on the x axis; here fraction_of_positives is on x — confirm intended.
ax.plot(fraction_of_positives, mean_predicted_value, 'r--', label='LogisticRegression')
print('\n\n')
print("ExtraTreesClassifier :")
confusion = confusion_matrix(Y_test, final_predicted_xtrees)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print("accuracy of extratrees ",(TP + TN) / float(TP + TN + FP + FN))
print(confusion_matrix(Y_test, final_predicted_xtrees))
print(classification_report(Y_test, final_predicted_xtrees))
# --- Compare scikit-learn outputs against the converted ONNX model ---
sess = rt.InferenceSession(onx.SerializeToString())
res = sess.run(None, {'float_input': X_test.astype(numpy.float32)})
# res[1] holds the probability output of the ONNX graph.
print("skl", clr.predict_proba(X_test[:1]))
print("onnx", res[1][:2])
###################################
# Raw scores and decision_function
# ++++++++++++++++++++++++++++++++
#
# Re-convert with raw_scores=True so ONNX emits margins instead of
# probabilities, matching decision_function.
initial_type = [('float_input', FloatTensorType([None, 4]))]
options = {id(clr): {'raw_scores': True}}
onx2 = convert_sklearn(clr, initial_types=initial_type, options=options, target_opset=12)
sess2 = rt.InferenceSession(onx2.SerializeToString())
res2 = sess2.run(None, {'float_input': X_test.astype(numpy.float32)})
print("skl", clr.decision_function(X_test[:1]))
print("onnx", res2[1][:2])
#################################
# **Versions used for this example**
print("numpy:", numpy.__version__)
print("scikit-learn:", sklearn.__version__)
print("onnx: ", onnx.__version__)
print("onnxruntime: ", rt.__version__)
print("skl2onnx: ", skl2onnx.__version__)
def stacking(clf, train_x, train_y, test_x, clf_name, class_num=1):
    """Produce out-of-fold stacking features for one base learner.

    clf is either the lightgbm module (clf_name == "lgb") or unused for
    the "lr"/"svm" branches, which build their own estimator.  Relies on
    module-level globals `kf` (iterable of (train_idx, test_idx) splits)
    and `folds` (number of splits) — TODO confirm where they are defined.

    Returns (oof_train, mean_test_pred, mean_cv_f1), the first two shaped
    (n, class_num).
    """
    train = np.zeros((train_x.shape[0], class_num))          # out-of-fold predictions
    test = np.zeros((test_x.shape[0], class_num))            # fold-averaged test predictions
    test_pre = np.zeros((folds, test_x.shape[0], class_num)) # per-fold test predictions
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        if clf_name == "lgb":
            # LightGBM path: clf is the lightgbm module itself.
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.5,
                'colsample_bylevel': 0.5,
                'learning_rate': 0.1,
                'scale_pos_weight': 20,
                'seed': 2018,
                'nthread': 16,
                'num_class': class_num,
                'silent': True,
            }
            num_round = 2000
            early_stopping_rounds = 100
            model = clf.train(params, train_matrix, num_round, valid_sets=test_matrix, early_stopping_rounds=early_stopping_rounds)
            pre = model.predict(te_x, num_iteration=model.best_iteration).reshape((te_x.shape[0], class_num))
            pred = model.predict(test_x, num_iteration=model.best_iteration).reshape((test_x.shape[0], class_num))
        if clf_name == "lr":
            # Logistic regression path: class probabilities as features.
            model = LogisticRegression(C=4, dual=False)
            model.fit(tr_x, tr_y)
            pre = model.predict_proba(te_x)
            pred = model.predict_proba(test_x)
        if clf_name == "svm":
            # Linear SVM path: raw decision margins as features.
            model = svm.LinearSVC()
            model.fit(tr_x, tr_y)
            pre = model.decision_function(te_x)
            pred = model.decision_function(test_x)
        # NOTE(review): if clf_name is none of lgb/lr/svm, `pre`/`pred` are
        # unbound here and this raises NameError.
        train[test_index] = pre
        test_pre[i, :] = pred
        # Macro-F1 on the held-out fold over the 19 expected classes.
        cv_scores.append(f1_score(te_y, np.argmax(pre, axis=1), labels=range(0, 19), average='macro'))
        print("%s now score is:" % clf_name, cv_scores)
    # Average the per-fold test predictions and log the CV scores.
    test[:] = test_pre.mean(axis=0)
    with open("score_cv.txt", "a") as f:
        f.write("%s now score is:" % clf_name + str(cv_scores) + "\n")
        f.write("%s_score_mean:" % clf_name + str(np.mean(cv_scores)) + "\n")
    return train.reshape(-1, class_num), test.reshape(-1, class_num), np.mean(cv_scores)
# --- Precision/recall/F1 evaluation and threshold sweep (script fragment) ---
# Depends on names defined earlier in the file: recall_score, precision,
# y_test, y_log_predict, log_reg, x_test, plt — TODO confirm upstream.
recall = recall_score(y_test, y_log_predict)

def f1_score(precision, recall):
    """Harmonic mean of precision and recall.

    Returns 0.0 when precision + recall == 0.  (The original handler caught
    every Exception and returned repr(e) — a *string* — which would poison
    any numeric use of the result; only ZeroDivisionError is expected here.)
    NOTE(review): this local def shadows sklearn.metrics.f1_score.
    """
    try:
        return 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        return 0.0

f1 = f1_score(precision, recall)
print(precision, recall, f1)
# sklearn's built-in counterparts
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
precision = precision_score(y_test, y_log_predict)
recall = recall_score(y_test, y_log_predict)
print(precision, recall)
dec_score = log_reg.decision_function(x_test)  # raw decision scores
print(np.max(dec_score), np.min(dec_score))
# Predictions at a custom threshold of 5 instead of the default 0.
y_predict2 = np.array(dec_score >= 5, dtype='int')
# Sweep the threshold across the observed score range and visualize how
# precision and recall trade off.
precisions = []
recalls = []
thresholds = np.arange(np.min(dec_score), np.max(dec_score), 0.1)
for threshold in thresholds:
    y_predict = np.array(dec_score >= threshold, dtype='int')
    precisions.append(precision_score(y_test, y_predict))
    recalls.append(recall_score(y_test, y_predict))
plt.plot(thresholds, precisions)
plt.plot(thresholds, recalls)
plt.show()
# Precision-recall curve.
plt.plot(precisions, recalls)
plt.show()
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
import plda
import keras
import os

def adaptive_snorm(scores, scores_enr, scores_test, n_cohort_enr=200, n_cohort_test=200):
    """Adaptive symmetric score normalization (s-norm style) of a score matrix.

    scores       : (n_enr, n_test) trial scores to normalize.
    scores_enr   : per-enrollment cohort scores, one row per enrollment.
    scores_test  : per-test cohort scores, one row per test segment.
    n_cohort_*   : number of top cohort scores kept per row.
    """
    # Keep only the top-n cohort scores per row (sorted descending via -sort).
    scores_enr = -np.sort(-scores_enr, axis=1)[:, :n_cohort_enr]
    scores_test = -np.sort(-scores_test, axis=1)[:, :n_cohort_test]
    # Tile the per-row cohort means/stds out to the full score-matrix shape.
    mean_enr = np.tile(np.expand_dims(np.mean(scores_enr, axis=1), axis=1), (1, scores.shape[1]))
    mean_test = np.tile(np.expand_dims(np.mean(scores_test, axis=1), axis=0), (scores.shape[0], 1))
    std_enr = np.tile(np.expand_dims(np.std(scores_enr, axis=1), axis=1), (1, scores.shape[1]))
    std_test = np.tile(np.expand_dims(np.std(scores_test, axis=1), axis=0), (scores.shape[0], 1))
    # Average of the enrollment-normalized and test-normalized scores.
    return 0.5 * ((scores - mean_enr) / std_enr + (scores - mean_test) / std_test)

def load_ivector(filename):
    # (definition truncated in this chunk)
    utt = np.loadtxt(filename,
# creating testing and training set X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33) # train scikit learn model clf = LogisticRegression() clf.fit(X_train, Y_train) print('score Scikit learn: ', clf.score(X_test, Y_test)) # visualize data, uncomment "show()" to run it pos = where(Y == 1) neg = where(Y == 0) scatter(X[pos, 0], X[pos, 1], marker='o', c='b') scatter(X[neg, 0], X[neg, 1], marker='x', c='r') xlabel('Exam 1 score') ylabel('Exam 2 score') legend(['Not Admitted', 'Admitted']) #0.807551618 -0.75983985 #0.531634773 -0.479185022 filename = 'finalized_model.sav' pickle.dump(clf, open(filename, 'wb')) y = clf.predict([[-0.869144323, 0.389309751]]) z = clf.decision_function([[-0.869144323, 0.389309751]]) print(y) print(z) scatter(-0.869144323, 0.389309751, marker='+', c='g') show()
#logistic regression model_lr = LogisticRegression(C=1000) score_list_lr = [] for train_index, test_index in kf.split(features): start_time = time.time() #train model_lr.fit(features[train_index], label[train_index]) #test score_list_lr.append(model_lr.score(features[test_index], label[test_index])) pred_lr = model_lr.predict(features[test_index]) print 'Time spent in each fold:' print time.time() - start_time #plot ROC y_score_lr = model_lr.decision_function(features[test_index]) fpr_l,tpr_l,_ = rc(label[test_index], y_score_lr) fig_21 = plt.figure() lw = 1 plt.plot(fpr_l, tpr_l, color='black', linestyle='-.', lw=lw, label='ROC curve (LogisticR)') plt.plot(fpr_m, tpr_m, color='aqua', linestyle=':', lw=lw, label='ROC curve (Multinomial)') plt.plot(fpr_g, tpr_g, color='cornflowerblue', lw=lw, label='ROC curve (Gaussian)', linestyle=':') plt.plot(fpr_h, tpr_h, color='darkorange', lw=lw, label='ROC curve (Soft SVM)') plt.plot(fpr_s, tpr_s, color='deeppink', lw=lw, label='ROC curve (Hard SVM)') plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0])
# --- Compare two fitted models (logistic regression vs kernel SVM) ---
y_hat2 = model2.predict(X)
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, y_hat1))
print(confusion_matrix(y, y_hat2))
# The two models perform identically as judged by the confusion matrices.
from sklearn.metrics import classification_report
print(classification_report(y, model1.predict(X)))
print(classification_report(y, model2.predict(X)))
# ROC curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
fpr1, tpr1, thresholds1 = roc_curve(y, model1.decision_function(X))
# NOTE(review): the unpack below reuses `thresholds1` — probably meant
# `thresholds2` (harmless here since thresholds are unused afterwards).
fpr2, tpr2, thresholds1 = roc_curve(y, model2.decision_function(X))
plt.rc('font', family="D2Coding")
plt.plot(fpr1, tpr1, 'o-', ms=2, label="Logistic Regression")
plt.plot(fpr2, tpr2, 'o-', ms=2, label="Kernel SVM")
plt.legend()
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.xlabel('위양성률(Fall-Out)')
plt.ylabel('재현률(Recall)')
plt.title('ROC 커브')
plt.show()
cnf_matrix = confusion_matrix(Y_test,y_predict)
# Hyper-parameter grid search over C / penalty / class_weight.
param_grid = [
    {
        'C':[0.01,0.1,1,10,100],
        'penalty':['l2','l1'],
        'class_weight':['balanced',None]
    }
]
grid_search = GridSearchCV(lg,param_grid,cv=10,n_jobs=-1)
grid_search.fit(X_train,Y_train)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)
# NOTE(review): lower-case x_test/y_test here vs X_train/Y_train above —
# confirm both naming variants exist upstream.
decision_scores = lg.decision_function(x_test)
from sklearn.metrics import precision_recall_curve
precisions,recalls,thresholds = precision_recall_curve(y_test,decision_scores)
# precisions/recalls have one more entry than thresholds, hence [:-1].
plt.plot(thresholds,precisions[:-1])
plt.plot(thresholds,recalls[:-1])
plt.grid()
plt.show()

def plot_cnf_matirx(cnf_matrix, description):
    # Plot helper for a binary confusion matrix.
    # NOTE(review): 'matirx' typo in the name — left as-is to avoid breaking
    # callers.  (Definition continues beyond this chunk.)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
# --- Single 80/20 shuffle split, then visualize decision scores + sigmoid ---
ss = ShuffleSplit(n_splits=1,test_size=0.2, train_size=0.8, random_state=0)
train_index, test_index = next(ss.split(X,y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
# Re-fit the same estimator with much stronger regularization (C=1e-3)
# and compare the test score.
clf.C = 1e-3
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
# Plot raw decision scores with a zero line for reference.
X_test_value = clf.decision_function(X_test)
sorted_va = np.sort(X_test_value)
plt.plot(X_test_value)
plt.plot([0,120],[0,0], linestyle='--')

def sigmoid(x):
    """Logistic sigmoid 1/(1+e^-x): maps decision scores into (0, 1)."""
    return 1 / (1 + np.exp(-x))

# Sigmoid of the sorted scores, with the 0.5 probability line marked.
plt.plot(sigmoid(sorted_va))
plt.plot([0,120], [0.5, 0.5], linestyle='--')
def probabilisticFreeChoicePilotTask_logisticRegression(reward1, target1, trial1, reward3, target3, trial3, stim_trials):
    '''
    Logistic-regression analysis of free-choice target selection.

    For each free-choice trial (trial code == 2), builds regressors from the
    previous five trials' choice/reward history (plus stimulation history for
    Block 3), fits a statsmodels binomial GLM and an sklearn
    LogisticRegression, plots ROC curves against a constant-score null model,
    and prints accuracy / AUC / cross-validation metrics.

    Target coding (targets take values in {1, 2}):
      2 - target[i]       -> 1 if low-value target chosen, else 0
      target[i] - 1       -> 1 if high-value target chosen, else 0
      2*target[i-k] - 3   -> -1 for a low-value choice, +1 for high-value;
                             multiplied by reward[i-k] (or 1 - reward[i-k])
                             to keep only rewarded (or unrewarded) history.

    Parameters (per-trial 1-D sequences, index-aligned within each block):
      reward1, target1, trial1 -- Block 1 reward flags, chosen targets, trial codes
      reward3, target3, trial3 -- the same quantities for Block 3
      stim_trials              -- 0/1 stimulation flags, indexed only for Block 3
                                  # NOTE(review): assumed aligned with trial3 --
                                  # confirm with the caller.

    Returns:
      (model_block1, model_block3, predicted_block1, predicted_block3)
    '''
    # --- Regressor accumulators: Block 1 ---
    fc_target_low_block1 = []
    fc_target_high_block1 = []
    fc_prob_low_block1 = []      # NOTE(review): never filled or used below
    prev_reward1_block1 = []
    prev_reward2_block1 = []
    prev_reward3_block1 = []
    prev_reward4_block1 = []
    prev_reward5_block1 = []
    prev_noreward1_block1 = []
    prev_noreward2_block1 = []
    prev_noreward3_block1 = []
    prev_noreward4_block1 = []
    prev_noreward5_block1 = []
    prev_stim_block1 = []        # always 0: no stimulation in Block 1
    # --- Regressor accumulators: Block 3 ---
    fc_target_low_block3 = []
    fc_target_high_block3 = []
    fc_prob_low_block3 = []      # NOTE(review): never filled or used below
    prev_reward1_block3 = []
    prev_reward2_block3 = []
    prev_reward3_block3 = []
    prev_reward4_block3 = []
    prev_reward5_block3 = []
    prev_noreward1_block3 = []
    prev_noreward2_block3 = []
    prev_noreward3_block3 = []
    prev_noreward4_block3 = []
    prev_noreward5_block3 = []
    prev_stim1_block3 = []
    prev_stim2_block3 = []
    prev_stim3_block3 = []
    prev_stim4_block3 = []
    prev_stim5_block3 = []

    # Block 1: start at i=5 so the 5-trial history window is always valid.
    for i in range(5,len(trial1)):
        if trial1[i] == 2:  # free-choice trial
            fc_target_low_block1.append(2 - target1[i])   # 1 if low-value chosen, 0 if high-value
            fc_target_high_block1.append(target1[i] - 1)  # 1 if high-value chosen, 0 if low-value
            # Rewarded history k trials back: -1 low-value & rewarded,
            # +1 high-value & rewarded, 0 if not rewarded.
            prev_reward1_block1.append((2*target1[i-1] - 3)*reward1[i-1])
            prev_reward2_block1.append((2*target1[i-2] - 3)*reward1[i-2])
            prev_reward3_block1.append((2*target1[i-3] - 3)*reward1[i-3])
            prev_reward4_block1.append((2*target1[i-4] - 3)*reward1[i-4])
            prev_reward5_block1.append((2*target1[i-5] - 3)*reward1[i-5])
            # Unrewarded history k trials back: -1 low-value & unrewarded,
            # +1 high-value & unrewarded, 0 if rewarded.
            prev_noreward1_block1.append((2*target1[i-1] - 3)*(1 - reward1[i-1]))
            prev_noreward2_block1.append((2*target1[i-2] - 3)*(1 - reward1[i-2]))
            prev_noreward3_block1.append((2*target1[i-3] - 3)*(1 - reward1[i-3]))
            prev_noreward4_block1.append((2*target1[i-4] - 3)*(1 - reward1[i-4]))
            prev_noreward5_block1.append((2*target1[i-5] - 3)*(1 - reward1[i-5]))
            prev_stim_block1.append(0)

    # Block 3: same history regressors plus stimulation history.
    num_block3 = len(trial3)
    for i in range(5,num_block3):
        if (trial3[i] == 2):  # free-choice trial
            fc_target_low_block3.append(2 - target3[i])   # 1 if low-value chosen, 0 if high-value
            fc_target_high_block3.append(target3[i] - 1)  # 1 if high-value chosen, 0 if low-value
            prev_reward1_block3.append((2*target3[i-1] - 3)*reward3[i-1])
            prev_reward2_block3.append((2*target3[i-2] - 3)*reward3[i-2])
            prev_reward3_block3.append((2*target3[i-3] - 3)*reward3[i-3])
            prev_reward4_block3.append((2*target3[i-4] - 3)*reward3[i-4])
            prev_reward5_block3.append((2*target3[i-5] - 3)*reward3[i-5])
            prev_noreward1_block3.append((2*target3[i-1] - 3)*(1 - reward3[i-1]))
            prev_noreward2_block3.append((2*target3[i-2] - 3)*(1 - reward3[i-2]))
            prev_noreward3_block3.append((2*target3[i-3] - 3)*(1 - reward3[i-3]))
            prev_noreward4_block3.append((2*target3[i-4] - 3)*(1 - reward3[i-4]))
            prev_noreward5_block3.append((2*target3[i-5] - 3)*(1 - reward3[i-5]))
            # Stimulation history: +1 if stim was delivered k trials back, -1 if not.
            prev_stim1_block3.append(2*stim_trials[i - 1] - 1)
            prev_stim2_block3.append(2*stim_trials[i - 2] - 1)
            prev_stim3_block3.append(2*stim_trials[i - 3] - 1)
            prev_stim4_block3.append(2*stim_trials[i - 4] - 1)
            prev_stim5_block3.append(2*stim_trials[i - 5] - 1)

    ''' Turn everything into an array '''
    fc_target_low_block1 = np.array(fc_target_low_block1)
    fc_target_high_block1 = np.array(fc_target_high_block1)
    prev_reward1_block1 = np.array(prev_reward1_block1)
    prev_reward2_block1 = np.array(prev_reward2_block1)
    prev_reward3_block1 = np.array(prev_reward3_block1)
    prev_reward4_block1 = np.array(prev_reward4_block1)
    prev_reward5_block1 = np.array(prev_reward5_block1)
    prev_noreward1_block1 = np.array(prev_noreward1_block1)
    prev_noreward2_block1 = np.array(prev_noreward2_block1)
    prev_noreward3_block1 = np.array(prev_noreward3_block1)
    prev_noreward4_block1 = np.array(prev_noreward4_block1)
    prev_noreward5_block1 = np.array(prev_noreward5_block1)
    prev_stim_block1 = np.array(prev_stim_block1)
    fc_target_low_block3 = np.array(fc_target_low_block3)
    fc_target_high_block3 = np.array(fc_target_high_block3)
    prev_reward1_block3 = np.array(prev_reward1_block3)
    prev_reward2_block3 = np.array(prev_reward2_block3)
    prev_reward3_block3 = np.array(prev_reward3_block3)
    prev_reward4_block3 = np.array(prev_reward4_block3)
    prev_reward5_block3 = np.array(prev_reward5_block3)
    prev_noreward1_block3 = np.array(prev_noreward1_block3)
    prev_noreward2_block3 = np.array(prev_noreward2_block3)
    prev_noreward3_block3 = np.array(prev_noreward3_block3)
    prev_noreward4_block3 = np.array(prev_noreward4_block3)
    prev_noreward5_block3 = np.array(prev_noreward5_block3)
    prev_stim1_block3 = np.array(prev_stim1_block3)
    prev_stim2_block3 = np.array(prev_stim2_block3)
    prev_stim3_block3 = np.array(prev_stim3_block3)
    prev_stim4_block3 = np.array(prev_stim4_block3)
    prev_stim5_block3 = np.array(prev_stim5_block3)
    # NOTE(review): these constant columns are unused below -- sm.add_constant
    # supplies the intercept instead.
    const_logit_block1 = np.ones(fc_target_low_block1.size)
    const_logit_block3 = np.ones(fc_target_low_block3.size)

    ''' Oraganize data and regress with GLM '''
    # Stack regressors into (n_trials, n_features) design matrices.
    x = np.vstack((prev_reward1_block1,prev_reward2_block1,prev_reward3_block1,prev_reward4_block1,prev_reward5_block1,
        prev_noreward1_block1,prev_noreward2_block1,prev_noreward3_block1,prev_noreward4_block1,prev_noreward5_block1))
    x = np.transpose(x)
    # NOTE(review): prepend expects a bool; the string 'False' is truthy, so
    # the constant column is PREPENDED here -- confirm this was intended.
    x = sm.add_constant(x,prepend='False')
    y = np.vstack((prev_reward1_block3,prev_reward2_block3,prev_reward3_block3,prev_reward4_block3,prev_reward5_block3,
        prev_noreward1_block3,prev_noreward2_block3,prev_noreward3_block3,prev_noreward4_block3,prev_noreward5_block3,
        prev_stim1_block3, prev_stim2_block3, prev_stim3_block3, prev_stim4_block3, prev_stim5_block3))
    y = np.transpose(y)
    y = sm.add_constant(y,prepend='False')
    # Binomial GLM == logistic regression on low-value choice.
    model_glm_block1 = sm.GLM(fc_target_low_block1,x,family = sm.families.Binomial())
    model_glm_block3 = sm.GLM(fc_target_low_block3,y,family = sm.families.Binomial())
    fit_glm_block1 = model_glm_block1.fit()
    fit_glm_block3 = model_glm_block3.fit()
    print fit_glm_block1.predict()

    ''' Oraganize data and regress with LogisticRegression '''
    # Patsy/sklearn pipeline predicts the HIGH-value choice (unlike the GLM
    # above, which modeled the low-value choice).
    d_block1 = {'target_selection': fc_target_high_block1,
        'prev_reward1': prev_reward1_block1,
        'prev_reward2': prev_reward2_block1,
        'prev_reward3': prev_reward3_block1,
        'prev_reward4': prev_reward4_block1,
        'prev_reward5': prev_reward5_block1,
        'prev_noreward1': prev_noreward1_block1,
        'prev_noreward2': prev_noreward2_block1,
        'prev_noreward3': prev_noreward3_block1,
        'prev_noreward4': prev_noreward4_block1,
        'prev_noreward5': prev_noreward5_block1}
    df_block1 = pd.DataFrame(d_block1)
    y_block1, X_block1 = dmatrices('target_selection ~ prev_reward1 + prev_reward2 + prev_reward3 + \
        prev_reward4 + prev_reward5 + prev_noreward1 + prev_noreward2 + \
        prev_noreward3 + prev_noreward4 + prev_noreward5', df_block1, return_type = "dataframe")
    #print X_block1.columns
    # flatten y_block1 into 1-D array
    y_block1 = np.ravel(y_block1)
    d_block3 = {'target_selection': fc_target_high_block3,
        'prev_reward1': prev_reward1_block3,
        'prev_reward2': prev_reward2_block3,
        'prev_reward3': prev_reward3_block3,
        'prev_reward4': prev_reward4_block3,
        'prev_reward5': prev_reward5_block3,
        'prev_noreward1': prev_noreward1_block3,
        'prev_noreward2': prev_noreward2_block3,
        'prev_noreward3': prev_noreward3_block3,
        'prev_noreward4': prev_noreward4_block3,
        'prev_noreward5': prev_noreward5_block3,
        'prev_stim1': prev_stim1_block3,
        'prev_stim2': prev_stim2_block3,
        'prev_stim3': prev_stim3_block3,
        'prev_stim4': prev_stim4_block3,
        'prev_stim5': prev_stim5_block3}
    df_block3 = pd.DataFrame(d_block3)
    y_block3, X_block3 = dmatrices('target_selection ~ prev_reward1 + prev_reward2 + prev_reward3 + \
        prev_reward4 + prev_reward5 + prev_noreward1 + prev_noreward2 + \
        prev_noreward3 + prev_noreward4 + prev_noreward5 + prev_stim1 + \
        prev_stim2 + prev_stim3 + prev_stim4 + prev_stim5', df_block3, return_type = "dataframe")
    # flatten y_block3 into 1-D array
    y_block3 = np.ravel(y_block3)

    # Split data into train and test sets
    X_block1_train, X_block1_test, y_block1_train, y_block1_test = train_test_split(X_block1,y_block1,test_size = 0.3, random_state = 0)
    X_block3_train, X_block3_test, y_block3_train, y_block3_test = train_test_split(X_block3,y_block3,test_size = 0.3, random_state = 0)
    # instantiate a logistic regression model, and fit with X and y training sets
    model_block1 = LogisticRegression()
    model_block3 = LogisticRegression()
    model_block1 = model_block1.fit(X_block1_train, y_block1_train)
    model_block3 = model_block3.fit(X_block3_train, y_block3_train)
    # Signed distances from the decision boundary, used as ROC scores.
    y_block1_score = model_block1.decision_function(X_block1_test)
    y_block3_score = model_block3.decision_function(X_block3_test)
    # Constant scores give the chance-level "null" ROC for comparison.
    y_block1_nullscore = np.ones(len(y_block1_score))
    y_block3_nullscore = np.ones(len(y_block3_score))
    # Compute ROC curve and ROC area for each class (low value and high value)
    '''
    fpr_block1 = dict()
    tpr_block1 = dict()
    fpr_block3 = dict()
    tpr_block3 = dict()
    roc_auc_block1 = dict()
    roc_auc_block3 = dict()
    '''
    fpr_block1, tpr_block1, thresholds_block1 = roc_curve(y_block1_test,y_block1_score)
    roc_auc_block1 = auc(fpr_block1,tpr_block1)
    fpr_block3, tpr_block3, thresholds_block3 = roc_curve(y_block3_test,y_block3_score)
    roc_auc_block3 = auc(fpr_block3,tpr_block3)
    fpr_null_block1, tpr_null_block1, thresholds_null_block1 = roc_curve(y_block1_test,y_block1_nullscore)
    roc_nullauc_block1 = auc(fpr_null_block1,tpr_null_block1)
    fpr_null_block3, tpr_null_block3, thresholds_null_block3 = roc_curve(y_block3_test,y_block3_nullscore)
    roc_nullauc_block3 = auc(fpr_null_block3,tpr_null_block3)

    # Overlay model and null ROC curves for both blocks.
    plt.figure()
    plt.plot(fpr_block1,tpr_block1,'r',label="Block 1 (area = %0.2f)" % roc_auc_block1)
    plt.plot(fpr_null_block1,tpr_null_block1,'r--',label="Block 1 - Null (area = %0.2f)" % roc_nullauc_block1)
    plt.plot(fpr_block3,tpr_block3,'m',label="Block 3 (area = %0.2f)" % roc_auc_block3)
    plt.plot(fpr_null_block3,tpr_null_block3,'m--',label="Block 3 - Null (area = %0.2f)" % roc_nullauc_block3)
    plt.plot([0,1],[0,1],'b--')
    #plt.plot(fpr_block1[1],tpr_block1[1],label="Class HV (area = %0.2f)" % roc_auc_block1[1])
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc=4)
    plt.show()

    # Predict class labels for the test set
    predicted_block1 = model_block1.predict(X_block1_test)
    probs_block1 = model_block1.predict_proba(X_block1_test)
    predicted_block3 = model_block3.predict(X_block3_test)
    probs_block3 = model_block3.predict_proba(X_block3_test)
    # Generate evaluation metrics
    print "Block 1 accuracy:", metrics.accuracy_score(y_block1_test, predicted_block1)
    print "Block 1 ROC area under curve:", metrics.roc_auc_score(y_block1_test, probs_block1[:,1])
    # Null accuracy = accuracy of always predicting the majority class.
    print 'Null accuracy rate for Block 1:',np.max([y_block1_test.mean(),1 - y_block1_test.mean()])
    print "Block 3 accuracy:", metrics.accuracy_score(y_block3_test, predicted_block3)
    print "Block 3 ROC area under curve:", metrics.roc_auc_score(y_block3_test, probs_block3[:,1])
    print 'Null accuracy rate for Block 3:',np.max([y_block3_test.mean(),1 - y_block3_test.mean()])
    # Model evaluation using 10-fold cross-validation
    scores_block1 = cross_val_score(LogisticRegression(),X_block1,y_block1,scoring='accuracy',cv=10)
    scores_block3 = cross_val_score(LogisticRegression(),X_block3,y_block3,scoring='accuracy',cv=10)
    print "Block 1 CV scores:", scores_block1
    print "Block 1 Avg CV score:", scores_block1.mean()
    print "Block 3 CV scores:", scores_block3
    print "Block 3 Avg CV score:", scores_block3.mean()
    '''
    # check the accuracy on the training set
    print 'Model accuracy for Block1:',model_block1.score(X_block1, y_block1)
    print 'Null accuracy rate for Block1:',np.max([y_block1.mean(),1 - y_block1.mean()])
    print 'Model accuracy for Block3:',model_block3.score(X_block3, y_block3)
    print 'Null accuracy rate for Block3:',np.max([y_block3.mean(),1 - y_block3.mean()])
    '''
    # examine the coefficients
    print pd.DataFrame(zip(X_block1.columns, np.transpose(model_block1.coef_)))
    print pd.DataFrame(zip(X_block3.columns, np.transpose(model_block3.coef_)))
    #return fit_glm_block1, fit_glm_block3
    return model_block1, model_block3, predicted_block1, predicted_block3
# 特征选择 target = np.array(data.Class.tolist()) feathers = data.drop(['Time', 'Class'], axis=1).values # 划分训练集和测试集 train_x, test_x, train_y, test_y = train_test_split(feathers, target, test_size=0.1, random_state=33) # 逻辑回归模型训练 lg = LogisticRegression() lg.fit(train_x, train_y) predict_y = lg.predict(test_x) # 模型评估 # 预测样本的置信分数 score_y = lg.decision_function(test_x) # 计算混淆矩阵并显示 cm = confusion_matrix(test_y, predict_y) plt.figure() plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) plt.title('Confusion matrix') plt.colorbar() trick_marks = [0, 1] plt.xticks(trick_marks, rotation = 0) plt.yticks(trick_marks) thresh = cm.max() / 2 for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, cm[i, j], horizontalalignment = 'center', color = 'white' if cm[i, j] > thresh else 'black') plt.tight_layout()
class MITLLStringMatcher(BaseEstimator,ClassifierMixin):
    """
    MIT-LL String Matcher as Sklearn Estimator:

    Computes a raw string-similarity score for each input pair and
    Platt-scales it with a logistic regression to produce calibrated
    match probabilities.

    String Matching Techniques:
        - Levenshtein Distance
        - Jaro-Winkler
        - Soft TF-IDF
    """

    # Logging
    LOG_LEVEL = logging.INFO
    logging.basicConfig(level=LOG_LEVEL,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    def __init__(self,algorithm='jw', stf_thresh=0.6, idf_model=None, text_normalizer = None):
        """Store hyperparameters only (sklearn convention: no validation here).

        algorithm       -- 'lev', 'jw', or 'stf'
        stf_thresh      -- internal jaro-winkler threshold for soft tf-idf
        idf_model       -- dict with 'idf', 'corpus_vocab', 'oov_idf_val' (for 'stf')
        text_normalizer -- 'latin' or None
        """
        self.algorithm = algorithm
        self.stf_thresh = stf_thresh
        self.idf_model = idf_model
        self.text_normalizer = text_normalizer

    #
    # Basic String Matching Functions
    #
    def levenshtein_similarity(self,s,t):
        """ Levenshtein Similarity: 1 - distance normalized by the longer string. """
        Ns = len(s); Nt = len(t);
        lev_sim = 1.0 - (jellyfish.levenshtein_distance(s,t))/float(max(Ns,Nt))
        return lev_sim

    def jaro_winkler_similarity(self,s,t):
        """ Jaro-Winkler Similarity (delegates to jellyfish). """
        jw_sim = jellyfish.jaro_winkler(s,t)
        return jw_sim

    def soft_tfidf_similarity(self,s,t):
        """
        Soft TFIDF Similarity:

        This similarity measure is only meaningful when you have multi-word strings.
        For single words, this measure will return 0.0

        Symmetrized by averaging score(s,t) and score(t,s).
        """
        stf = self.hyparams['matcher'] #soft tf-idf object
        tfidf_sim = 0.5*(stf.score(s,t)+stf.score(t,s))
        return tfidf_sim

    #
    # Utitlity Functions
    #
    def init_hyparams(self):
        """ Initialize hyper-parameters dict from the constructor attributes. """
        self.hyparams = dict()
        self.hyparams['match_fcn'] = None
        self.hyparams['algo'] = self.algorithm
        self.hyparams['txt_normer'] = self.text_normalizer

        if self.algorithm == 'lev': #levenshtein
            self.hyparams['match_fcn'] = self.levenshtein_similarity
        elif self.algorithm== 'jw': #jaro-winkler
            self.hyparams['match_fcn'] = self.jaro_winkler_similarity
        elif self.algorithm== 'stf': #softtfidf
            self.hyparams['match_fcn'] = self.soft_tfidf_similarity
            self.hyparams['stf_thresh'] = self.stf_thresh
            self.hyparams['idf_model'] = self.idf_model

    def validate_hyparams(self):
        """ Basic hyperparameter input validation; raises ValueError on any bad value. """
        if self.hyparams['algo'] not in set(['lev','jw','stf']):
            raise ValueError("Value of algorithm has to be either 'lev','jw' or 'stf'. Got {0}".format(self.hyparams['algo']))

        if self.hyparams['txt_normer'] not in set(['latin',None]):
            raise ValueError("The only value of txt_normer currently support is 'latin' (or None)")

        if self.hyparams['algo'] == 'stf':
            # NOTE(review): bitwise | rather than boolean `or` -- works for
            # scalar comparisons but confirm stf_thresh is always a scalar.
            if (self.hyparams['stf_thresh'] < 0) | (self.hyparams['stf_thresh'] > 1):
                raise ValueError("Value of soft tf-idf's internal jaro-winkler threshold", \
                        "must be [0,1].")

            if self.hyparams['idf_model']:
                if set(self.hyparams['idf_model'].keys()) != set(['idf','corpus_vocab','oov_idf_val']):
                    raise ValueError("IDF model provided must contain only the following keys: ", \
                            "'idf', 'corpus_vocab', and 'oov_idf_val'.")

                if (not isinstance(self.hyparams['idf_model']['idf'],np.ndarray)) or \
                        (self.hyparams['idf_model']['idf'].dtype.type is not np.float64):
                    raise ValueError("idf_model['idf'] must be an np.ndarray of dtype np.float64")

                if not isinstance(self.hyparams['idf_model']['corpus_vocab'],dict):
                    raise ValueError("idf_model['corpus_vocab'] must be a dict.")

                if not isinstance(self.hyparams['idf_model']['oov_idf_val'],float):
                    raise ValueError("idf_model['oov_idf_val'] must be a float.")

    def init_algorithm(self):
        """ Validate hyperparameter inputs, init matcher object if neccessary"""
        self.validate_hyparams()

        # Initialize Soft TF-IDF matcher if needed
        if self.hyparams['algo'] == 'stf': #softtfidf
            self.hyparams['matcher'] = Softtfidf(self.hyparams['stf_thresh'],self.hyparams['idf_model'])

        if self.hyparams['txt_normer'] == 'latin':
            self.normalizer = normutils.latin_normalization.MITLLLatinNormalizer()
        else:
            self.normalizer = normutils.text_normalization.MITLLTextNormalizer() #generic normer

    def get_raw_similarities(self, X, y=None):
        """ Convert input pairs to raw similarity scores.

        X -- sequence of (s, t) string pairs.
        y -- optional 0/1 labels; pairs that normalize to empty strings get
             similarity 0.0 and their label is flagged -1 for later removal.
        Returns an (n, 1) array, or (array, y) when y was given.
        NOTE(review): `if y:` is False for an empty list and ambiguous for a
        numpy array -- callers appear to pass plain lists; confirm.
        """
        #make sure we have [0,1] class encoding in y
        if y:
            if set(y) != set((0,1)):
                raise ValueError("y expects class labels to be from {0,1}")

        similarities = list()

        for i in xrange(len(X)):
            pair = X[i]
            # Normalize then decode to unicode (Python 2).
            s = unicode(self.normalizer.normalize(pair[0]),'utf-8')
            t = unicode(self.normalizer.normalize(pair[1]),'utf-8')

            if (len(s) > 0) and (len(t) > 0):
                sim = self.hyparams['match_fcn'](s,t)
                similarities.append(sim)
            else:
                similarities.append(0.0)
                if y: y[i] = -1 #set y-value of non-conforming pair to -1

        # Column vector: the single feature fed to the calibration LR.
        sims_array = np.asarray(similarities).reshape(-1,1)

        if y:
            return (sims_array,y)
        else:
            return sims_array

    def save_model(self,fnameout):
        """ Save model parameters out after fitting.

        NOTE(review): accesses self.lr_ directly -- raises AttributeError (not
        the intended ValueError) if called before fit()/load_model().
        """
        if self.lr_:
            model_out = dict()
            model_out['algo'] = self.hyparams['algo']
            model_out['txt_normer'] = self.hyparams['txt_normer']
            model_out['calibration'] = self.lr_
            if self.hyparams['algo'] == 'stf':
                model_out['stf_thresh'] = self.hyparams['stf_thresh']
                model_out['idf_model'] = self.hyparams['idf_model']

            pickle.dump(model_out,open(fnameout,"wb"))
            return self
        else:
            raise ValueError("save_model failed: No model has yet been fit or loaded.")

    def load_model(self,fnamein):
        """ Load model parameters previously written by save_model. """
        model_in = pickle.load(open(fnamein,'rb')) # will throw I/O error if file not found

        self.init_hyparams() #initialize hyper-parameter dict
        self.hyparams['algo'] = model_in['algo']
        self.hyparams['txt_normer'] = model_in['txt_normer']
        self.lr_ = model_in['calibration']
        if model_in['algo'] == 'stf':
            self.hyparams['stf_thresh'] = model_in['stf_thresh']
            self.hyparams['idf_model'] = model_in['idf_model']

        self.init_algorithm() #validate hyparams (we assume object not fit when load_model called)

        return self

    #
    # Learning
    #
    def fit(self,X,y):
        """ Fit string matching models to training data
        Assuming X is list of tuples: (('s1',t1'),...,('sN',tN'))

        Computes raw similarities, drops pairs flagged non-conforming (-1),
        then Platt-scales the scores with an L1 logistic regression.
        """
        y = y[:] #shallow copy y, b/c in-place operations to follow

        # Initialize hyper-parameter dict then algorithm
        self.init_hyparams(); self.init_algorithm()

        # Get string match scores
        (s,y) = self.get_raw_similarities(X,y)

        # Get rid of any non-conforming pairs
        data = zip(s,y)
        for pair in reversed(data): #iterate backwards to remove items from "data"
                                    #so as not to mess up internal indexing of for-loop
            if pair[1] == -1:
                data.remove(pair)
        (s,y) = zip(*data)

        # Do Platt Scaling
        self.lr_ = LR(penalty='l1',class_weight='balanced')
        self.lr_.fit(s,y)

        return self

    #
    # Inference
    #
    def decision_function(self,X):
        """ Take input data, turn into decision """
        s = self.get_raw_similarities(X)

        return self.lr_.decision_function(s)

    def predict(self,X):
        """ Class predictions """
        s = self.get_raw_similarities(X)

        return self.lr_.predict(s)

    def predict_proba(self,X):
        """ Posterior match probabilities (need this for log-loss for CV """
        s = self.get_raw_similarities(X)

        return self.lr_.predict_proba(s)

    #
    # Evaluate
    #
    def score(self,X,y,sample_weight=None):
        """ Score matcher: ROC AUC of hard predictions against y. """
        return roc_auc_score(y,self.predict(X),sample_weight=sample_weight)
# NOTE(review): this chunk begins mid-expression -- `columns=[...]` closes a
# pd.DataFrame( call that starts before the visible source.
columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] )
df["target"] = dataset.target
print(df.head())
print(df.tail())
print(df.info())
print(df.shape)
# All feature columns vs. the target column.
X = df.iloc[: , :-1]
y = df.iloc[: , -1]
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=160)
model = LogisticRegression()
model.fit(X_train, y_train)
# One-vs-rest ROC per class: decision_function returns one score column per
# class for a multiclass LogisticRegression; pos_label selects the class
# treated as positive.
fpr0, tpr0, thresholds0 = roc_curve(y_test, model.decision_function(X_test)[:, 0], pos_label=0)
fpr1, tpr1, thresholds1 = roc_curve(y_test, model.decision_function(X_test)[:, 1], pos_label=1)
fpr2, tpr2, thresholds2 = roc_curve(y_test, model.decision_function(X_test)[:, 2], pos_label=2)
print(fpr0, tpr0, thresholds0)
plt.plot(fpr0, tpr0, "r-", label="class 0 ")
plt.plot(fpr1, tpr1, "g-", label="class 1")
plt.plot(fpr2, tpr2, "b-", label="class 2")
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.xlim(-0.05, 1.0)
plt.ylim(0, 1.05)
plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
# ROC curves for the non-logistic models ("los" is the binary label; the
# probability arrays and fpr_l/tpr_l for logistic regression are computed
# earlier, outside this view).
fpr_c, tpr_c, th_c = roc_curve(test["los"], test_cart_prob[::,1])
fpr_s, tpr_s, th_s = roc_curve(test["los"], test_ad_prob[::,1])
fpr_n, tpr_n, th_n = roc_curve(test["los"], test_naive_prob[::,1])
plt.plot(fpr_l, tpr_l)
plt.plot(fpr_c, tpr_c)
plt.plot(fpr_s, tpr_s)
plt.plot(fpr_n, tpr_n)
# Diagonal reference built from sorted uniform samples (approximates y = x).
plt.plot(sorted(np.random.uniform(0, 1, len(test['los']))),sorted(np.random.uniform(0, 1, len(test['los']))),'r--',color='k')
plt.legend(['LogisticRegress', 'CART', 'AdBoosting', 'NaiveBayes','Randomness'], loc='lower right')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

#precision-recall
# NOTE(review): in all three PR plots below, recall is plotted on x and
# precision on y, but the axis labels say the opposite -- confirm and fix
# labels in the full source.
#log
precision_l, recall_l, thresholds_l = precision_recall_curve(test["los"], log.decision_function(test_variables))
pl.plot(recall_l, precision_l)
pl.xlabel("precision")
pl.ylabel("recall")
pl.title("LogisticRegression")
pl.show()
#cart
precision_c, recall_c, thresholds_c = precision_recall_curve(test["los"], test_cart_prob[::,1])
pl.plot(recall_c, precision_c)
pl.xlabel("precision")
pl.ylabel("recall")
pl.title("CART")
pl.show()
#ad
precision_ad, recall_ad, thresholds_ad = precision_recall_curve(test["los"], ad.decision_function(test_variables))
pl.plot(recall_ad, precision_ad)
class TagSelector:
    """Ranks candidate tag words by blending LogisticRegression and LinearSVC
    decision scores, both trained on a tab-separated feature file."""

    def __init__(self, trainfeatures, tags):
        """Parse the training feature file and fit the three classifiers.

        trainfeatures -- path to TSV: word \t features... \t target
        tags          -- NOTE(review): unused in the visible body -- confirm.
        """
        words, samples, targets = [], [], []
        positive = 0
        for word, sample, target in self.parse_features(trainfeatures, True):
            words.append(word)
            samples.append(sample)
            targets.append(target)
            # balanced set
            #if target == 1:
                #words.append(word)
                #samples.append(sample)
                #targets.append(target)
                #positive += 1
            #elif positive > 0:
                #words.append(word)
                #samples.append(sample)
                #targets.append(target)
                #positive = max(positive-1, 0)
        samples, targets = np.array(samples), np.array(targets)
        #self.scaler = StandardScaler()
        #samples = self.scaler.fit_transform(samples)
        # Fit and time each model.
        start_t = time()
        self.logit_fit = LogisticRegression().fit(samples, targets)
        end_t = time()
        print("Logit fitted in (%f s)" % (end_t-start_t))
        start_t = time()
        self.svm_fit = svm.LinearSVC().fit(samples, targets)
        end_t = time()
        print("SVM fitted in (%f s)" % (end_t-start_t))
        start_t = time()
        self.nb_fit = GaussianNB().fit(samples, targets)
        end_t = time()
        print("NB fitted in (%f s)" % (end_t-start_t))
        print("Training set size: %d" % len(samples))

    def parse_features(self, featuresfile, is_training):
        """Yield (word, feature_vector, target_or_id) per TSV line.

        The last column is the integer target when is_training, otherwise a
        sample-group id.
        """
        with open(featuresfile, 'r') as f:
            for line in f:
                vec = line.split("\t")
                word = vec[0]
                sample = [float(vec[i]) for i in range(1, len(vec)-1)]
                if is_training:
                    target = int(vec[-1])
                    yield word, sample, target
                else:
                    id = int(vec[-1])
                    yield word, sample, id

    def next_sample(self, featurefile):
        """Group consecutive rows sharing an id; yield (samples, words, id).

        NOTE(review): a leading group with id == 0 would be silently dropped
        -- confirm ids start at 1.
        """
        cur_id = 0
        samples, words = [], []
        for word, vec, id in self.parse_features(featurefile, False):
            if id != cur_id:
                if cur_id != 0:
                    yield samples, words, cur_id
                cur_id = id
                samples, words = [vec], [word]
            else:
                samples.append(vec)
                words.append(word)
        if cur_id != 0:
            yield samples, words, cur_id

    def rank(self, samples, words):
        """Return words sorted best-first by a 0.7*SVM + 0.3*logit score blend;
        with 3 or fewer candidates the input order is returned unchanged."""
        samples, preds = np.array(samples), []
        # no need to rank if 3 or less candidates
        if len(words) > 3:
            start_t = time()
            #samples = self.scaler.transform(samples)
            preds_lgt = self.logit_fit.decision_function(samples)
            preds_svm = self.svm_fit.decision_function(samples)
            #preds_nb = self.nb_fit.predict_proba(samples)
            #preds_nb = np.array([x[1] for x in preds_nb])
            preds = 0.7*preds_svm + 0.3*preds_lgt # + 0.05*preds_nb
            end_t = time()
            #print("Predictions made in (%f s)" % (end_t-start_t))
            #results = zip(words, preds_nb)
            #return [w for w, _ in sorted(results, key = lambda x: x[1][1], reverse=True)]
            results = zip(words, preds)
            return [w for w, _ in sorted(results, key = lambda x: x[1], reverse=True)]
        return words
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b') # Use this token pattern to keep single-letter words # First, learn vocabulary from the training data and assign columns to words # Then convert the training data into a sparse matrix train_matrix = vectorizer.fit_transform(train_data['review_clean']) # Second, convert the test data into a sparse matrix, using the same word-column mapping test_matrix = vectorizer.transform(test_data['review_clean']) print "Start logisitc regression. Don't worry it will take some time" #Train a sentiment classifier with logistic regression from sklearn.linear_model import LogisticRegression majority_model = LogisticRegression() majority_model.fit(train_matrix, train_data['sentiment']) #Find the most positive (and negative) review scores_test = majority_model.decision_function(test_matrix) pred_test = pred(scores_test) #Compute accuracy of the classifier print "Compute accuracy of the classifier. It will take a while so take a break" accuracy_test = compute_accuracy(test_data['sentiment'], pred_test) print("The accuracy on the test data is: %.3f" %accuracy_test) #Answer 0.843
# Uncertainty-sampling active learning (Python 2): split the test set into an
# unlabeled pool and a held-out test set, then iteratively move the samples
# the classifier is least certain about into the training set.
# NOTE(review): the magic index 1992 presumably matches the dataset size --
# confirm against the loader.
X_test_unlabeled_pool = X_test[:1992, :]
X_test_test = X_test[1992:, :]
y_test_unlabeled_pool = y_test[:1992, -1]
y_test_test = y_test[1992:, -1]
acc = []
train_acc = []
dim = []
for k in range(0, 10, 1):
    clf = LogisticRegression()
    print 'Size of X: ', len(X_train), X_train.shape, type(X_train)
    clf.fit(X_train, y_train[:, -1])
    # |decision_function| is smallest for the samples closest to the
    # boundary, i.e. the most uncertain ones.
    preds = clf.decision_function(X_test_unlabeled_pool)
    values = []
    positions = []
    for i in range(0, len(X_test_unlabeled_pool), 1):
        values.append(abs(preds[i]))
        positions.append(i)
    # Move the 10 most uncertain pool samples into the training set.
    for i in range(10):
        pos = np.array(values).argmin()
        # print np.array(values).min()
        # Grow the training arrays by one row and append the selected sample.
        X_train_new = np.zeros(((X_train.shape[0] + 1), X_train.shape[1]))
        y_train_new = np.zeros(((y_train[:, -1].shape[0] + 1), 1))
        X_train_new[:X_train.shape[0]] = X_train
        X_train_new[X_train.shape[0]:] = X_test_unlabeled_pool[pos, :]
        # NOTE(review): the loop body continues beyond this visible chunk
        # (y_train_new fill, pool removal, etc.) -- see the full source.
show_dataset(X, Y) # Split dataset X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25) # Create and train Gaussian Naive Bayes classifier gnb = GaussianNB() gnb.fit(X_train, Y_train) # Create and train a Logistic regressor (for comparison) lr = LogisticRegression() lr.fit(X_train, Y_train) # Compute ROC Curve Y_gnb_score = gnb.predict_proba(X_test) Y_lr_score = lr.decision_function(X_test) fpr_gnb, tpr_gnb, thresholds_gnb = roc_curve(Y_test, Y_gnb_score[:, 1]) fpr_lr, tpr_lr, thresholds_lr = roc_curve(Y_test, Y_lr_score) # Plot ROC Curve plt.figure(figsize=(30, 25)) plt.plot(fpr_gnb, tpr_gnb, color='red', label='Naive Bayes (AUC: %.2f)' % auc(fpr_gnb, tpr_gnb)) plt.plot(fpr_lr, tpr_lr, color='green', label='Logistic Regression (AUC: %.2f)' % auc(fpr_lr, tpr_lr))
cm_logistic # #### 14. What does the Confusion Matrix tell us? '''Our model has a higher proportion of false negatives to positives than false positives to negatives. The model handles negatives better. This actually is disturbing because in a disaster, all else being equal, we would probably want to err on overestimating the survivors (false positives) and minimizing false negatives. The headlines would probably be more oriented towards government waste rather than government tragedy (deaths that could have been prevented due to being prepared for a number of survivors), which is preferable.''' # #### 15. Plot the ROC curve Y_score_lr = lr.decision_function(X_test) FPR_logistic = dict() #false positive rate. X-axis for ROC Curve TPR_logistic = dict() #true positive rate. Y-axis for ROC curve ROC_AUC = dict() FPR_logistic[1], TPR_logistic[1], thresholds_logistic = metrics.roc_curve(y_test, Y_score_lr) ROC_AUC[1] = metrics.auc(FPR_logistic[1], TPR_logistic[1]) plt.figure() plt.plot(FPR_logistic[1], TPR_logistic[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1]) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate')