class QuadraticDiscriminantAnalysiscls(object):
    """Thin wrapper around sklearn's QuadraticDiscriminantAnalysis.

    Caches the training/test data so that accuracy can be computed later
    without re-passing the features.  Exceptions are reported via
    traceback and swallowed (best-effort API preserved from the original).
    """

    def __init__(self):
        self.qda_cls = QuadraticDiscriminantAnalysis()
        self.prediction = None
        self.train_x = None
        self.train_y = None
        # Set by predict(); read by accuracy_score().  Initialized here so
        # the attribute always exists (the original created it lazily).
        self.test_x = None

    def train_model(self, train_x, train_y):
        """Fit the underlying QDA model on the given training data."""
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.qda_cls.fit(train_x, train_y)
        except Exception:
            # Narrowed from a bare except, which would also swallow
            # KeyboardInterrupt/SystemExit.
            print(traceback.format_exc())

    def predict(self, test_x):
        """Predict labels for ``test_x``; caches both input and output."""
        try:
            self.test_x = test_x
            self.prediction = self.qda_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        """Mean accuracy on the most recently predicted test set."""
        try:
            return self.qda_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
def create_symbol_forecast_model(self):
    """Fit a QDA model that forecasts market direction from lagged returns.

    Builds a 5-lag return series for the first symbol, trains on dates
    before ``model_start_test_date`` using the previous two days' returns
    ("Lag1", "Lag2") as predictors and "Direction" as the response,
    prints the out-of-sample error rate, and returns the fitted model.
    """
    # Lagged series of the S&P500 US stock market index.
    lagged = create_lagged_series(
        self.symbol_list[0], self.model_start_date, self.model_end_date, lags=5
    )

    # Prior two days of returns predict the direction response.
    predictors = lagged[["Lag1", "Lag2"]]
    response = lagged["Direction"]

    # Date-based train/test split (each part is a Series/DataFrame slice).
    split_date = self.model_start_test_date
    x_train = predictors[predictors.index < split_date]
    x_test = predictors[predictors.index >= split_date]
    y_train = response[response.index < split_date]
    y_test = response[response.index >= split_date]

    qda = QuadraticDiscriminantAnalysis()
    qda.fit(x_train, y_train)

    # predict() returns an ndarray; compare it against the held-out series.
    pred_test = qda.predict(x_test)
    print("Error Rate is {0}".format((y_test != pred_test).sum() * 1. / len(y_test)))
    return qda
class SNPForecastingStrategy(Strategy):
    """
    Requires:
    symbol - A stock symbol on which to form a strategy on.
    bars - A DataFrame of bars for the above symbol.
    """

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Create training/test periods."""
        self.start_train = datetime.datetime(2001, 1, 10)
        self.start_test = datetime.datetime(2005, 1, 1)
        self.end_period = datetime.datetime(2005, 12, 31)

    def fit_model(self):
        """Fits a Quadratic Discriminant Analyser to the US stock market
        index (^GPSC in Yahoo)."""
        # Create a lagged series of the S&P500 US stock market index.
        snpret = create_lagged_series(self.symbol, self.start_train,
                                      self.end_period, lags=5)

        # Use the prior two days of returns as predictor values, with
        # direction as the response.
        X = snpret[["Lag1", "Lag2"]]
        y = snpret["Direction"]

        # Training set: all dates strictly before the test period.
        X_train = X[X.index < self.start_test]
        y_train = y[y.index < self.start_test]

        # Predicting factors for use in direction forecasting.
        self.predictors = X[X.index >= self.start_test]

        # Create and fit the Quadratic Discriminant Analysis model.
        self.model = QuadraticDiscriminantAnalysis()
        self.model.fit(X_train, y_train)

    def generate_signals(self):
        """Returns the DataFrame of symbols containing the signals
        to go long, short or hold (1, -1 or 0)."""
        signals = pd.DataFrame(index=self.bars.index)
        signals['signal'] = 0.0

        # Predict the subsequent period with the QDA model.
        signals['signal'] = self.model.predict(self.predictors)

        # Zero the first five signals to eliminate NaN issues from the
        # lagged series.  BUG FIX: the original used chained indexing
        # (signals['signal'][0:5] = 0.0), which raises
        # SettingWithCopyWarning and may silently fail to write back.
        signals.iloc[0:5, signals.columns.get_loc('signal')] = 0.0

        signals['positions'] = signals['signal'].diff()
        return signals
def doQDA(x, digits, s):
    """Fit LDA on the first *s* PCA components and score the test set.

    x      -- center_matrix_SVD-style object exposing PCA, centers and V
    digits -- dataset with train_Labels, test_Images and test_Labels
    s      -- number of principal components to keep
    Returns the error rates produced by class_error_rate.
    """
    classifier = LDA()
    classifier.fit(x.PCA[:, :s], digits.train_Labels)

    # Center the test images and project them onto the first s right
    # singular vectors so they live in the same reduced space as the
    # training data.
    projected = (digits.test_Images - x.centers) @ np.transpose(x.V[:s, :])

    predicted = classifier.predict(projected)
    return class_error_rate(predicted.reshape(1, predicted.shape[0]),
                            digits.test_Labels)
def confusion(digits, n_components=50):
    """Print the confusion matrix of an LDA classifier trained on the
    first ``n_components`` PCA components of the training images.

    digits       -- dataset exposing train_Images/train_Labels and
                    test_Images/test_Labels
    n_components -- number of principal components to keep (default 50,
                    the value previously hard-coded)
    """
    myLDA = LDA()
    x = center_matrix_SVD(digits.train_Images)
    myLDA.fit(x.PCA[:, :n_components], digits.train_Labels)

    # Center the test images and project them into the same reduced space.
    newtest = digits.test_Images - x.centers
    newtest = newtest @ np.transpose(x.V[:n_components, :])

    labels = myLDA.predict(newtest)
    # Local import keeps the dependency confined to this diagnostic helper;
    # renamed from the misleading alias ``f``.
    import sklearn.metrics as metrics
    print(metrics.confusion_matrix(digits.test_Labels, labels))
def test_qda_priors():
    """A near-zero prior on the first class should shift predictions toward
    the second class, increasing the number of class-2 predictions."""
    clf = QuadraticDiscriminantAnalysis()
    y_pred = clf.fit(X6, y6).predict(X6)
    n_pos = np.sum(y_pred == 2)

    neg = 1e-10  # effectively zero prior for the first class
    clf = QuadraticDiscriminantAnalysis(priors=np.array([neg, 1 - neg]))
    y_pred = clf.fit(X6, y6).predict(X6)
    n_pos2 = np.sum(y_pred == 2)

    # Plain assert for consistency with the other QDA tests in this file;
    # the nose-style assert_greater helper is deprecated.
    assert n_pos2 > n_pos
def QD(pth):
    """Train a QDA classifier on bag-of-features vectors and pickle it.

    Loads the training features from ``pth``, standardizes them, fits a
    QuadraticDiscriminantAnalysis model, dumps (model, classes, scaler) to
    ``<pth>/qd-bof.pkl`` and runs the shared test harness with the "qd-"
    prefix.
    """
    train_desc = np.load(pth + '/training_features.npy')

    # NOTE(review): the original computed a tf-idf vector here from the
    # module-level ``image_paths`` but never used it; that dead code was
    # removed.

    # Scale the word histograms to zero mean / unit variance.
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)

    modelQD = QuadraticDiscriminantAnalysis()
    modelQD.fit(train_desc, np.array(train_labels))
    joblib.dump((modelQD, img_classes, stdSlr), pth + "/qd-bof.pkl", compress=3)
    test(pth, "qd-")
def get_QDA(Xtrain, Ytrain, Xtest=None, Ytest=None, verbose=0):
    """Fit a QDA classifier and optionally report train/test accuracy.

    Xtrain, Ytrain -- training features / labels
    Xtest, Ytest   -- optional held-out set, scored only when provided
    verbose        -- when 1, print the accuracies
    Returns the fitted QDA classifier.
    """
    qda = QDA()
    qda.fit(Xtrain, Ytrain)
    scores = np.empty(2)
    if verbose == 1:
        scores[0] = qda.score(Xtrain, Ytrain)
        print('QDA, train: {0:.02f}% '.format(scores[0] * 100))
        # "is not None" replaces the original type(...) != type(None)
        # comparison, which defeats the identity check idiom.
        if Xtest is not None:
            scores[1] = qda.score(Xtest, Ytest)
            print('QDA, test: {0:.02f}% '.format(scores[1] * 100))
    return qda
def crossValidate(attributes, outcomes, foldCount, ownFunction=True):
    """k-fold cross-validation of a Gaussian discriminant classifier.

    attributes  -- feature rows (4 features each; see featLen)
    outcomes    -- class labels, parallel to attributes
    foldCount   -- number of folds
    ownFunction -- True: use the hand-rolled GDA (getParams/gdaNDEstimate);
                   False: use sklearn's QuadraticDiscriminantAnalysis
    Returns per-fold lists of accuracy, precision, recall, F-measure, AUC
    as produced by getMetrics.
    """
    presList = []
    recallList = []
    accrList = []
    fMeasList = []
    aucList = []
    testingEstimate = []
    otcmVal = list(set(outcomes))
    params = {}
    featLen = 4  # fixed feature dimensionality expected by getParams
    attrFolds = getFolds(attributes, foldCount)
    otcmFolds = getFolds(outcomes, foldCount)
    testDataList = copy.copy(attrFolds)
    testOtcmList = copy.copy(otcmFolds)
    for itr in range(foldCount):
        # Every fold except itr forms the training set.
        trainDataList = []
        trainOtcmList = []
        for intitr in range(foldCount):
            if intitr != itr:
                trainDataList.append(attrFolds[intitr])
                trainOtcmList.append(otcmFolds[intitr])
        trainDataArr = np.array(trainDataList).reshape(-1, featLen)
        trainOtcmArr = np.array(trainOtcmList).reshape(-1)
        testDataArr = np.array(testDataList[itr]).reshape(-1, featLen)
        testOtcmArr = np.array(testOtcmList[itr]).reshape(-1)
        if ownFunction:
            params = getParams(trainDataArr, trainOtcmArr, otcmVal, featLen)
            testingEstimate = gdaNDEstimate(testDataArr, params, otcmVal)
        else:
            clf = QuadraticDiscriminantAnalysis()
            clf.fit(trainDataArr, trainOtcmArr)
            # The original also predicted on the training data here, but
            # never used the result; that dead call has been removed.
            testingEstimate = clf.predict(testDataArr)
        # Plot only once, and only for the binary case.
        if itr == 0 and len(otcmVal) == 2:
            addTitle = "Own" if ownFunction else "Inbuilt"
            metric = getMetrics(testOtcmArr, testingEstimate, otcmVal,
                                showPlot=True,
                                title="GDA2D Versicolor,Virginica - %s" % addTitle)
        else:
            metric = getMetrics(testOtcmArr, testingEstimate, otcmVal)
        accrList.append(metric[0])
        presList.append(metric[1])
        recallList.append(metric[2])
        fMeasList.append(metric[3])
        aucList.append(metric[4])
    return accrList, presList, recallList, fMeasList, aucList
def train_DA(self, X, y, lda_comp, qda_reg):
    '''
    Input:
        qda_reg - reg_param for QDA
        lda_comp - n_components for LDA
        X - data matrix (train_num, feat_num)
        y - target labels matrix (train_num, label_num)
    Output:
        best_clf - best classifier trained (QDA/LDA)
        best_score - CV score of best classifier

    Find best DA classifier via 10-fold cross-validation.
    '''
    n_samples, n_feat = X.shape
    cv_folds = 10
    kf = KFold(n_samples, cv_folds, shuffle=False)
    lda = LinearDiscriminantAnalysis(n_components=lda_comp)
    qda = QuadraticDiscriminantAnalysis(reg_param=qda_reg)
    score_total_lda = 0  # running total of metric score over all cv runs
    score_total_qda = 0  # running total of metric score over all cv runs
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        lda.fit(X_train, y_train)
        cv_pred_lda = lda.predict(X_test)
        # NOTE(review): eval() on self.metric is fragile; kept for
        # interface compatibility with the rest of the class.
        score_lda = eval(self.metric + '(y_test[:,None], cv_pred_lda[:,None], "' + self.task + '")')
        score_total_lda += score_lda

        qda.fit(X_train, y_train)
        cv_pred_qda = qda.predict(X_test)
        # BUG FIX: the QDA score was computed from the *LDA* predictions
        # (cv_pred_lda), so the model comparison below was meaningless.
        score_qda = eval(self.metric + '(y_test[:,None], cv_pred_qda[:,None], "' + self.task + '")')
        score_total_qda += score_qda

    score_lda = score_total_lda / cv_folds
    score_qda = score_total_qda / cv_folds

    # Keep the best one, refit on all the data.
    if score_qda > score_lda:
        qda.fit(X, y)
        return qda, score_qda
    else:
        lda.fit(X, y)
        return lda, score_lda
class QuadraticDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Quadratic Discriminant Analysis predictor: wraps sklearn's QDA and
    returns class-probability predictions bundled into a DataFrame.
    '''

    def __init__(self):
        self.clf = QuadraticDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        # Delegate straight to the underlying sklearn estimator.
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        # Emit class probabilities (not hard labels), packaged by the
        # base-class helper.
        probabilities = self.clf.predict_proba(X_test)
        return self.bundle_predictions(probabilities)

    def get_k_best_k(self):
        # Number of features to keep when SelectKBest is applied upstream.
        return 4
def test_qda_regularization():
    """QDA needs reg_param > 0 when a variable is constant or a class has
    fewer samples than features."""
    # The default reg_param=0. misclassifies when a variable is constant.
    clf = QuadraticDiscriminantAnalysis()
    with ignore_warnings():
        y_pred = clf.fit(X2, y6).predict(X2)
    assert np.any(y_pred != y6)

    # A little regularization fixes the problem.
    clf = QuadraticDiscriminantAnalysis(reg_param=0.01)
    with ignore_warnings():
        clf.fit(X2, y6)
    assert_array_equal(clf.predict(X2), y6)

    # Case n_samples_in_a_class < n_features.
    clf = QuadraticDiscriminantAnalysis(reg_param=0.1)
    with ignore_warnings():
        clf.fit(X5, y5)
    assert_array_equal(clf.predict(X5), y5)
def create_symbol_forecast_model(self):
    """Train a QDA direction-forecast model on lagged S&P500 returns.

    Builds a 5-lag series for the first symbol, drops the first few days
    whose lags are NaN, fits QDA on ("Lag1", "Lag2") -> "Direction" and
    returns the fitted model.
    """
    # Lagged series of the S&P500 US stock market index.
    lagged = create_lagged_series(
        self.symbol_list[0], self.model_start_date, self.model_end_date, lags=5
    )

    # The first rows of the lagged series contain NaNs; keep only dates
    # strictly after this cutoff.
    cutoff = self.model_start_date + relativedelta(days=3)
    keep = lagged.index > cutoff

    # Prior two days of returns as predictors, direction as response.
    features = lagged.loc[keep, ["Lag1", "Lag2"]]
    response = lagged.loc[keep, "Direction"]
    logging.debug(lagged[keep])

    model = QDA()
    model.fit(features, response)
    return model
def set_up_classifier(self):
    """Fit a QDA classifier mapping the intraday move (Close - Open) to a
    bucketed 5-period forward return class in {-3, ..., 3}.

    Returns the fitted QuadraticDiscriminantAnalysis model.
    """
    historic_data = self.get_data()

    # Identify a trend via the 5-period log return of the close.
    historic_data['return_5_timeframe'] = np.log(
        historic_data['Close'] / historic_data['Close'].shift(5)) * 100
    historic_data.fillna(0.0001, inplace=True)
    historic_data['vol_normalised'] = normalise_data(historic_data['Volume'])

    def bucket_return(x, col):
        """Bucket the return into -3..3.  BUG FIX: the original chain of
        open-interval ifs left exact boundary values (+/-0.02, +/-0.1)
        unbucketed, silently mapping them to 0; boundaries now belong to
        the inner bucket."""
        r = x[col]
        if r > 0.1:
            return 3
        elif r > 0.02:
            return 2
        elif r > 0:
            return 1
        elif r < -0.1:
            return -3
        elif r < -0.02:
            return -2
        elif r < 0:
            return -1
        return 0

    historic_data['Return'] = historic_data.apply(
        bucket_return, axis=1, args=['return_5_timeframe'])
    historic_data['Move'] = historic_data['Close'] - historic_data['Open']

    # X as predictor values, with Y as the response.
    x = historic_data[["Move"]]
    y = historic_data["Return"]

    model = QuadraticDiscriminantAnalysis()
    model.fit(x, y)
    return model
def test_qda():
    """QDA classification on a toy dataset.

    Checks fit/predict round-trips, 1D input support, consistency of the
    probability and log-probability estimates, an inseparable labelling,
    and rejection of degenerate classes."""
    clf = QuadraticDiscriminantAnalysis()
    assert_array_equal(clf.fit(X6, y6).predict(X6), y6)

    # Assure that it works with 1D data.
    pred_1d = clf.fit(X7, y6).predict(X7)
    assert_array_equal(pred_1d, y6)

    # Probability estimates agree with the hard predictions...
    proba = clf.predict_proba(X7)
    assert_array_equal((proba[:, 1] > 0.5) + 1, y6)
    # ...and the log-probabilities are consistent with them.
    log_proba = clf.predict_log_proba(X7)
    assert_array_almost_equal(np.exp(log_proba), proba, 8)

    # QDA shouldn't be able to separate this labelling.
    assert np.any(clf.fit(X6, y7).predict(X6) != y7)

    # Classes should have at least 2 elements.
    assert_raises(ValueError, clf.fit, X6, y4)
def test():
    """Plot LDA vs QDA decision data and covariance ellipses for both the
    fixed-covariance and the varying-covariance toy datasets."""
    for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
        # Linear Discriminant Analysis
        lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
        y_pred = lda.fit(X, y).predict(X)
        splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
        plot_lda_cov(lda, splot)
        plt.axis('tight')

        # Quadratic Discriminant Analysis.  BUG FIX: the keyword was
        # ``store_covariances`` (plural), which was renamed to
        # ``store_covariance`` in scikit-learn 0.19 and removed in 0.23;
        # the old spelling raises TypeError on current releases.
        qda = QuadraticDiscriminantAnalysis(store_covariance=True)
        y_pred = qda.fit(X, y).predict(X)
        splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
        plot_qda_cov(qda, splot)
        plt.axis('tight')
    plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
    plt.show()
o_dim=20, g_size=2) print("Time elapsed", start - time.time()) print("Dimension reduced shape", Train.shape) ## Linear Discriminant Analysis from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True) y_pred = lda.fit(Train, train_labels).predict(Train) print("1 -- LDA") from sklearn.metrics import accuracy_score print(accuracy_score(train_labels, y_pred, normalize=True, sample_weight=None)) qda = QuadraticDiscriminantAnalysis(store_covariances=True) y_pred = qda.fit(Train, train_labels).predict(Train) print("2 -- QDA") from sklearn.metrics import accuracy_score print(accuracy_score(train_labels, y_pred, normalize=True, sample_weight=None)) from sklearn.naive_bayes import GaussianNB print("3 -- GaussianNB") gnb = GaussianNB() y_pred = gnb.fit(Train, train_labels).predict(Train) from sklearn.metrics import accuracy_score print(accuracy_score(train_labels, y_pred, normalize=True, sample_weight=None)) from sklearn.svm import SVC print("4 -- SVC") clf = SVC(kernel="linear", C=0.025) y_pred = clf.fit(Train, train_labels).predict(Train)
def discriminatePlot(X, y, cVal, titleStr=''):
    """Robust wrapper for discriminant analysis: LDA, QDA and Random Forest.

    Drops classes with fewer than MINCOUNT samples, PCA-projects the data
    so covariance matrices can be fitted, cross-validates all three
    classifiers, plots their decision surfaces in the discriminant space
    and prints a summary.

    X        -- np array, n rows x p parameters
    y        -- group labels, n rows
    cVal     -- rgb color code for each data point (same within a group)
    titleStr -- title for plots
    Returns: ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore,
    rfScoreSE, nClasses (all -1 on failure).

    NOTE: Python 2 print statements were converted to print() calls.
    """
    # Global Parameters
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Keep only classes with at least MINCOUNT samples.
    classes, classesCount = np.unique(y, return_counts=True)
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])
    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]
    classes, classesCount = np.unique(yGood, return_counts=True)
    nClasses = classes.size  # Number of classes or groups

    # Do we have enough data?
    if nClasses < 2:
        print('Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT))
        return -1, -1, -1, -1, -1, -1, -1
    cvFolds = min(min(classesCount), CVFOLDS)
    if cvFolds < CVFOLDS:
        print('Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS))

    # Data size and color values
    nD = XGood.shape[1]  # number of features in X
    nX = XGood.shape[0]  # number of data points in X
    cClasses = []  # Color code for each class
    for cl in classes:
        icl = (yGood == cl).nonzero()[0][0]
        cClasses.append(np.append(cValGood[icl], 1.0))
    cClasses = np.asarray(cClasses)
    myPrior = np.ones(nClasses) * (1.0 / nClasses)

    # PCA for dimensionality reduction so the covariance can be fitted.
    nDmax = int(np.fix(np.sqrt(nX / 5)))
    if nDmax < nD:
        print('Warning: Insufficient data for', nD, 'parameters. PCA projection to', nDmax, 'dimensions.')
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print('Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_) * 100.0))

    # Initialise Classifiers
    ldaMod = LDA(n_components=min(nDmax, nClasses - 1), priors=myPrior,
                 shrinkage=None, solver='svd')
    qdaMod = QDA(priors=myPrior)
    rfMod = RF()  # by default assumes equal weights

    # Cross-validation to get performance of classifiers.
    ldaScores = np.zeros(cvFolds)
    qdaScores = np.zeros(cvFolds)
    rfScores = np.zeros(cvFolds)
    skf = cross_validation.StratifiedKFold(yGood, cvFolds)
    iskf = 0
    for train, test in skf:
        # Enforce the MINCOUNT in each class for Training.
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array(
            [b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specify the training data set, the number of groups and priors.
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]
        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data.
        if ntrainClasses < 2:
            continue
        goodInd = np.array([b in trainClasses for b in yGood[test]])
        if goodInd.size == 0:
            continue

        # Fit the data with per-fold uniform priors.
        trainPriors = np.ones(ntrainClasses) * (1.0 / ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)
        ldaScores[iskf] = ldaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        qdaScores[iskf] = qdaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        rfScores[iskf] = rfMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        iskf += 1

    if iskf != cvFolds:
        cvFolds = iskf
        # BUG FIX: the original called scores.reshape(cvFolds), which
        # raises ValueError for a shorter length and discards its result
        # anyway; truncate so skipped folds don't drag mean/std to zero.
        ldaScores = ldaScores[:cvFolds]
        qdaScores = qdaScores[:cvFolds]
        rfScores = rfScores[:cvFolds]

    # Refit with all the data for the plots.
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)
    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print('Error in ldaPlot: labels do not match')

    # Print the coefficients of first 3 DFA
    print('LDA Weights:')
    print('DFA1:', ldaMod.coef_[0, :])
    if nClasses > 2:
        print('DFA2:', ldaMod.coef_[1, :])
    if nClasses > 3:
        print('DFA3:', ldaMod.coef_[2, :])

    # Obtain fits in this rotated space for display purposes.
    ldaMod.fit(Xrr, yGood)
    qdaMod.fit(Xrr, yGood)
    rfMod.fit(Xrr, yGood)
    XrrMean = Xrr.mean(0)

    # Make a mesh for plotting.
    x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
    xm1 = np.reshape(x1, -1)
    xm2 = np.reshape(x2, -1)
    nxm = np.size(xm1)
    Xm = np.zeros((nxm, Xrr.shape[1]))
    Xm[:, 0] = xm1
    if Xrr.shape[1] > 1:
        Xm[:, 1] = xm2
    for ix in range(2, Xrr.shape[1]):
        # Dimensions beyond the first two are pinned at their mean value.
        Xm[:, ix] = np.squeeze(np.ones((nxm, 1))) * XrrMean[ix]
    XmcLDA = np.zeros((nxm, 4))  # RGBA values for color for LDA
    XmcQDA = np.zeros((nxm, 4))  # RGBA values for color for QDA
    XmcRF = np.zeros((nxm, 4))   # RGBA values for color for RF

    # Predict values on mesh for plotting based on the first two DFs.
    yPredLDA = ldaMod.predict_proba(Xm)
    yPredQDA = qdaMod.predict_proba(Xm)
    yPredRF = rfMod.predict_proba(Xm)

    # Transform the LDA predictions in color codes.
    maxLDA = yPredLDA.max()
    for ix in range(nxm):
        cWeight = yPredLDA[ix, :]  # Prob for all classes
        cWinner = (cWeight == cWeight.max()).astype('float')  # Winner takes all
        XmcLDA[ix, :] = np.dot(cWinner, cClasses)
        XmcLDA[ix, 3] = cWeight.max() / maxLDA

    # Plot the surface of probability (LDA panel).
    plt.figure(facecolor='white', figsize=(10, 3))
    plt.subplot(131)
    Zplot = XmcLDA.reshape(np.shape(x1)[0], np.shape(x1)[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower',
               interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:, 0], Xrr[:, 1], c=cValGood, s=40, zorder=1)
    else:
        plt.scatter(Xrr, (np.random.rand(Xrr.size) - 0.5) * 12.0,
                    c=cValGood, s=40, zorder=1)
    plt.title('%s: LDA pC %.0f %%' % (titleStr, (ldaScores.mean() * 100.0)))
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')

    # Transform the QDA predictions in color codes.
    maxQDA = yPredQDA.max()
    for ix in range(nxm):
        cWeight = yPredQDA[ix, :]  # Prob for all classes
        cWinner = (cWeight == cWeight.max()).astype('float')  # Winner takes all
        XmcQDA[ix, :] = np.dot(cWinner, cClasses)
        XmcQDA[ix, 3] = cWeight.max() / maxQDA

    # Plot the surface of probability (QDA panel).
    plt.subplot(132)
    Zplot = XmcQDA.reshape(np.shape(x1)[0], np.shape(x1)[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower',
               interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:, 0], Xrr[:, 1], c=cValGood, s=40, zorder=1)
    else:
        plt.scatter(Xrr, (np.random.rand(Xrr.size) - 0.5) * 12.0,
                    c=cValGood, s=40, zorder=1)
    plt.title('%s: QDA pC %.0f %%' % (titleStr, (qdaScores.mean() * 100.0)))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))

    # Transform the RF predictions in color codes.
    maxRF = yPredRF.max()
    for ix in range(nxm):
        cWeight = yPredRF[ix, :]  # Prob for all classes
        cWinner = (cWeight == cWeight.max()).astype('float')  # Winner takes all
        # Weighted colors do not work well here, so winner takes all.
        XmcRF[ix, :] = np.dot(cWinner, cClasses)
        XmcRF[ix, 3] = cWeight.max() / maxRF

    # Plot the surface of probability (RF panel).
    plt.subplot(133)
    Zplot = XmcRF.reshape(np.shape(x1)[0], np.shape(x1)[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower',
               interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:, 0], Xrr[:, 1], c=cValGood, s=40, zorder=1)
    else:
        plt.scatter(Xrr, (np.random.rand(Xrr.size) - 0.5) * 12.0,
                    c=cValGood, s=40, zorder=1)
    plt.title('%s: RF pC %.0f %%' % (titleStr, (rfScores.mean() * 100.0)))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    plt.show()

    # Results
    ldaScore = ldaScores.mean() * 100.0
    qdaScore = qdaScores.mean() * 100.0
    rfScore = rfScores.mean() * 100.0
    ldaScoreSE = ldaScores.std() * 100.0
    qdaScoreSE = qdaScores.std() * 100.0
    rfScoreSE = rfScores.std() * 100.0
    # BUG FIX: the formatting operands were outside the print() call
    # (``print("...") % (...)``), a TypeError in Python 3.
    print("Number of classes %d. Chance level %.2f %%" % (nClasses, 100.0 / nClasses))
    print("%s LDA: %.2f (+/- %0.2f) %%" % (titleStr, ldaScore, ldaScoreSE))
    print("%s QDA: %.2f (+/- %0.2f) %%" % (titleStr, qdaScore, qdaScoreSE))
    print("%s RF: %.2f (+/- %0.2f) %%" % (titleStr, rfScore, rfScoreSE))
    return ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
def train(self):
    """
    Scale the Dataframe, depends on the model, encode the features,
    Wavelet Transformation, quadratic discriminant analysis, chunk our
    data into X and y samples, Fit the Neural Network.
    """
    df = self.df
    self.scaler = MinMaxScaler()
    if self.model == 'encoder':
        # Scale, then learn a compressed representation of the features.
        self.scaler.fit(df)
        df[df.columns] = self.scaler.transform(df)
        self.autoencoder = autoencoder(self.features)
        X = np.array(self.df.drop(columns='up'))
        X = X.reshape(len(X), 1, self.features)
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                           patience=20, restore_best_weights=True)
        self.autoencoder.fit(X, X, validation_split=0.3, callbacks=[es],
                             epochs=1000, batch_size=64, shuffle=True)
        X_encode = self.autoencoder.predict(X)
        X_encode.shape = (X_encode.shape[0], X_encode.shape[2])
        new_df = pd.DataFrame(X_encode)
        df.reset_index(inplace=True)
        new_df['up'] = df['up']
        X_train, y_train = get_X_y(new_df, self.n_days, self.length, self.style)
    elif self.model == 'wav':
        # Denoise each feature column with a wavelet transform first.
        for col_ in df.drop(columns='up').columns:
            df[col_] = wavelet_transform(df[col_])[:len(df)]
        self.scaler.fit(df)
        df[df.columns] = self.scaler.transform(df)
        X_train, y_train = get_X_y(df, self.n_days, self.length, self.style)
    elif self.model == 'qda':  # BUG FIX: was "elif self.model = 'qda'", a SyntaxError
        self.scaler.fit(df)
        df[df.columns] = self.scaler.transform(df)
        X_train, y_train = get_X_y(df, self.n_days, self.length, self.style)
        clf = QuadraticDiscriminantAnalysis()
        # Flatten each sample so QDA sees one row per sample.
        X_train = [sample.ravel() for sample in X_train]
        clf.fit(X_train, y_train)
        # Replace each sample by its per-timestep QDA class predictions.
        train = []
        for s in range(len(X_train)):
            quad_dis = clf.predict(X_train[s].reshape(X_train[s].shape[0], 1))
            train.append(quad_dis.reshape(self.length, self.features))
        X_train = train
    # NOTE(review): the docstring mentions fitting the Neural Network, but
    # this chunk ends here — the method may be truncated upstream.
colsample_bytree=1.0, max_depth=5, gamma=1, min_child_weight=1) L_model7 = KNeighborsClassifier(2) R_model1 = SGDClassifier(loss='squared_hinge', penalty='none', alpha=0.001) R_model2 = DecisionTreeClassifier(max_depth=17, min_samples_split=10) R_model3 = AdaBoostClassifier(n_estimators=50, learning_rate=1) R_model4 = GaussianNB() R_model5 = QuadraticDiscriminantAnalysis() R_model6 = xgb.XGBClassifier(random_state=1, learning_rate=0.01, subsample=0.6, colsample_bytree=1.0, max_depth=5, gamma=1, min_child_weight=1) R_model7 = KNeighborsClassifier(2) L_model1.fit(X_train_left, y_train_left) L_model2.fit(X_train_left, y_train_left) L_model3.fit(X_train_left, y_train_left) L_model4.fit(X_train_left, y_train_left) L_model5.fit(X_train_left, y_train_left) L_model6.fit(X_train_left, y_train_left) L_model7.fit(X_train_left, y_train_left) R_model1.fit(X_train_right, y_train_right) R_model2.fit(X_train_right, y_train_right) R_model3.fit(X_train_right, y_train_right) R_model4.fit(X_train_right, y_train_right) R_model5.fit(X_train_right, y_train_right) R_model6.fit(X_train_right, y_train_right) R_model7.fit(X_train_right, y_train_right) joblib.dump(L_model1, 'model/L_model1.pkl') joblib.dump(L_model2, 'model/L_model2.pkl') joblib.dump(L_model3, 'model/L_model3.pkl') joblib.dump(L_model4, 'model/L_model4.pkl') joblib.dump(L_model5, 'model/L_model5.pkl')
rfc.score(X_test, y_test))) # In[37]: ABC = AdaBoostClassifier(n_estimators=100) ABC.fit(X_train, y_train) print('Accuracy of ABC classifier on training set: {:.2f}'.format( ABC.score(X_train, y_train))) print('Accuracy of ABC classifier on test set: {:.2f}'.format( ABC.score(X_test, y_test))) # In[38]: from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis Qda = QuadraticDiscriminantAnalysis() Qda.fit(X_train, y_train) print('Accuracy of QDA classifier on training set: {:.2f}'.format( Qda.score(X_train, y_train))) print('Accuracy of QDA classifier on test set: {:.2f}'.format( Qda.score(X_test, y_test))) # In[39]: from sklearn.gaussian_process import GaussianProcessClassifier GPC = GaussianProcessClassifier() GPC.fit(X_train, y_train) print('Accuracy of GPC classifier on training set: {:.2f}'.format( GPC.score(X_train, y_train))) print('Accuracy of GPC classifier on test set: {:.2f}'.format( GPC.score(X_test, y_test)))
# NOTE(review): these two calls reference ``splot`` and appear to be the
# tail of a plot_data(...) helper defined before this chunk.
splot.set_xticks(())
splot.set_yticks(())


def plot_lda_cov(lda, splot):
    # LDA shares a single covariance matrix across classes: draw the same
    # ellipse shape at each of the two class means.
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    # QDA stores one covariance per class (``covariances_``): draw each
    # class's own ellipse at its mean.
    plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue')


###############################################################################
# Fit and plot LDA vs QDA on both toy datasets.
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis()
    # NOTE(review): QuadraticDiscriminantAnalysis.fit() takes no
    # ``store_covariances`` keyword — this raises TypeError on current
    # scikit-learn; the flag belongs on the constructor (named
    # ``store_covariance`` since 0.19).  Confirm the intended sklearn
    # version before relying on this script.
    y_pred = qda.fit(X, y, store_covariances=True).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
plt.show()
# NOTE(review): this chunk appears mid-script — ``train``, ``test``, ``X``
# and ``Y`` are defined earlier, outside this view.
Xt = []   # test feature rows
Yt = []   # test labels
acc = []  # accuracy percentages accumulated across runs
# Each row holds features in all-but-last columns and the label last.
for i in train:
    X.append(i[:len(i) - 1])
    Y.append(i[len(i) - 1:][0])
for i in test:
    Xt.append(i[:len(i) - 1])
    Yt.append(i[len(i) - 1:][0])
print(len(Y))
print(len(Yt))
# Create the classifier.  (The original comments said "svm Classifier" /
# "Linear Kernel" — stale copy-paste; this is actually QDA.)
clf = QuadraticDiscriminantAnalysis()
# Train the model using the training sets
clf.fit(X, Y)
# Predict the response for test dataset
y_pred = clf.predict(Xt)
# Count correct predictions to compute accuracy by hand.
count = 0
j = 0
for i in y_pred:
    if int(i) == int(Yt[j]):
        count = count + 1
    j += 1
print(count)
print((count / len(Yt)) * 100)
acc.append((count / len(Yt)) * 100)
print("=== Confusion Matrix ===")
print(confusion_matrix(Yt, y_pred))
print('\n')
def QuadDA(X_train, y_train, X_test, y_test):
    """Fit quadratic discriminant analysis on the training split and
    return its mean accuracy on the test split."""
    model = QDA()
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)
# NOTE(review): this chunk appears mid-script — ``lda`` and the
# features/labels arrays are created earlier, outside this view.
lda.fit(features_train2, labels_train2)
predict2 = lda.predict(features_test2)
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train2, lda.predict(features_train2)))
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test2, predict2))

#%% QDA for comparison Part 1
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
qda = QDA()
qda.fit(features_train1, labels_train1)
predict1 = qda.predict(features_test1)
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train1, qda.predict(features_train1)))
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test1, predict1))

#%% QDA for comparison Part 2
qda = QDA()
qda.fit(features_train2, labels_train2)
predict2 = qda.predict(features_test2)
def discriminatePlot(X, y, cVal, titleStr=''):
    # Frederic's Robust Wrapper for discriminant analysis function. Performs lda, qda and RF afer error checking,
    # Generates nice plots and returns cross-validated
    # performance, stderr and base line.
    # X np array n rows x p parameters
    # y group labels n rows
    # rgb color code for each data point - should be the same for each data beloging to the same group
    # titleStr title for plots
    # returns: ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
    #
    # NOTE(review): this is Python 2 code (print statements) against the old
    # sklearn `cross_validation` module — it will not run on Python 3 / modern
    # sklearn without porting.

    # Global Parameters
    CVFOLDS = 10
    MINCOUNT = 10
    MINCOUNTTRAINING = 5

    # Initialize Variables and clean up data: drop classes with fewer than
    # MINCOUNT members, keeping only rows whose label survives the filter.
    classes, classesCount = np.unique(y, return_counts = True)  # Classes to be discriminated should be same as ldaMod.classes_
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])
    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]
    classes, classesCount = np.unique(yGood, return_counts = True)
    nClasses = classes.size  # Number of classes or groups

    # Do we have enough data?
    if (nClasses < 2):
        print 'Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT)
        return -1, -1, -1, -1 , -1, -1, -1
    cvFolds = min(min(classesCount), CVFOLDS)
    if (cvFolds < CVFOLDS):
        print 'Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS)

    # Data size and color values
    nD = XGood.shape[1]  # number of features in X
    nX = XGood.shape[0]  # number of data points in X
    cClasses = []  # Color code for each class (RGBA; alpha appended below)
    for cl in classes:
        icl = (yGood == cl).nonzero()[0][0]
        cClasses.append(np.append(cValGood[icl],1.0))
    cClasses = np.asarray(cClasses)
    myPrior = np.ones(nClasses)*(1.0/nClasses)  # flat prior over classes

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    # Heuristic: at most sqrt(n/5) dimensions so QDA has enough samples per parameter.
    nDmax = int(np.fix(np.sqrt(nX/5)))
    if nDmax < nD:
        print 'Warning: Insufficient data for', nD, 'parameters. PCA projection to', nDmax, 'dimensions.'
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print 'Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_)*100.0)

    # Initialise Classifiers
    ldaMod = LDA(n_components = min(nDmax,nClasses-1), priors = myPrior, shrinkage = None, solver = 'svd')
    qdaMod = QDA(priors = myPrior)
    rfMod = RF()  # by default assumes equal weights

    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaScores = np.zeros(cvFolds)
    qdaScores = np.zeros(cvFolds)
    rfScores = np.zeros(cvFolds)
    skf = cross_validation.StratifiedKFold(yGood, cvFolds)
    iskf = 0
    for train, test in skf:
        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array([b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specity the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]
        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue
        goodInd = np.array([b in trainClasses for b in yGood[test]])
        if (goodInd.size == 0):
            continue

        # Fit the data with per-fold flat priors over the surviving classes.
        trainPriors = np.ones(ntrainClasses)*(1.0/ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)
        ldaScores[iskf] = ldaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        qdaScores[iskf] = qdaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        rfScores[iskf] = rfMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        iskf += 1

    if (iskf != cvFolds):
        cvFolds = iskf
        # NOTE(review): ndarray.reshape returns a new array; these calls do
        # not truncate the score arrays in place — later .mean()/.std() still
        # include the unused zero entries. Confirm intended behavior.
        ldaScores.reshape(cvFolds)
        qdaScores.reshape(cvFolds)
        rfScores.reshape(cvFolds)

    # Refit with all the data for the plots
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)  # project onto discriminant axes
    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print 'Error in ldaPlot: labels do not match'

    # Print the coefficients of first 3 DFA
    print 'LDA Weights:'
    print 'DFA1:', ldaMod.coef_[0,:]
    if nClasses > 2:
        print 'DFA2:', ldaMod.coef_[1,:]
    if nClasses > 3:
        print 'DFA3:', ldaMod.coef_[2,:]

    # Obtain fits in this rotated space for display purposes
    ldaMod.fit(Xrr, yGood)
    qdaMod.fit(Xrr, yGood)
    rfMod.fit(Xrr, yGood)
    XrrMean = Xrr.mean(0)

    # Make a mesh for plotting over the first two discriminant functions;
    # any further dimensions are held at their mean value.
    x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
    xm1 = np.reshape(x1, -1)
    xm2 = np.reshape(x2, -1)
    nxm = np.size(xm1)
    Xm = np.zeros((nxm, Xrr.shape[1]))
    Xm[:,0] = xm1
    if Xrr.shape[1] > 1 :
        Xm[:,1] = xm2
    for ix in range(2,Xrr.shape[1]):
        Xm[:,ix] = np.squeeze(np.ones((nxm,1)))*XrrMean[ix]
    XmcLDA = np.zeros((nxm, 4))  # RGBA values for color for LDA
    XmcQDA = np.zeros((nxm, 4))  # RGBA values for color for QDA
    XmcRF = np.zeros((nxm, 4))  # RGBA values for color for RF

    # Predict values on mesh for plotting based on the first two DFs
    yPredLDA = ldaMod.predict_proba(Xm)
    yPredQDA = qdaMod.predict_proba(Xm)
    yPredRF = rfMod.predict_proba(Xm)

    # Transform the predictions in color codes
    maxLDA = yPredLDA.max()
    for ix in range(nxm) :
        cWeight = yPredLDA[ix,:]  # Prob for all classes
        cWinner = ((cWeight == cWeight.max()).astype('float'))  # Winner takes all
        # XmcLDA[ix,:] = np.dot(cWeight, cClasses)/nClasses
        XmcLDA[ix,:] = np.dot(cWinner, cClasses)
        XmcLDA[ix,3] = cWeight.max()/maxLDA  # alpha encodes confidence

    # Plot the surface of probability
    plt.figure(facecolor='white', figsize=(10,3))
    plt.subplot(131)
    Zplot = XmcLDA.reshape(np.shape(x1)[0], np.shape(x1)[1],4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        # 1-D projection: jitter the y coordinate for visibility.
        plt.scatter(Xrr,(np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1)
    plt.title('%s: LDA pC %.0f %%' % (titleStr, (ldaScores.mean()*100.0)))
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')

    # Transform the predictions in color codes
    maxQDA = yPredQDA.max()
    for ix in range(nxm) :
        cWeight = yPredQDA[ix,:]  # Prob for all classes
        cWinner = ((cWeight == cWeight.max()).astype('float'))  # Winner takes all
        # XmcLDA[ix,:] = np.dot(cWeight, cClasses)/nClasses
        XmcQDA[ix,:] = np.dot(cWinner, cClasses)
        XmcQDA[ix,3] = cWeight.max()/maxQDA

    # Plot the surface of probability
    plt.subplot(132)
    Zplot = XmcQDA.reshape(np.shape(x1)[0], np.shape(x1)[1],4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        plt.scatter(Xrr,(np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1)
    plt.title('%s: QDA pC %.0f %%' % (titleStr, (qdaScores.mean()*100.0)))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))

    # Transform the predictions in color codes
    maxRF = yPredRF.max()
    for ix in range(nxm) :
        cWeight = yPredRF[ix,:]  # Prob for all classes
        cWinner = ((cWeight == cWeight.max()).astype('float'))  # Winner takes all
        # XmcLDA[ix,:] = np.dot(cWeight, cClasses)/nClasses  # Weighted colors does not work
        XmcRF[ix,:] = np.dot(cWinner, cClasses)
        XmcRF[ix,3] = cWeight.max()/maxRF

    # Plot the surface of probability
    plt.subplot(133)
    Zplot = XmcRF.reshape(np.shape(x1)[0], np.shape(x1)[1],4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        plt.scatter(Xrr,(np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1)
    plt.title('%s: RF pC %.0f %%' % (titleStr, (rfScores.mean()*100.0)))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    plt.show()

    # Results: mean and std of per-fold accuracy, in percent.
    ldaScore = ldaScores.mean()*100.0
    qdaScore = qdaScores.mean()*100.0
    rfScore = rfScores.mean()*100.0
    ldaScoreSE = ldaScores.std() * 100.0
    qdaScoreSE = qdaScores.std() * 100.0
    rfScoreSE = rfScores.std() * 100.0
    print ("Number of classes %d. Chance level %.2f %%") % (nClasses, 100.0/nClasses)
    print ("%s LDA: %.2f (+/- %0.2f) %%") % (titleStr, ldaScore, ldaScoreSE)
    print ("%s QDA: %.2f (+/- %0.2f) %%") % (titleStr, qdaScore, qdaScoreSE)
    print ("%s RF: %.2f (+/- %0.2f) %%") % (titleStr, rfScore, rfScoreSE)
    return ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
# Tail of an earlier evaluation loop: record accuracy from printMetrics.
# (printMetrics, actual, predicted, scores, df are defined upstream.)
accuracy = printMetrics(actual, predicted)
scores.append(accuracy)
print(scores)
# NOTE(review): bare `exit` is a no-op expression statement — it does NOT
# terminate the script. `exit()` was probably intended; confirm before fixing.
exit
'''
kfcv = kFoldCV()
kfcv.kfold_evaluate(df)
'''
#use built in QDA from sklearn
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Build a DataFrame of features and append the binary target column.
df_sklearn = pd.DataFrame(data.data, columns=data.feature_names)
df_sklearn['target']=pd.Series(data.target)
x_var = df_sklearn.drop(('target'),axis=1)
y_var = df_sklearn['target']
# Fit QDA on the full dataset and report training error (1 - accuracy).
qda = QuadraticDiscriminantAnalysis()
qda_fit = qda.fit(x_var,y_var)
prediction = qda_fit.predict(x_var)
print(prediction)
print(1-qda.score(x_var,y_var))
# Fit the (previously constructed) LDA model and report train/test metrics.
# NOTE(review): "Accurray" typos below are in runtime output strings and are
# left untouched here.
dismodel.fit(X_train, Y_train)
Y_train_pred = dismodel.predict(X_train)
cmtr = confusion_matrix(Y_train, Y_train_pred)
print("Confusion Matrix Training:\n", cmtr)
acctr = accuracy_score(Y_train, Y_train_pred)
print("Accurray Training:", acctr)
Y_test_pred = dismodel.predict(X_test)
cmte = confusion_matrix(Y_test, Y_test_pred)
print("Confusion Matrix Testing:\n", cmte)
accte = accuracy_score(Y_test, Y_test_pred)
print("Accurray Test:", accte)
# Append a summary row: [model name, train accuracy, test accuracy].
report.loc[len(report)] = ['Linear Discriminant Analysis', acctr, accte]

# Same evaluation for quadratic discriminant analysis.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qdismodel = QuadraticDiscriminantAnalysis()
qdismodel.fit(X_train, Y_train)
Y_train_pred = qdismodel.predict(X_train)
cmtr = confusion_matrix(Y_train, Y_train_pred)
print("Confusion Matrix Training:\n", cmtr)
acctr = accuracy_score(Y_train, Y_train_pred)
print("Accurray Training:", acctr)
Y_test_pred = qdismodel.predict(X_test)
cmte = confusion_matrix(Y_test, Y_test_pred)
print("Confusion Matrix Testing:\n", cmte)
accte = accuracy_score(Y_test, Y_test_pred)
print("Accurray Test:", accte)
report.loc[len(report)] = ['Quadratic Discriminant Analysis', acctr, accte]

##################
# Neural Network #
# get training, validation and test datasets for specified roi training_data, validation_data, test_data = ds.split_data() ########################################################################### # # CREATE MODEL # ########################################################################### # Define the estimator: quadratic discriminant analysis from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis qda = QuadraticDiscriminantAnalysis() qda.fit(training_data[0], training_data[1]) from sklearn.metrics import accuracy_score # record the best result accuracies[i] = accuracy_score(test_data[1], qda.predict(test_data[0])) mean_accuracy = accuracies.mean() print("\n\nmean accuracy: %f" % mean_accuracy) ############################################################################### # # VISUALIZE # ###############################################################################
def traditional_models (X_train, y_train, X_test, y_test, pos_label=None):
    """
    Fits a suite of classical classifiers (logistic regression, KNN, LDA,
    QDA, random forest, AdaBoost, SVM) and evaluates each on the test set.

    :param X_train: Training Set Predictors
    :param X_test: Test Set Predictors
    :param y_train: Training Set response
    :param y_test: Test Set response
    :param pos_label: label treated as positive for ROC curves
    :return: DataFrame indexed by model name with Accuracy, AUC and
             directional accuracy columns
    """
    # Logistic regression: CV over a wide grid of inverse-regularization values.
    cvals = [1e-20, 1e-15, 1e-10, 1e-5, 1e-3, 1e-1, 1, 10, 100, 10000, 100000]
    logregcv = LogisticRegressionCV(Cs=cvals, cv=5)
    logregcv.fit(X_train, y_train)
    yhat = logregcv.predict(X_test)
    logreg_acc = accuracy_score(y_test, yhat)
    logreg_dacc = directional_accuracy(y_test, yhat)
    fpr_log, tpr_log, thresholds = metrics.roc_curve(
        y_test, logregcv.predict_proba(X_test)[:, 1], pos_label=pos_label)
    logreg_auc = auc(fpr_log, tpr_log)

    # knn: pick k among powers of two by cross-validated accuracy.
    ks = [2**x for x in range(2, 8)]
    cv_scores = []
    for k in ks:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, X_train, y_train, cv=5,
                                 scoring="accuracy")
        cv_scores.append(scores.mean())
    opt_k = ks[np.argmax(cv_scores)]
    # print('The optimal value for k is %d, with a score of %.3f.'
    #       % (opt_k, cv_scores[np.argmax(cv_scores)]))
    knn = KNeighborsClassifier(n_neighbors=opt_k)
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    knn.fit(X_train, y_train)
    yhat = knn.predict(X_test)
    knn_acc = accuracy_score(y_test, yhat)
    knn_dacc = directional_accuracy(y_test, yhat)
    # Calculating auc on testset
    fpr_knn, tpr_knn, thresholds = metrics.roc_curve(
        y_test, knn.predict_proba(X_test)[:, 1], pos_label=pos_label)
    knn_auc = auc(fpr_knn, tpr_knn)

    # LDA
    lda = LinearDiscriminantAnalysis()
    scores = cross_val_score(lda, X_train, y_train, cv=5)
    lda.fit(X_train, y_train)
    yhat = lda.predict(X_test)
    lda_acc = accuracy_score(y_test, yhat)
    lda_dacc = directional_accuracy(y_test, yhat)
    # Calculating auc on testset
    fpr_lda, tpr_lda, thresholds = metrics.roc_curve(
        y_test, lda.predict_proba(X_test)[:, 1], pos_label=pos_label)
    lda_auc = auc(fpr_lda, tpr_lda)

    # QDA
    qda = QuadraticDiscriminantAnalysis()
    scores = cross_val_score(qda, X_train, y_train, cv=5)
    qda.fit(X_train, y_train)
    yhat = qda.predict(X_test)
    qda_acc = accuracy_score(y_test, yhat)
    qda_dacc = directional_accuracy(y_test, yhat)
    # Calculating auc on testset
    fpr_qda, tpr_qda, thresholds = metrics.roc_curve(
        y_test, qda.predict_proba(X_test)[:, 1], pos_label=pos_label)
    qda_auc = auc(fpr_qda, tpr_qda)

    # Random Forest: choose the tree count by cross-validation.
    tree_cnts = [2**i for i in range(1, 9)]
    # List to hold the results.
    cv_scores = []
    for tree_cnt in tree_cnts:
        # Train the RF model, note that sqrt(p) is the default
        # number of predictors, so it isn't specified here.
        rf = RandomForestClassifier(n_estimators=tree_cnt)
        scores = cross_val_score(rf, X_train, y_train, cv=5)
        cv_scores.append([tree_cnt, scores.mean()])
    cv_scores = np.array(cv_scores)
    opt_tree_cnt = int(cv_scores[np.argmax(np.array(cv_scores)[:, 1])][0])
    rf = RandomForestClassifier(n_estimators=opt_tree_cnt)
    scores = cross_val_score(rf, X_train, y_train, cv=5)
    rf.fit(X_train, y_train)
    yhat = rf.predict(X_test)
    rf_acc = accuracy_score(y_test, yhat)
    rf_dacc = directional_accuracy(y_test, yhat)
    # Calculating auc on testset
    fpr_rf, tpr_rf, thresholds = metrics.roc_curve(
        y_test, rf.predict_proba(X_test)[:, 1], pos_label=pos_label)
    rf_auc = auc(fpr_rf, tpr_rf)

    # ADA Boost: grid search over tree count and base-tree depth.
    td = [1, 2]
    trees = [2**x for x in range(1, 8)]
    param_grid = {"n_estimators":trees,
                  "max_depth":td,
                  "learning_rate":[0.05] }
    p = np.zeros((len(trees)*len(td), 3))
    k = 0
    for i in range(0, len(trees)):
        for j in range(0, len(td)):
            ada = AdaBoostClassifier(
                base_estimator=DecisionTreeClassifier(max_depth=td[j]),
                n_estimators=trees[i],
                learning_rate=.05)
            p[k, 0] = trees[i]
            p[k, 1] = td[j]
            p[k, 2] = np.mean(cross_val_score(ada, X_train, y_train, cv=5))
            k = k + 1
    x = pd.DataFrame(p)
    x.columns = ['ntree', 'depth', 'cv_score']
    # BUG FIX: DataFrame.ix was removed from pandas; select the
    # best-scoring row by label via idxmax/.loc, and read its fields by
    # name instead of fragile positional Series indexing.
    best = x.loc[x['cv_score'].idxmax()]
    ada = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=int(best['depth'])),
        n_estimators=int(best['ntree']),
        learning_rate=.05)
    ada.fit(X_train, y_train)
    yhat = ada.predict(X_test)
    ada_acc = accuracy_score(y_test, yhat)
    ada_dacc = directional_accuracy(y_test, yhat)
    # Calculating auc on testset
    fpr_ada, tpr_ada, thresholds = metrics.roc_curve(
        y_test, ada.predict_proba(X_test)[:, 1], pos_label=pos_label)
    ada_auc = auc(fpr_ada, tpr_ada)

    # Support Vector Classification
    svc = svm.SVC(kernel='rbf', random_state=0, gamma=1, C=1, probability=True)
    # scores = cross_val_score(svc, X_train, y_train, cv=5)
    svc.fit(X_train, y_train)
    # NOTE: yhat here holds class-1 probabilities; accuracy thresholds them
    # at 0.5 while directional accuracy consumes the raw probabilities.
    yhat = svc.predict_proba(X_test)[:, 1]
    svm_acc = accuracy_score(y_test, yhat>0.5)
    svm_dacc = directional_accuracy(y_test, yhat)
    # Calculating auc on testset
    fpr_svm, tpr_svm, thresholds = metrics.roc_curve(
        y_test, svc.predict_proba(X_test)[:, 1], pos_label=pos_label)
    svm_auc = auc(fpr_svm, tpr_svm)

    x = pd.DataFrame({'Accuracy':[logreg_acc, knn_acc, lda_acc, qda_acc,
                                  rf_acc, ada_acc, svm_acc],
                      'AUC':[logreg_auc, knn_auc, lda_auc, qda_auc,
                             rf_auc, ada_auc, svm_auc],
                      'D_Accuracy':[logreg_dacc, knn_dacc, lda_dacc, qda_dacc,
                                    rf_dacc, ada_dacc, svm_dacc]},
                     index=['LogReg', 'KNN', 'LDA', 'QDA', 'RandomForest',
                            'ADABoost', 'SVM'])
    return x
# First, split data in training and validation sets X_train, X_test, y_train, y_test = train_test_split(X, class_label, random_state=random_state) # Initialize Classificators clf_1 = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform') clf_2 = AdaBoostClassifier(n_estimators=100) clf_3 = linear_model.LinearRegression() clf_4 = LinearDiscriminantAnalysis() clf_5 = QuadraticDiscriminantAnalysis() # Test classificator with training and validation data y_pred_1 = clf_1.fit(X_train, y_train).predict(X_test) y_pred_2 = clf_2.fit(X_train, y_train).predict(X_test) y_pred_3 = clf_3.fit(X_train, y_train).predict(X_test) y_pred_4 = clf_4.fit(X_train, y_train).predict(X_test) y_pred_5 = clf_5.fit(X_train, y_train).predict(X_test) ''' Z_1 = clf_1.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z_2 = clf_2.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z_3 = clf_3.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z_3 = Z_3.reshape(xx.shape) Z_4 = clf_4.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z_5 = clf_5.decision_function(np.c_[xx.ravel(), yy.ravel()]) ''' # Obtain confusion matrices cm_1 += confusion_matrix(y_test, y_pred_1) cm_2 += confusion_matrix(y_test, y_pred_2) # cm_3 += confusion_matrix(y_test, y_pred_3) cm_4 += confusion_matrix(y_test, y_pred_4)
# Gender Y = [ 'male', 'male', 'female', 'female', 'male', 'male', 'female', 'female', 'female', 'male', 'male' ] # Stores the decision tree model from sklearn clf = tree.DecisionTreeClassifier() clf2 = KNeighborsClassifier(n_neighbors=2) clf3 = QuadraticDiscriminantAnalysis() # The result is stored in the updated clf variable # fit method trains the decision tree on the dataset clf = clf.fit(X, Y) clf2 = clf2.fit(X, Y) clf3 = clf3.fit(X, Y) # DecisionTreeClassifier Predictions # predict method gives a result based on the pretrained data-set. pred = clf.predict([[175, 60, 38]]) # Prints female print "DecisionTreeClassifier", pred pred2 = clf.predict([[165, 70, 38]]) # Prints female print "DecisionTreeClassifier", pred2 # KNeighborsClassifier Predictions pred3 = clf2.predict([[175, 60, 38]]) # prints female print "KNeighborsClassifier", pred3 pred4 = clf2.predict([[165, 70, 38]])
class road_estimation: def __init__(self, model_selection): self._train_data, self._train_targets, self._valid_data, self._valid_targets, self._test_data, self._test_targets = ( data_load() ) self._model_selection = model_selection self._classifier = [] def train(self): if self._model_selection == "svm": # selected the svc in svm self._classifier = svm.SVC() elif self._model_selection == "nb": self._classifier = GaussianNB() elif self._model_selection == "knn": # parameter n_jobs can be set to -1 to enable parallel calculating self._classifier = KNeighborsClassifier(n_neighbors=7) elif self._model_selection == "ada": # Bunch of parameters, n_estimators, learning_rate self._classifier = AdaBoostClassifier() elif self._model_selection == "rf": # many parameters including n_jobs self._classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) elif self._model_selection == "qda": # complicated array like parameters, perhaps leave it default self._classifier = QuadraticDiscriminantAnalysis() else: print "Please refer to one classifier" self._classifier.fit(self._train_data, self._train_targets) # predict on valid data prediction_valid = self._classifier.predict(self._valid_data) # print validation result for selected model. print ( "Classification report for classifier %s on valid_data:\n%s\n" % (self._model_selection, metrics.classification_report(self._valid_targets, prediction_valid)) ) def test(self): # predict on test data prediction_test = self._classifier.predict(self.test_data) # print test result for selected model. 
print ( "Classification report for classifier %s on test_data:\n%s\n" % (self._model_selection, metrics.classification_report(self._test_targets, prediction_test)) ) def showPredictionImage(self): f = Feature() f.loadImage("um_000000.png") f.extractFeatures() fea_matrix = f.getFeaturesVectors() predict = self._classifier.predict(fea_matrix) image = np.copy(f.image) num_superpixels = np.max(f.superpixel) + 1 for i in xrange(0, num_superpixels): indices = np.where(f.superpixel == i) if predict[i] == 1: image[indices[0], indices[1], 0] = 1 image[indices[0], indices[1], 1] = 1 image[indices[0], indices[1], 2] = 0 plt.imshow(image) plt.show() # show prediction image with superpixels plt.imshow(mark_boundaries(image, superpixels)) plt.show()
                 index_col=0, parse_dates=True)  # tail of the pd.read_csv(...) call opened above this chunk
print(df.head())
print("=" * 25)
#--------------------
# Train on rows through 2004, test on 2005 (DatetimeIndex partial slicing).
X_train = df[:'2004'][['Lag1', 'Lag2']]
y_train = df[:'2004']['Direction']
X_test = df['2005':][['Lag1', 'Lag2']]
y_test = df['2005':]['Direction']
#----------------------------------------------------
lda = LinearDiscriminantAnalysis()
model = lda.fit(X_train, y_train)
pred = model.predict(X_test)
print('Prediction for LDA : ', np.unique(pred, return_counts=True))
# NOTE(review): sklearn's convention is confusion_matrix(y_true, y_pred);
# the arguments here are reversed, which transposes the matrix — confirm.
print('CM for LDA : \n', confusion_matrix(pred, y_test))
print('Report for LDA \n: ', classification_report(y_test, pred, digits=3))
print("=" * 25)
#----------------------------------------------------
qda = QuadraticDiscriminantAnalysis()
model2 = qda.fit(X_train, y_train)
pred2 = model2.predict(X_test)
print('Prediction for QDA : ', np.unique(pred2, return_counts=True))
print('CM for QDA : \n', confusion_matrix(pred2, y_test))
print('Report for QDA : \n', classification_report(y_test, pred2, digits=3))
print("=" * 25)
#----------------------------------------------------
# Project the data onto the principal components fitted earlier.
comps = pca.fit_transform(data)
# NOTE(review): this reshape assumes pca.components_ holds 2 components of
# length data.shape[0]*data.shape[1] — confirm against the PCA setup upstream.
plt.plot(pca.components_.reshape((2,data.shape[0],data.shape[1])))
#plt.plot(pca.explained_variance_, linewidth=2)
#plt.title('Principal Component Analysis (PCA) Feature Assessment')

# Creation of labels: first 27 samples are class 1, next 26 are class 2.
labels = []
for i in range(0,27):
    labels.append(1)
for i in range(27,53):
    labels.append(2)

# Discriminant model. NOTE(review): despite the variable name `lda`,
# this is a *quadratic* discriminant analysis model.
lda = QuadraticDiscriminantAnalysis()
lda.fit(comps, labels)
y_pred = lda.predict(comps)
print(labels)
print(y_pred)
# Matthews correlation coefficient of the in-sample predictions.
mcc = matthews_corrcoef(labels,y_pred)
print("MCC="+str(mcc))

# Plotting the decision contour: evaluate P(class 2) on a mesh over the
# first two PCA components and draw the 0.5 boundary.
nx, ny = 200, 100
x_min, x_max = np.amin(comps[:,0]), np.amax(comps[:,0])
y_min, y_max = np.amin(comps[:,1]), np.amax(comps[:,1])
xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),np.linspace(y_min, y_max, ny))
Z = lda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
Z = Z[:, 1].reshape(xx.shape)
plt.contour(xx, yy, Z, [0.5], linewidths=5, colors = 'k', linestyles = 'dashed')
class QuadraticIQDiscriminator(IQDiscriminationFitter):
    """Quadratic discriminant analysis discriminator for IQ data."""

    def __init__(self,
                 cal_results: Union[Result, List[Result]],
                 qubit_mask: List[int],
                 expected_states: List[str] = None,
                 standardize: bool = False,
                 schedules: Union[List[str], List[Schedule]] = None,
                 discriminator_parameters: dict = None):
        """
        Args:
            cal_results (Union[Result, List[Result]]): calibration results,
                Result or list of Result used to fit the discriminator.
            qubit_mask (List[int]): determines which qubit's level 1 data to
                use in the discrimination process.
            expected_states (List[str]): a list that should have the same
                length as schedules. All results in cal_results are used if
                schedules is None. expected_states must have the corresponding
                length.
            standardize (bool): if true the discriminator will standardize the
                xdata using the internal method _scale_data.
            schedules (Union[List[str], List[Schedule]]): The schedules or a
                subset of schedules in cal_results used to train the
                discriminator. The user may also pass the name of the schedules
                instead of the schedules. If schedules is None, then all the
                schedules in cal_results are used.
            discriminator_parameters (dict): parameters for Sklearn's LDA.

        Raises:
            ImportError: If scikit-learn is not installed
        """
        if not discriminator_parameters:
            discriminator_parameters = {}

        # Only these two sklearn QDA parameters are surfaced; everything else
        # stays at the sklearn default.
        store_cov = discriminator_parameters.get('store_covariance', False)
        tol = discriminator_parameters.get('tol', 1.0e-4)

        if not HAS_SKLEARN:
            raise ImportError("To use the QuadraticIQDiscriminator class "
                              "scikit-learn needs to be installed. This can "
                              "be done with 'pip install scikit-learn'")

        self._qda = QuadraticDiscriminantAnalysis(store_covariance=store_cov,
                                                  tol=tol)

        # Also sets the x and y data.
        IQDiscriminationFitter.__init__(self, cal_results, qubit_mask,
                                        expected_states, standardize,
                                        schedules)

        self._description = 'Quadratic IQ discriminator for measurement ' \
                            'level 1.'

        self.fit()

    def fit(self):
        """Fits the discriminator using self._xdata and self._ydata."""
        # Nothing to fit when no calibration points were extracted.
        if len(self._xdata) == 0:
            return

        self._qda.fit(self._xdata, self._ydata)
        self._fitted = True

    def discriminate(self, x_data: List[List[float]]) -> List[str]:
        """Applies the discriminator to x_data.

        Args:
            x_data (List[List[float]]): list of features. Each feature is
                itself a list.

        Returns:
            The discriminated x_data as a list of labels.
        """
        return self._qda.predict(x_data)
# Fit a collection of classifiers on the flattened image features.
# NOTE(review): this chunk mixes Python 2 print statements with print()
# calls — it only runs as-is under Python 2.
lin_clf = svm.LinearSVC()
lin_clf.fit(image_stack, y_training.ravel())
##print lin_clf.predict(image_stack_test)
clf_neu = MLPClassifier(solver='lbfgs', alpha=1e-5)
clf_neu.fit(image_stack, y_training.ravel())
##print clf_neu.predict(image_stack_test)
clf_rand = RandomForestClassifier()
clf_rand.fit(image_stack, y_training.ravel())
clf_lind = LinearDiscriminantAnalysis()
clf_lind.fit(image_stack, y_training.ravel())
clf_quad = QuadraticDiscriminantAnalysis()
clf_quad.fit(image_stack, y_training.ravel())
# These three are constructed but not fitted here.
clf_dt = DecisionTreeClassifier()
clf_adaboost = AdaBoostClassifier(n_estimators=100)
clf_nb = GaussianNB()
reg_lasso = linear_model.Lasso(alpha=0.1)
##scaler = StandardScaler()
##scaler.fit(image_stack)
##image_stack = scaler.transform(image_stack)
##image_stack_test = scaler.transform(image_stack_test)
# 5-fold CV scores for models defined upstream (neigh, clf).
print(cross_val_score(neigh, image_stack, y_training.ravel(), cv=5))
print sum(cross_val_score(neigh, image_stack, y_training.ravel(), cv=5)) / 5
print(cross_val_score(clf, image_stack, y_training.ravel(), cv=5))
test = pd.read_csv("test.csv") #Select features and parse the result to pandas dataframe X_test = pd.DataFrame(test.loc[:, features].values) #Load targets for test submission = pd.read_csv("gender_submission.csv") #Select the target column Y_test = submission.loc[:, "Survived"].values #Slpit train data into features and targets for train X_train = pd.DataFrame(train.loc[:, features].values) Y_train = train.loc[:, "Survived"].values #Data encoding : since machine learning work just with number we're going to parse strings to numeric values using Label Encoder le = preprocessing.LabelEncoder() X_train = X_train.apply(le.fit_transform) X_test = X_test.apply(le.fit_transform) #Create a Quadratic Discriminant Analysis instance classifier = QuadraticDiscriminantAnalysis() #Fit the classifier classifier.fit(X_train, Y_train) #Calculate the score (Accuracy) score = classifier.score(X_test, Y_test) #Printing the score print(score)
# Posterior probabilities from the fitted LDA classifier on the test set
# (lda_clf, test_df and predictors are defined earlier in the script).
lda_pred_posterior = lda_clf.predict_proba(test_df[predictors])
print(lda_pred_posterior)
# Counts of observations assigned to each class at the 0.5 threshold.
print(np.sum(lda_pred_posterior[:, 0] >= 0.5))
print(np.sum(lda_pred_posterior[:, 1] >= 0.5))
print(lda_pred_posterior[:20, 0])
print(test_df.lda_pred_class[:20])
# How confident does the model ever get about a "Down" day?
print(np.sum(lda_pred_posterior[:, 0] >= 0.9))
print(np.max(lda_pred_posterior[:, 0]))

# QUADRATIC DISCRIMINANT ANALYSIS
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

# Create Classifier Instance
qda_clf = QDA()
# Fit model
qda_clf.fit(X_train, Y_train)

print('Class Priors (P(Y = k)) =', qda_clf.priors_)
# BUG FIX: the 'Up' mean previously printed lda_clf.means_[1] — a copy-paste
# left-over from the LDA section; report the QDA model's own class means.
print('Class Means μk\n Down:', qda_clf.means_[0], '\n Up:  ', qda_clf.means_[1])

# Predict on the test set and translate coded labels back to Up/Down.
qda_pred_class_coded = qda_clf.predict(test_df[predictors])
qda_pred_class = ['Up' if c == 1 else 'Down' for c in qda_pred_class_coded]
test_df['qda_pred_class'] = qda_pred_class
print('The model makes {0:.4f} correct predictions'.format(
    100 * np.mean(test_df.qda_pred_class == test_df.Direction)))

# Compute Test Confusion Matrix #
#################################
table = pd.crosstab(test_df.qda_pred_class, test_df.Direction)
print(table)
from sklearn import tree
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Training data: [height, weight, shoe size] per person.
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37],
     [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40],
     [159, 55, 37], [171, 75, 42], [181, 85, 43]]

# Gender label for each row of X.
# BUG FIX: the fifth label was 'male,' (stray comma inside the string),
# which silently created a spurious third class for the classifier.
Y = ['male', 'female', 'female', 'female', 'male', 'male', 'male',
     'female', 'male', 'female', 'male']

# Fit a QDA classifier and predict the gender for a new measurement.
# (The unused DecisionTreeClassifier that was immediately overwritten
# has been removed.)
clf = QuadraticDiscriminantAnalysis()
clf = clf.fit(X, Y)

prediction = clf.predict([[190, 70, 43]])

print(prediction)
# Tail of the previous manual 3-fold CV loop over the SGD model: build the
# training fold by concatenating everything outside the test window, fit on
# the first 1000 rows, and advance the window.
ytrain = np.concatenate((y1[:j], y1[j + n // 3:]))
sgd.fit(xtrain[:1000], ytrain[:1000])
j += n // 3
print(sgd.score(xtest, ytest))

# NOTE(review): plot_confusion_matrix was removed in sklearn 1.2 (replaced
# by ConfusionMatrixDisplay.from_estimator) — this import fails there.
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(sgd, xtest, ytest, normalize='pred')
sns.set(font_scale=0.75)
plt.show()

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qd = QuadraticDiscriminantAnalysis()
qd.fit(xtrain, ytrain)
# NOTE(review): these two score results are computed and discarded (no print
# or assignment) — presumably left over from interactive/notebook use.
qd.score(xtest, ytest)
qd.score(xtrain, ytrain)

# Repeat the same manual 3-fold CV for the QDA model; the loop body
# presumably continues (scoring, j increment) past this chunk.
n = len(x)
j = 0
x1 = x.to_numpy()
y1 = y.to_numpy()
for i in range(3):
    xtest = x1[j:j + n // 3]
    ytest = y1[j:j + n // 3]
    xtrain = np.concatenate((x1[:j], x1[j + n // 3:]))
    ytrain = np.concatenate((y1[:j], y1[j + n // 3:]))
    qd.fit(xtrain[:1000], ytrain[:1000])
def main():
    """Generate two Gaussian classes to disk, or load them and fit LDA/QDA.

    Usage:
        script.py generate  -- sample and save the two datasets, then exit
        script.py run       -- load saved data, fit LDA + QDA, print and plot
    """
    if (sys.argv[1] == 'generate'):
        n_pts1 = 100
        n_pts2 = 1000  # Data parameters
        mean_1 = [0, 0]
        cov_1 = np.array([[1, 0], [0, 1]])
        mean_2 = [3, -1]
        cov_2 = np.array([[1, 0], [0, 5]])
        # Create and save data
        val_1 = np.random.multivariate_normal(mean=mean_1, cov=cov_1, size=(n_pts1, ))
        val_2 = np.random.multivariate_normal(mean=mean_2, cov=cov_2, size=(n_pts2, ))
        label_1 = np.array(([0] * n_pts1), dtype=int)
        label_2 = np.array(([1] * n_pts2), dtype=int)
        np.savetxt("./data1.txt", val_1)
        np.savetxt("./data2.txt", val_2)
        np.savetxt("./data1_label.txt", label_1)
        np.savetxt("./data2_label.txt", label_2)
        # BUG FIX: Python-2 print statements were syntax errors under
        # Python 3; use the print() function.
        print(val_1)
        print(val_2)
        sys.exit("Exiting!")
    elif (sys.argv[1] == 'run'):
        # Load data from file
        val_1 = np.loadtxt("./data1.txt")
        val_2 = np.loadtxt("./data2.txt")
        label_1 = np.loadtxt("./data1_label.txt")
        label_2 = np.loadtxt("./data2_label.txt")
        # Get data in required form
        X = np.vstack((val_1, val_2))
        y = np.hstack((label_1, label_2))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        # Start training.
        # BUG FIX: QDA's keyword was renamed store_covariances -> store_covariance
        # in scikit-learn 0.19 and the old name was removed in 0.21.
        lda_first = LinearDiscriminantAnalysis(solver='svd', store_covariance=True)
        qda_first = QuadraticDiscriminantAnalysis(store_covariance=True)
        lda_first.fit(X=X_train, y=y_train)
        qda_first.fit(X=X_train, y=y_train)
        # Print the variables
        print_data(lda_first, qda_first, X_test, y_test)
        # Plot decision boundary
        PLOT_DB(lda_first, qda_first, X, y, X_test, y_test)
        plt.show()
# NOTE(review): `k=7)` closes a SelectKBest-style constructor call whose
# opening precedes this chunk — confirm against the full file.
k=7)  # iterate k from 1 to 120; the max. accuracy comes at k=7
fclass.fit(X_R2L, Y_R2L)
# Boolean support mask -> indices -> column names of the selected features.
true = fclass.get_support()
fclasscolindex_R2L = [i for i, x in enumerate(true) if x]
fclasscolname_R2L = list(colNames[i] for i in fclasscolindex_R2L)
print('Features selected :', fclasscolname_R2L)

# Restrict train/test frames to the selected columns; labels kept separate.
features = newdf[fclasscolname_R2L].astype(float)
features1 = newdf_test[fclasscolname_R2L].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

# Train QDA and time both fit and predict.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis()
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))
t0 = time()
pred = clf.predict(features1)
tt = time() - t0
print("Predicted in {} seconds".format(round(tt, 3)))

# accuracy_score is symmetric, so the (pred, truth) argument order
# still yields the correct accuracy.
from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, lab1)
print("Accuracy is {}.".format(round(acc, 4)))
print(
    pd.crosstab(lab1, pred, rownames=['Actual attacks'], colnames=['Predicted attacks']))
## Logistic regression
lgRg = linear_model.LogisticRegression()
lgRg.fit(x_train, y_train)
lgRg.score(x_test, y_test)

## Least squares support vector machines
from sklearn import svm
clf = svm.SVC()
# BUG FIX: the SVC fit line was destroyed by a bad paste
# ("clf.fit(x_tclf = neighbors.KNeighborsClassifier(...)"); the intended
# fit/score calls are restored here.
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

# Quadratic Classifiers (Quadratic Discriminant Analysis)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

## K-Nearest Neighbour
from sklearn import neighbors
# BUG FIX: weights="" is not a valid option (must be 'uniform' or
# 'distance'); use the default 'uniform'.
clf = neighbors.KNeighborsClassifier(15, weights="uniform")
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

## Random Forest
## Neural Networks
## Learning Vector Quantization
# Build the 2018 hold-out matrix, standardized independently of the
# training data (NOTE(review): fitting a fresh StandardScaler on the test
# year rather than reusing the training scaler may be intentional — confirm).
X2 = df2018[feature].values
X2 = StandardScaler().fit_transform(X2)
y2 = df2018['Label'].values
#y2 = le.fit_transform(df201['Label'].values)

# Perform LDA classifier (X, y are the training matrices from earlier).
lda_classifier = LDA(n_components=2)
lda_classifier.fit( X, y, )
y_pred1 = lda_classifier.predict(X2)

# Perform QDA classifier
qda_classifier = QDA()
qda_classifier.fit(X, y)
y_pred2 = qda_classifier.predict(X2)

# Question 1: Equation for LDA — report the linear discriminant
# g(x) = w0 + w1*mean + w2*std, rounded for display.
w1 = round(lda_classifier.coef_[0][0], 4)
w2 = round(lda_classifier.coef_[0][1], 4)
w0 = round(lda_classifier.intercept_[0], 4)
print('The equation is: g(x) =', w0, '+', w1, '* mean +', w2, '* std')

# Question 2: Accuracy for year 2018
accuracy_lda = metrics.accuracy_score(y2, y_pred1)
print('The accuracy for LDA classifier for year 2018 is', round(accuracy_lda, 2))
accuracy_qda = metrics.accuracy_score(y2, y_pred2)
print('The accuracy for QDA classifier for year 2018 is', round(accuracy_qda, 2))
# Feature matrix: all columns except the id (first) and target (last).
X = training.iloc[:,1:-1].values
y = training['country_destination'].values

"""
# Use Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
trans = LinearDiscriminantAnalysis(n_components=3)
trans.fit(X,y)
X = trans.transform(X)
"""

# Split Up Data
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=None)

# Train classifier (small regularization keeps per-class covariances invertible)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis(reg_param=0.00001)
clf.fit(x_train, y_train)

# Run Predictions
from sklearn.metrics import confusion_matrix, accuracy_score
y_preds = clf.predict(x_valid)
print( confusion_matrix(y_valid, y_preds) )
print( "Accuracy: %f" % (accuracy_score(y_valid, y_preds)) )

# FIX: use a context manager so the results file is flushed and closed;
# the original open() handle was never closed.
with open('qda_take1.txt', 'w') as f:
    f.write( str(confusion_matrix(y_valid, y_preds)) )
    f.write( "\nAccuracy: %f" % (accuracy_score(y_valid, y_preds)) )
    f.write( "\nclf = QuadraticDiscriminantAnalysis(0.00001)" )

# Now on to final submission.
# FIX: reshape(-1) replaces the hard-coded row count 62096, so the script
# works for any test-set size (backward-compatible generalization).
x_final = testing.iloc[:,1:].values
y_final = clf.predict(x_final).reshape(-1)
y_final = pd.DataFrame(y_final)
y = y.astype('int') #print(X) # In[4]: train_dataset = train.values X = train_dataset[:, 2:] y = train_dataset[:, 1] y = y.astype('int') test_dataset = test.values X_test = test_dataset[:, 2:] print(type(X_test)) print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape) df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']}) print('QuadraticDiscriminantAnalysis****************') qda = QuadraticDiscriminantAnalysis() print('fitting****************') qda_train = qda.fit(X, y) print('predicting on train****************') qda_X_prediction = qda.predict_proba(X)[:, 1] print('predicting on test****************') qda_X_test_prediction = qda.predict_proba(X_test)[:, 1] tr_te_concatenated = np.concatenate([qda_X_prediction, qda_X_test_prediction]) df['quadratic_discriminant_analysis'] = tr_te_concatenated print('final tr_te shape', df.shape) df.to_csv('quadratic_discriminant_analysis_tr_te.csv', index=False) print(df.head())
print("Процент верных предсказаний: %.1f%%" % (hit_rate1 * 100)) # Linear Discriminant model2 = LDA() model2.fit(x_train, y_train) # Обучение (подбор параметров модели) d['Predict_LDA'] = model2.predict(x_test) # Тест # Считаем процент правильно предсказанных направлений изменения цены: d["Correct_LDA"] = (1.0 + d['Predict_LDA'] * d["FACT"]) / 2.0 print(d) hit_rate2 = np.mean(d["Correct_LDA"]) print("Процент верных предсказаний: %.1f%%" % (hit_rate2 * 100)) # Qadrical Discriminant model3 = QDA() model3.fit(x_train, y_train) # Обучение (подбор параметров модели) d['Predict_QDA'] = model3.predict(x_test) # Тест # Считаем процент правильно предсказанных направлений изменения цены: d["Correct_QDA"] = (1.0 + d['Predict_QDA'] * d["FACT"]) / 2.0 print(d) hit_rate3 = np.mean(d["Correct_QDA"]) print("Процент верных предсказаний: %.1f%%" % (hit_rate3 * 100)) # Take AVG from result's rates = [hit_rate1, hit_rate2, hit_rate3] if (np.average(rates) > 0.5): change_sum = 1 else: change_sum = -1 # # Procent's
import window_s_p_ft as win
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# BUG FIX: sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Average QDA accuracy on all specializations over `stop` random 80/20 splits.
total_score = 0
stop = 1000
for x in range(stop):
    clf = QuadraticDiscriminantAnalysis()
    data = win.getStudents()
    data_train, data_test = train_test_split(data, test_size=0.2)
    data_train_labels = [s.spec for s in data_train]
    data_test_labels = [s.spec for s in data_test]
    data_train = [s.grades for s in data_train]
    data_test = [s.grades for s in data_test]
    clf.fit(data_train, data_train_labels)
    total_score += clf.score(data_test, data_test_labels)
total_score = total_score / stop
print("all")
print(total_score)

# One-vs-rest: repeat the experiment per specialization, relabelling every
# other spec as "NOT <spec>".
specs = ["FK", "FM", "MN", "OE"]
for sp in specs:
    total_score = 0
    for x in range(stop):
        clf = QuadraticDiscriminantAnalysis()
        data = win.getStudents()
        data_train, data_test = train_test_split(data, test_size=0.2)
        data_train_labels = [s.spec if s.spec == sp else "NOT " + sp for s in data_train]
        data_test_labels = [s.spec if s.spec == sp else "NOT " + sp for s in data_test]
        data_train = [s.grades for s in data_train]
plt.grid() # ### ¿Cómo se vería la frontera de clasificación usando un FDG? # In[4]: from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis N = 100 x, y = np.random.multivariate_normal(Mean, Cov, N).T x2, y2 = np.random.multivariate_normal(Mean2, Cov2, N).T X = np.r_[np.c_[x,y],np.c_[x2,y2]] Y = np.r_[np.ones((N,1)),np.zeros((N,1))] clf = QuadraticDiscriminantAnalysis() clf.fit(X,Y.flatten()) plt.scatter(X[:,0],X[:,1],c=Y.flatten(), cmap='Set2',alpha=0.5) h = .02 # step size in the mesh # create a mesh to plot in x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h)) # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, m_max]x[y_min, y_max]. Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot Z = Z.reshape(xx.shape) plt.contour(xx, yy, Z, cmap=plt.cm.Blues)
# NOTE(review): these three statements appear to be the tail of a
# plot_ellipse(...) helper whose definition precedes this chunk.
splot.add_artist(ell)
splot.set_xticks(())
splot.set_yticks(())


def plot_lda_cov(lda, splot):
    """Draw the shared LDA covariance ellipse at both class means."""
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    """Draw each class's own QDA covariance ellipse."""
    # BUG FIX: the per-class covariances are exposed as `covariance_` in
    # modern scikit-learn; the old `covariances_` attribute was removed.
    plot_ellipse(splot, qda.means_[0], qda.covariance_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariance_[1], 'blue')

###############################################################################
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    # BUG FIX: `store_covariances` was renamed `store_covariance` in
    # scikit-learn 0.19 and the old keyword was removed in 0.21.
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
plt.show()
What is the training misclassification rate? """ lda1 = LDA(solver="svd", store_covariance=True) lda1.fit(warX,warY) my_lda_pred = pd.DataFrame() my_lda_pred["pred"] = ["No" if x == 0 else "Yes" for x in lda1.predict(warX)] my_lda_pred["actual"] = ["No" if x == 0 else "Yes" for x in war["start"]] conf_lda = pd.crosstab(my_lda_pred["pred"], my_lda_pred["actual"]) conf_lda (1/(war.shape[0])) * (conf_lda.iloc[1,0] + conf_lda.iloc[0,1]) """ 6.69% """ qda1 = QDA(store_covariances=True) qda1.fit(warX,warY) test = qda1.predict_proba(warX) my_qda_pred = pd.DataFrame() my_qda_pred["pred"] = ["No" if x < .5 else "Yes" for x in qda1.predict(warX)] my_qda_pred["actual"] = ["No" if x == 0 else "Yes" for x in war["start"]] conf_qda = pd.crosstab(my_qda_pred["pred"], my_qda_pred["actual"]) conf_qda (1/(war.shape[0])) * (conf_qda.iloc[1,0] + conf_qda.iloc[0,1])
# Assign fixed class labels: 9 samples per class, three classes.
for i in range(0,9): labels.append(1)
for i in range(9,18): labels.append(2)
for i in range(18, 27): labels.append(3)

'''
# Creation of random labels
for i in range(0,27): labels.append(int(random.random() * 3) + 1)
print (labels)
'''

# QDA model fitted on the PCA components computed earlier in the file.
qda = QuadraticDiscriminantAnalysis()
qda.fit(comps, labels)

# MCC Calculation — resubstitution predictions scored with Matthews
# correlation (multimcc is a project helper defined elsewhere).
y_pred = qda.predict(comps)
#print(labels)
#print(y_pred)
mcc = multimcc(labels,y_pred)
print("MCC="+str(mcc))

# NOTE(review): this triple-quote opens a commented-out plotting section
# that is closed beyond this chunk.
'''
# Plotting QDA contour
nx, ny = 200, 100
x_min, x_max = np.amin(comps[:,0]), np.amax(comps[:,0])
y_min, y_max = np.amin(comps[:,1]), np.amax(comps[:,1])
xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),np.linspace(y_min, y_max, ny))
Z = qda.predict_proba(np.c_[xx.ravel(), yy.ravel()])
# In[4]:
# CHALLENGE - ...and train them on our data

# Every classifier (built in earlier cells) is trained on the same
# feature matrix X and label vector Y.
for model in (clf_tree, clf_svm, clf_perceptron, clf_KNN, clf_svm_RBF,
              clf_GPC, clf_RFC, clf_NN, clf_ADA, clf_GNB, clf_QDA):
    model.fit(X, Y)

#prediction = clf.predict([[190, 70, 43]])

# In[5]:
# Resubstitution test: score the decision tree on the training data itself.
pred_tree = clf_tree.predict(X)
acc_tree = accuracy_score(Y, pred_tree) * 100
print('Accuracy for DecisionTree: {}'.format(acc_tree))

# In[6]:
# Linear SVM
pred_svm = clf_svm.predict(X)
acc_svm = accuracy_score(Y, pred_svm) * 100
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Define X (all columns except 'state') and y ('state' only).
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Resample the training set with SMOTEENN (over- + under-sampling).
# NOTE(review): imblearn renamed fit_sample -> fit_resample in 0.4; update
# if the pinned imblearn version requires it.
sme = SMOTEENN(random_state=42)
os_X,os_y = sme.fit_sample(X_train,y_train)

# QDA.
# BUG FIX: `store_covariances` was renamed `store_covariance` in
# scikit-learn 0.19 and the old keyword was removed in 0.21. Prints are
# also converted to the print() function for Python 3 compatibility.
clf_QDA = QuadraticDiscriminantAnalysis(store_covariance=True)
clf_QDA.fit(os_X, os_y)
y_true, y_pred = y_test, clf_QDA.predict(X_test)

# F1_score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)
# Specificity = TN / (TN + FP), taken from the first row of the matrix.
print("Specifity: %s" % (float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1])))
specifity = float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1])
print("G score: %s" % math.sqrt(recall/ specifity))
def plot_lda_cov(lda, splot):
    """Overlay the shared LDA covariance ellipse at each class mean."""
    for mean, tint in zip(lda.means_, ('red', 'blue')):
        plot_ellipse(splot, mean, lda.covariance_, tint)


def plot_qda_cov(qda, splot):
    """Overlay each class's own QDA covariance ellipse."""
    for cls, tint in zip((0, 1), ('red', 'blue')):
        plot_ellipse(splot, qda.means_[cls], qda.covariance_[cls], tint)


plt.figure(figsize=(10, 8), facecolor='white')
for panel, (features, labels) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Left column: Linear Discriminant Analysis (shared covariance).
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    fitted_pred = lda.fit(features, labels).predict(features)
    axes = plot_data(lda, features, labels, fitted_pred, fig_index=2 * panel + 1)
    plot_lda_cov(lda, axes)
    plt.axis('tight')

    # Right column: Quadratic Discriminant Analysis (per-class covariance).
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    fitted_pred = qda.fit(features, labels).predict(features)
    axes = plot_data(qda, features, labels, fitted_pred, fig_index=2 * panel + 2)
    plot_qda_cov(qda, axes)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis',
             y=1.02, fontsize=15)
plt.tight_layout()
plt.show()
# NOTE(review): this closes a print(...) call whose opening precedes
# this chunk.
up_probs[idx_g50].size)
print('\nNumber of days with probability < 50% for market to be up:',
      up_probs[idx_l50].size)

# Indices of events with posterior probability > 90%
idx_g90 = up_probs > 0.9
print('\nNumber of days with probability > 90% for market to be up:',
      up_probs[idx_g90].size)

### QUADRATIC DISCRIMINANT ANALYSIS ###
# Initiate QDA object
qda_clf = QuadraticDiscriminantAnalysis()
# Fit model. Let Xs_train = matrix of new_pred, Ys_train = matrix of variables.
# (fit() returns the estimator itself, so resqda_clf aliases qda_clf.)
resqda_clf = qda_clf.fit(Xs_train, Ys_train)
# Predicted values for the held-out set
Ys_pred_qda = resqda_clf.predict(Xs_test)

# Prior probabilities
print("\nPrior probabilities")
print(resqda_clf.classes_)
print(resqda_clf.priors_)

# Group means
print("\nGroup means")
#print(resqda_clf.classes_)
print(resqda_clf.means_)

# Confusion matrix
def quadratic_discriminant_fn(x_train, y_train):
    """Fit a Quadratic Discriminant Analysis classifier and return it.

    Args:
        x_train: training feature matrix.
        y_train: training labels.

    Returns:
        The fitted QuadraticDiscriminantAnalysis estimator.
    """
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(x_train, y_train)
    return qda
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Tiny two-class toy dataset.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

# Fit the LDA model and classify one held-out point.
clf = LinearDiscriminantAnalysis()
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))

# Fit QDA on the same data for comparison.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf2 = QuadraticDiscriminantAnalysis()
clf2.fit(X, y)

# Compare LDA and QDA via their resubstitution confusion matrices.
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(X)
confusion_matrix(y, y_pred)
y_pred2 = clf2.predict(X)
confusion_matrix(y, y_pred2)
##############################
# Now apply the transformations to the data
# (scaler was fitted earlier in the file):
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# BUG FIX: Python-2 print statements are syntax errors under Python 3;
# the single-argument print() form below is valid on both interpreters.
print('Trasformazione applicata ai dati correttamente')

###############################
# Next we create an instance of the model
clf = QuadraticDiscriminantAnalysis()
print('QDA creato correttamente')

###############################
print('Fit dei dati in corso')
# Fit the training data to our model
clf.fit(X_train, y_train)
print('Train eseguito con successo')
#print mlp

##################################
predictions = clf.predict(X_test)
#print predictions

##################################
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions, digits=4))

####################################
#print len(mlp.coefs_)
#print len(mlp.coefs_[0])