def test_all_methods(self):
    """Fit logistic regression, LDA, QDA and 1-NN on ``Lag2`` and tabulate results.

    Rows of ``self.df`` with ``1990 <= Year <= 2008`` are used for fitting and
    rows with ``Year > 2008`` for evaluation.  Each set of predictions is
    passed, together with the true ``self.y_col`` values, to
    ``tp.output_table``.
    """
    x_cols = ["Lag2"]
    formula = "Direction~Lag2"

    # ``DataFrame.ix`` no longer exists in current pandas; ``.loc`` accepts
    # the same boolean row masks.
    year = self.df["Year"]
    train_data = self.df.loc[(year >= 1990) & (year <= 2008), :]
    test_data = self.df.loc[year > 2008, :]

    # (d) logistic
    model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
    result = model.fit()
    probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
    # NOTE(review): a probability above 0.5 is mapped to "Down"; presumably
    # the fitted probability refers to the "Down" level -- confirm.
    pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
    tp.output_table(pred_values.values, test_data[self.y_col].values)

    train_X = train_data[x_cols].values
    train_y = train_data[self.y_col].values
    test_X = test_data[x_cols].values
    test_y = test_data[self.y_col].values

    # (e) LDA
    lda_res = LDA().fit(train_X, train_y)
    pred_y = lda_res.predict(test_X)
    tp.output_table(pred_y, test_y)

    # (f) QDA
    qda_res = QDA().fit(train_X, train_y)
    pred_y = qda_res.predict(test_X)
    tp.output_table(pred_y, test_y)

    # (g) KNN
    clf = neighbors.KNeighborsClassifier(1, weights="uniform")
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    tp.output_table(pred_y, test_y)

    # (h) logistic and LDA
    # (i) Is the purpose of the last question going through all methods
    #     with no direction?
class SNPForecastingStrategy(Strategy):
    """Strategy that derives signals from a QDA model of lagged returns.

    symbol - symbol passed to ``create_lagged_series``.
    bars - object whose ``index`` is used for the signals DataFrame.
    """

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Set the hard-coded training start, test start and end dates."""
        self.start_train = datetime.datetime(2001, 1, 10)
        self.start_test = datetime.datetime(2005, 1, 1)
        self.end_period = datetime.datetime(2005, 12, 31)

    def fit_model(self):
        """Fit QDA on Lag1/Lag2 vs. Direction using rows before ``start_test``.

        Rows from ``start_test`` onward are kept in ``self.predictors``.
        """
        snpret = create_lagged_series(self.symbol, self.start_train,
                                      self.end_period, lags=5)
        X = snpret[['Lag1', 'Lag2']]
        Y = snpret['Direction']
        X_train = X[X.index < self.start_test]
        Y_train = Y[Y.index < self.start_test]
        self.predictors = X[X.index >= self.start_test]
        self.model = QDA()
        self.model.fit(X_train, Y_train)

    def generate_signals(self):
        """Return a DataFrame with ``signal`` and ``positions`` columns."""
        signals = pd.DataFrame(index=self.bars.index)
        signals['signal'] = self.model.predict(self.predictors)
        # Zero the first five signals with one positional write; the chained
        # form ``signals['signal'][0:5] = 0.0`` may modify a temporary copy.
        signals.iloc[0:5, signals.columns.get_loc('signal')] = 0.0
        signals['positions'] = signals['signal'].diff()
        return signals
class RegularizedQDA:
    """
    Three types of regularization are possible:
    - regularize the covariance of a class toward the average variance
      within that class
    - regularize the covariance of a class toward the pooled covariance
      across all classes
    - add some constant amount of variance to each feature
    """

    def __init__(self, avg_weight = 0.1, pooled_weight = 0, extra_variance = 0):
        self.avg_weight = avg_weight
        self.pooled_weight = pooled_weight
        self.extra_variance = extra_variance
        self.model = QDA()

    def fit(self, X, Y):
        """Fit the wrapped QDA, then blend each matrix in ``model.rotations``.

        Each matrix ``C`` becomes
        ``(1 - a - p) * C + a * mean(diag(C)) * I + p * cov(X) + ev * I``.
        """
        self.model.fit(X, Y)
        identity = np.eye(X.shape[1])
        a = self.avg_weight
        p = self.pooled_weight
        ev = self.extra_variance
        original_weight = 1.0 - a - p
        # np.cov returns a 0-d array for a single feature; promote it so the
        # shape check below also holds for one-column input.
        scaled_pooled_cov = p * np.atleast_2d(np.cov(X.T))
        assert scaled_pooled_cov.shape == identity.shape
        assert all(C.shape == identity.shape for C in self.model.rotations)
        # NOTE(review): this treats ``model.rotations`` as per-class
        # covariance matrices -- confirm against the QDA version in use.
        self.model.rotations = [
            original_weight * C
            + a * np.mean(np.diag(C)) * identity
            + scaled_pooled_cov
            + ev * identity
            for C in self.model.rotations
        ]

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self.model.predict(X)
class SNPForecastingStrategy(Strategy):
    """
    Requires:
    symbol - A stock symbol on which to form a strategy on.
    bars - A DataFrame of bars for the above symbol."""

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Create training/test periods."""
        self.start_train = datetime.datetime(2001,1,10)
        self.start_test = datetime.datetime(2005,1,1)
        self.end_period = datetime.datetime(2005,12,31)

    def fit_model(self):
        """Fits a Quadratic Discriminant Analyser to the
        US stock market index (^GPSC in Yahoo)."""
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(self.symbol, self.start_train,
                                      self.end_period, lags=5)

        # Use the prior two days of returns as
        # predictor values, with direction as the response
        X = snpret[["Lag1","Lag2"]]
        y = snpret["Direction"]

        # Create training and test sets
        X_train = X[X.index < self.start_test]
        y_train = y[y.index < self.start_test]

        # Create the predicting factors for use
        # in direction forecasting
        self.predictors = X[X.index >= self.start_test]

        # Create the Quadratic Discriminant Analysis model
        # and the forecasting strategy
        self.model = QDA()
        self.model.fit(X_train, y_train)

    def generate_signals(self):
        """Returns the DataFrame of symbols containing the signals
        to go long, short or hold (1, -1 or 0)."""
        signals = pd.DataFrame(index=self.bars.index)

        # Predict the subsequent period with the QDA model
        signals['signal'] = self.model.predict(self.predictors)

        # Remove the first five signal entries to eliminate
        # NaN issues with the signals DataFrame.  A single positional write
        # is used because the chained form ``signals['signal'][0:5] = 0.0``
        # may modify a temporary copy instead of ``signals``.
        signals.iloc[0:5, signals.columns.get_loc('signal')] = 0.0
        signals['positions'] = signals['signal'].diff()
        return signals
def performSVMClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """Fit a classifier on the training data and return its test-set score.

    NOTE(review): despite the function name, the estimator built here is
    ``QDA``; ``parameters``, ``fout`` and ``savemodel`` are not used.
    """
    model = QDA()
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)
def performQDAClass(X_train, y_train, X_test, y_test):
    """Fit a QDA classifier on the training data and return its test-set score."""
    model = QDA()
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)
def qda(data, labels, n, v_type):
    """Train and evaluate a QDA classifier on a split of ``data``/``labels``.

    The split comes from ``split_data(data, labels, v_type)``; ``n`` is not
    used.

    Returns a tuple ``(accuracy, report, y_pred, test_labels, test_data,
    clf, confusion_matrix, "QDA")``.
    """
    train_data, train_labels, test_data, test_labels = split_data(data, labels, v_type)

    clf = QDA()
    clf.fit(train_data, train_labels)
    y_pred = clf.predict(test_data)

    n_correct = sum(
        1 for predicted, actual in zip(y_pred, test_labels) if predicted == actual)
    pure_accuracy_rate = n_correct / float(len(test_labels))

    # classification_report expects (y_true, y_pred); the reversed order
    # swaps precision and recall in the report.
    report = classification_report(test_labels, y_pred, target_names=rock_names)
    cm = confusion_matrix(test_labels, y_pred)
    return pure_accuracy_rate, report, y_pred, test_labels, test_data, clf, cm, "QDA"
def QDA(self, membership, group_labels=None, std=3, ellipses=True, dpi=300, fontsize=10,
        MD=False, legend=False, numbered=False, of='pdf'):
    """Fit QDA on ``self.data`` with ``membership``, then plot and store.

    The in-sample predictions are kept in ``self.fit``.  When ``ellipses`` is
    true ``self.getEllipses`` runs before ``self.PlotXDA``, which receives the
    remaining options; ``self.Store`` is called last.
    """
    self.type = 'QDA'
    membership = membership.astype(int)

    # Presumably this def lives in a class body, so ``QDA()`` resolves to the
    # module-level estimator rather than to this method -- verify.
    estimator = QDA()
    fitted = estimator.fit(self.data, membership)
    self.fit = fitted.predict(self.data)

    if ellipses:
        self.getEllipses(std, membership)

    plot_options = dict(
        group_labels=group_labels,
        std=std,
        ellipses=ellipses,
        dpi=dpi,
        fontsize=fontsize,
        MD=MD,
        legend=legend,
        numbered=numbered,
        of=of,
    )
    self.PlotXDA(membership, **plot_options)
    self.Store()
def qda_predict(train_data, test_data, train_cat, xx, yy):
    """Fit QDA and return test predictions plus a probability surface.

    Returns ``(predicted, contour)``: ``predicted`` holds the labels for
    ``test_data``; ``contour`` is column 1 of ``predict_proba`` evaluated on
    the points formed from ``xx`` and ``yy``, reshaped to ``xx.shape``.
    """
    model = QDA().fit(train_data, train_cat)
    predicted = model.predict(test_data)

    grid_points = np.c_[xx.ravel(), yy.ravel()]
    second_class_prob = model.predict_proba(grid_points)[:, 1]
    return predicted, second_class_prob.reshape(xx.shape)
def get_QDA(Xtrain, Xtest, Ytrain, Ytest):
    """Fit QDA on the training set, print train/test scores, return the model."""
    model = QDA()
    model.fit(Xtrain, Ytrain)

    train_score = model.score(Xtrain, Ytrain)
    test_score = model.score(Xtest, Ytest)

    # Scores are reported as percentages.
    print('QDA, train: {0:.02f}% '.format(train_score * 100))
    print('QDA, test: {0:.02f}% '.format(test_score * 100))
    return model
def QuadraticDiscriminantAnalysis(x_train, y_train, x_cv, y_cv):
    """Print a banner, fit QDA on the training data and return the model.

    ``x_cv`` and ``y_cv`` are accepted but not used.
    """
    print("Quadratic Discriminant Analysis")
    model = QDA()
    model.fit(x_train, y_train)
    return model
def train_qda(X, y, priors=None, reg_param=0.0):
    """Build and fit a quadratic discriminant analysis model.

    ``priors`` and ``reg_param`` are forwarded to the ``QDA`` constructor.

    Returns:
        The object returned by ``QDA.fit``.
    """
    model = QDA(priors=priors, reg_param=reg_param).fit(X, y)
    print('Quadratic Discriminant Analysis completed!')
    return model
def train_classifier(xTrain_s, yTrain_s, kwargs):
    """Train a classifier on ``xTrain_s``/``yTrain_s`` and return it.

    When ``xTrain_s`` is exactly a ``list`` the work is delegated to
    ``train_classifier_8``; otherwise a ``QDA`` built from the ``kwargs``
    dict is fitted.
    """
    if type(xTrain_s) == list:
        return train_classifier_8(xTrain_s, yTrain_s, kwargs)

    model = QDA(**kwargs)
    model.fit(xTrain_s, yTrain_s)
    return model
def fit_model(self):
    """Fit a Quadratic Discriminant Analyser on lagged returns.

    Lag1 and Lag2 are the predictors and Direction the response.  Rows
    indexed before ``self.start_test`` train the model; the remaining
    predictor rows are stored in ``self.predictors``.
    """
    lagged = create_lagged_series(self.symbol, self.start_train,
                                  self.end_period, lags=5)
    features = lagged[["Lag1", "Lag2"]]
    direction = lagged["Direction"]

    # Training portion: everything strictly before the test start date.
    train_features = features[features.index < self.start_test]
    train_direction = direction[direction.index < self.start_test]

    # Predictors used later for direction forecasting.
    self.predictors = features[features.index >= self.start_test]

    self.model = QDA()
    self.model.fit(train_features, train_direction)
def QDA_onFullDataset():
    """Fit QDA on the full UCI HAR training files and print the test f-score."""
    # Training data
    train_features = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    train_labels = common.parseFile('../UCI HAR Dataset/train/y_train.txt')

    # Testing data
    test_features = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    test_labels = common.parseFile('../UCI HAR Dataset/test/y_test.txt')

    model = QDA()
    model.fit(train_features, train_labels.flatten())

    predictions = model.predict(test_features)
    precision, recall, fscore = common.checkAccuracy(
        predictions, test_labels, [1, 2, 3, 4, 5, 6])
    print(fscore)
def train(self, classification_data, indices=None, settings_name=None, **kwargs):
    """Run the base-class training step, then fit a QDA model.

    The columns used for fitting come from ``self.settings['indices']``, not
    from the ``indices`` argument.  Returns ``self``.
    """
    super(QDAClassifier, self).train(classification_data, indices, settings_name, **kwargs)

    columns = self.settings['indices']
    self.qda = QDA(**self.classifier_kwargs)

    features = classification_data.data[:, columns]
    self.qda.fit(features, classification_data.are_hurr_actual)
    return self
def fit_model(self):
    """Fit QDA on Lag1/Lag2 vs. Direction using rows before ``start_test``.

    Predictor rows from ``self.start_test`` onward are kept in
    ``self.predictors``.
    """
    series = create_lagged_series(
        self.symbol, self.start_train, self.end_period, lags=5)
    predictors = series[['Lag1', 'Lag2']]
    response = series['Direction']

    fit_predictors = predictors[predictors.index < self.start_test]
    fit_response = response[response.index < self.start_test]

    self.predictors = predictors[predictors.index >= self.start_test]
    self.model = QDA()
    self.model.fit(fit_predictors, fit_response)
def runQDA(fileNamaParam, trainizingSizeParam):
    """Load a data set, hold out part of it, fit QDA and evaluate the predictions.

    ``trainizingSizeParam`` is the training fraction; the held-out fraction is
    ``1.0 - trainizingSizeParam``.  Results go to ``evalClassifier``.
    """
    held_out_fraction = 1.0 - trainizingSizeParam

    loaded = IO_.giveTestAndTrainingData(fileNamaParam)
    feature_rows = loaded[0]
    score_values = loaded[1]

    # Fixed random_state keeps the split reproducible.
    split = cross_validation.train_test_split(
        feature_rows, score_values, test_size=held_out_fraction, random_state=0)
    fit_features, eval_features, fit_scores, eval_scores = split

    model = QDA()
    model.fit(fit_features, fit_scores)
    predicted_scores = model.predict(eval_features)

    evalClassifier(eval_scores, predicted_scores)
def create_symbol_forecast_model(self):
    """Fit and return a QDA model of Direction on Lag1/Lag2.

    Only rows indexed before ``self.model_start_test_date`` are used for
    fitting.
    """
    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series(self.symbol_list[0], self.model_start_date,
                                  self.model_end_date, lags=5)

    # Use the prior two days of returns as predictor
    # values, with direction as the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]

    # Only the training portion is needed here; the test split that used to
    # be computed was never read.
    start_test = self.model_start_test_date
    X_train = X[X.index < start_test]
    y_train = y[y.index < start_test]

    model = QDA()
    model.fit(X_train, y_train)
    return model
def QDAResult3D():
    """Train QDA on a random subset and return ``(sensitivity, specificity)``.

    Training membership for the normal and cancer samples is drawn with
    ``randTestData``; samples not selected for training are used for testing.
    """
    norTrainNum, nor_isTraining = randTestData(t_data_perc, norDataNum)
    cnTrainNum, cn_isTraining = randTestData(t_data_perc, cnDataNum)
    isTraining = np.hstack((nor_isTraining, cn_isTraining))

    # Training QDA classifier
    clf = QDA()
    trained_clf = clf.fit(train_data[isTraining], labels[isTraining])

    # Using the remaining data for testing.  The denominators are converted
    # to float so the ratios are not truncated by integer division on
    # Python 2.
    normal_pred = trained_clf.predict(normal_pt[nor_isTraining == False])
    trueneg_n = (normal_pred == 0).sum()
    specificity = trueneg_n / float(int(norDataNum - norTrainNum))

    cancer_pred = trained_clf.predict(cancer_pt[cn_isTraining == False])
    truepos_n = (cancer_pred == 1).sum()
    sensitivity = truepos_n / float(int(cnDataNum - cnTrainNum))

    return sensitivity, specificity
def qda(input_file, Output, test_size):
    """Train QDA on a CSV file and write metrics, predictions and a plot.

    Column 0 of the loaded data is the label and the remaining loaded columns
    are features.  Metrics are printed and written to
    ``Output + "QDA_metrics_test.txt"``; a confusion matrix image is produced
    through ``plot_confusion_matrix``.
    """
    lvltrace.lvltrace("LVLEntree dans qda split_test")
    try:
        ncol = tools.file_col_coma(input_file)
        data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
        X = data[:, 1:]
        y = data[:, 0]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        print("%s %s" % (X_train.shape, X_test.shape))

        classifier = QDA()
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        # Each metric is computed once and reused for the console and file.
        print("Quadratic Discriminant Analysis Accuracy ")
        accuracy = metrics.accuracy_score(y_test, y_pred)
        print("classification accuracy: %s" % (accuracy,))
        precision = metrics.precision_score(y_test, y_pred)
        print("precision: %s" % (precision,))
        recall = metrics.recall_score(y_test, y_pred)
        print("recall: %s" % (recall,))
        f1 = metrics.f1_score(y_test, y_pred)
        print("f1 score: %s" % (f1,))

        results = Output+"QDA_metrics_test.txt"
        # ``with`` closes the report even if a write fails.
        with open(results, "w") as report:
            report.write("Quadratic Discriminant Analaysis estimator accuracy\n")
            report.write("Classification Accuracy Score: %f\n" % accuracy)
            report.write("Precision Score: %f\n" % precision)
            report.write("Recall Score: %f\n" % recall)
            report.write("F1 Score: %f\n" % f1)
            report.write("\n")
            report.write("True Value, Predicted Value, Iteration\n")
            for iteration, (actual, predicted) in enumerate(zip(y_test, y_pred), 1):
                report.write("%f,%f,%i\n" % (actual, predicted, iteration))

        title = "QDA %f"%test_size
        save = Output + "QDA_confusion_matrix"+"_%s.png"%test_size
        plot_confusion_matrix(y_test, y_pred, title, save)
    except (AttributeError):
        if configuration.normalization == 'normalize':
            # NOTE(review): this file name mentions Multinomial NB although
            # the function trains QDA -- confirm before renaming.
            results = Output+"Multinomial_NB_metrics_test.txt"
            with open(results, "w") as report:
                report.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n")
    lvltrace.lvltrace("LVLSortie dans qda split_test")
def create_symbol_forecast_model(self):
    """Fit and return a QDA model of Direction on Lag1/Lag2.

    Only rows indexed before ``self.model_start_test_date`` are used for
    fitting.
    """
    # Create a lagged series of the market index
    snpret = create_lagged_series(
        self.symbol_list[0],
        self.model_start_date,
        self.model_end_date,
        lags=5,
    )

    # Use the prior two days of returns as predictor values with direction
    # as the response.
    X = snpret[['Lag1', 'Lag2']]
    y = snpret["Direction"]

    # Training portion only: the test split previously built here was unused.
    start_test = self.model_start_test_date
    X_train = X[X.index < start_test]
    y_train = y[y.index < start_test]

    # Model to use is Quadratic Discriminant Analysis
    model = QDA()
    model.fit(X_train, y_train)
    return model
def performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """Quadratic Discriminant Analysis binary Classification.

    Values whose magnitude is below 0.0001 are replaced by 0.0001 in both
    feature sets before fitting.  When ``savemodel`` is truthy the fitted
    model is pickled to ``'<fout>-<timestamp>.pickle'``.  ``parameters`` is
    not used.

    Returns the score on ``(X_test, y_test)``.
    """
    def replaceTiny(frame):
        # Element-wise replacement.  The previous helper returned nothing and
        # tested the truth value of a whole column, so it could not work on
        # pandas input.  Assumes pandas objects, as the original ``.apply``
        # call did.
        return frame.mask(frame.abs() < 0.0001, 0.0001)

    X_train = replaceTiny(X_train)
    X_test = replaceTiny(X_test)

    clf = QDA()
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)
    return accuracy
def table_4_1():
    """Print training and test error rates for LDA, QDA and logistic regression.

    Reproduces table 4.1 in ESLii on the vowel data.  The sklearn
    implementation of logistic regression uses OvA instead of a true
    multinomial, which likely accounts for the worse results.
    """
    vowels_train = eslii.read_vowel_data()
    train_X = vowels_train[vowels_train.columns[1:]]
    train_y = vowels_train['y']

    vowels_test = eslii.read_vowel_data(train=False)
    test_X = vowels_test[vowels_test.columns[1:]]
    test_y = vowels_test['y']

    def error_rates(fitted):
        # (training error, test error) for a fitted estimator
        train_error = 1 - fitted.score(train_X, train_y)
        test_error = 1 - fitted.score(test_X, test_y)
        return train_error, test_error

    lda = LDA().fit(train_X, train_y)
    print("Linear discriminant analysis: {:.2f} {:.2f}".format(*error_rates(lda)))

    qda = QDA().fit(train_X, train_y)
    print("Quadratic discriminant analysis: {:.2f} {:.2f}".format(*error_rates(qda)))

    lr = LogisticRegression(C=1e30).fit(train_X, train_y)
    print("Logistic regression: {:.2f} {:.2f}".format(*error_rates(lr)))
class QDAClassifier(Classifier):
    '''Quadratic Discriminant analysis classifier'''

    def __init__(self):
        super(QDAClassifier, self).__init__()
        self.fig = 20
        self.is_trained = False
        self.is_trainable = True

    def train(self, classification_data, indices=None, settings_name=None, **kwargs):
        '''Run the base-class training step, then fit QDA on the columns
        listed in self.settings['indices'].  Returns self.'''
        super(QDAClassifier, self).train(classification_data, indices, settings_name, **kwargs)
        columns = self.settings['indices']
        self.qda = QDA(**self.classifier_kwargs)
        training_features = classification_data.data[:, columns]
        self.qda.fit(training_features, classification_data.are_hurr_actual)
        return self

    def classify(self, classification_data):
        '''Run the base-class classify step, then predict with the fitted QDA.
        The predictions are stored in self.are_hurr_pred and returned.'''
        super(QDAClassifier, self).classify(classification_data)
        columns = self.settings['indices']
        features = classification_data.data[:, columns]
        self.are_hurr_pred = self.qda.predict(features)
        return self.are_hurr_pred
def QDA_onNonDynamicData():
    """Fit and evaluate QDA separately on two activity subsets of UCI HAR.

    Labels [4, 5, 6] (non-dynamic) are processed first, then [1, 2, 3]
    (dynamic).  For each subset the f-score is printed and a confusion matrix
    is produced through ``common.createConfusionMatrix``.
    """
    # Parsing full training dataset
    XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt')

    # Parsing full testing dataset
    XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt')

    # (activity labels, whether to print the test-subset sizes)
    subsets = (([4, 5, 6], False), ([1, 2, 3], True))

    for activity_labels, show_sizes in subsets:
        X_sub, Y_sub = common.getDataSubset(XFull, YFull.flatten(), activity_labels)
        X_subTest, Y_subTest = common.getDataSubset(
            XFullTest, YFullTest.flatten(), activity_labels)
        if show_sizes:
            print("%s %s" % (len(X_subTest), len(Y_subTest)))

        # Fitting data using QDA classifier
        clf = QDA()
        clf.fit(X_sub, Y_sub.flatten())

        # Predict once and reuse the result for both reports.
        predicted = clf.predict(X_subTest)
        precision, recall, fscore = common.checkAccuracy(
            predicted, Y_subTest, activity_labels)
        common.createConfusionMatrix(
            predicted.flatten(), Y_subTest.flatten(), activity_labels)
        print(fscore)
def checkeachClassfier(train_x, train_y, test_x, test_y):
    """Fit a fixed set of classifiers, print their metrics, write submissions.

    For each classifier the values returned by ``get_Accs`` are printed for
    the training and test sets, and the test predictions are written to
    ``submission_<title>.csv``.  A failure in one classifier is reported as
    ``"<title>: error"`` and the loop continues.
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(class_weight='auto'),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        DecisionTreeClassifier(class_weight='auto'),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        RandomForestClassifier(class_weight='auto'),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]
    classtitle = [
        "KNeighborsClassifier", "SVC", "SVC weighted", "SVC(gamma=2, C=1)",
        "DecisionTreeClassifier", "DecisionTreeClassifier weighted",
        "RandomForestClassifier", "RandomForestClassifier weighted",
        "AdaBoostClassifier", "GaussianNB", "LDA", "QDA"
    ]

    for ctitle, clf in zip(classtitle, classifiers):
        try:
            clf.fit(train_x, train_y)

            train_pdt = clf.predict(train_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
            print(ctitle + ":")
            print("MCC, Acc_p , Acc_n, Acc_all(train): ")
            print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))

            test_pdt = clf.predict(test_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
            print("MCC, Acc_p , Acc_n, Acc_all(test): ")
            print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))

            fn = "submission_%s.csv" % ctitle
            # ``with`` closes the file even when a row cannot be written.
            with open(fn, 'w') as fout:
                fout.write("ID,target\n")
                for index, prediction in enumerate(test_pdt):
                    # The first column of each test row is written as the ID.
                    fout.write("%s,%s\n" % (str(int(test_x[index][0])), str(prediction)))
        except Exception:
            # Best effort per classifier, but Ctrl-C / SystemExit are no
            # longer swallowed as they were by the bare ``except``.
            print(ctitle + ": error")
        print("")
def fit_model(self):
    """Fit a Quadratic Discriminant Analyser on lagged index returns.

    Lag1 and Lag2 are the predictors and Direction the response.  Rows
    indexed before ``self.start_test`` train the model; predictor rows from
    that date onward are stored in ``self.predictors``.
    """
    # Lagged return series for the configured symbol and period
    returns = create_lagged_series(
        self.symbol, self.start_train, self.end_period, lags=5)

    lag_columns = ["Lag1", "Lag2"]
    inputs = returns[lag_columns]
    target = returns["Direction"]

    # Training rows come strictly before the test start
    training_inputs = inputs[inputs.index < self.start_test]
    training_target = target[target.index < self.start_test]

    # Rows used for direction forecasting
    self.predictors = inputs[inputs.index >= self.start_test]

    # Model construction and fitting
    self.model = QDA()
    self.model.fit(training_inputs, training_target)
def get_QDA(Xtrain, Xtest, Ytrain, Ytest):
    """Fit QDA on the training set, print train/test scores, return the model."""
    classifier = QDA()
    classifier.fit(Xtrain, Ytrain)

    score_on_train = classifier.score(Xtrain, Ytrain)
    score_on_test = classifier.score(Xtest, Ytest)

    # Reported as percentages with two decimals.
    print('QDA, train: {0:.02f}% '.format(score_on_train * 100))
    print('QDA, test: {0:.02f}% '.format(score_on_test * 100))
    return classifier
def get_LDA_performance(test_df, X_std, y):
    """Plot train/test error of LDA, QDA and KNN on LDA-reduced features.

    For each number of LDA components ``d`` in 1..10 the training data
    ``X_std`` is projected, the three classifiers are fitted on the
    projection, and ``1 - score`` is recorded for the training and test sets.
    One figure per classifier is shown.
    """
    # ``DataFrame.ix`` no longer exists in current pandas; ``.loc`` performs
    # the same label-based selection.
    X_test = test_df.loc[:, 'x.1':'x.10'].values
    # NOTE(review): the test features are standardised with their own
    # statistics rather than those of the training set -- confirm intended.
    X_std_test = StandardScaler().fit_transform(X_test)
    y_test = test_df.loc[:, 'y'].values

    dims = list(range(1, 11))
    lda_scores_training, lda_scores_test = [], []
    qda_scores_training, qda_scores_test = [], []
    knn_scores_training, knn_scores_test = [], []

    for d in dims:
        lda = LDA(n_components=d)
        Xred_lda_training = lda.fit_transform(X_std, y)
        Xred_lda_test = lda.transform(X_std_test)

        fits = (
            (LDA(), lda_scores_training, lda_scores_test),
            (QDA(), qda_scores_training, qda_scores_test),
            (KNeighborsClassifier(n_neighbors=10), knn_scores_training, knn_scores_test),
        )
        for model, training_errors, test_errors in fits:
            model.fit(Xred_lda_training, y)
            training_errors.append(1 - model.score(Xred_lda_training, y))
            test_errors.append(1 - model.score(Xred_lda_test, y_test))

    curves = (
        ("LDA vs LDA", lda_scores_training, lda_scores_test),
        ("QDA vs LDA", qda_scores_training, qda_scores_test),
        ("KNN vs LDA", knn_scores_training, knn_scores_test),
    )
    for position, (title, training_errors, test_errors) in enumerate(curves):
        # Plot against the actual component counts (1..10), not 0..9.
        plt.plot(dims, training_errors, 'r--', label="Train data")
        plt.plot(dims, test_errors, 'b--', label="Test data")
        plt.title(title)
        if position == 0:
            # Only the first figure carried axis labels originally.
            plt.xlabel('k')
            plt.ylabel('Score')
        # Without a legend the curve labels above are never displayed.
        plt.legend()
        plt.show()
def classifier_comparison(X, y):
    """
    Classifier comparison.

    Args:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]

    Returns:
        None
    """
    from sklearn import grid_search
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.lda import LDA
    from sklearn.qda import QDA
    # Import the submodule explicitly: ``import scipy`` alone does not
    # guarantee that ``scipy.stats`` has been loaded.
    import scipy.stats

    # Exhaustive Grid Search
    exhaustive_parameters = {'kernel':['rbf'], 'C':[1, 10, 100, 1000], 'gamma':[1e-3, 1e-4]}
    clf_SVC_exhaustive = grid_search.GridSearchCV(SVC(), exhaustive_parameters)

    # Randomized Parameter Optimization
    randomized_parameter = {'kernel':['rbf'],
                            'C': scipy.stats.expon(scale=100),
                            'gamma': scipy.stats.expon(scale=.1)}
    clf_SVC_randomized = grid_search.RandomizedSearchCV(SVC(), randomized_parameter)

    names = ["Linear SVM", "RBF SVM", "RBF SVM with Grid Search",
             "RBF SVM with Random Grid Search", "Decision Tree",
             "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
    classifiers = [
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        clf_SVC_exhaustive,
        clf_SVC_randomized,
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()]

    # Each classifier is handed to ``train_classifier`` with the full data.
    for name, clf in zip(names, classifiers):
        logger.info('Use %s:', name)
        train_classifier(clf, X, y)
def supervised_classification(
        self, input, label, classification_method='RandomForestClassifier'):
    """Build the classifier named by ``classification_method`` and fit it.

    The method name must be one of the supported names asserted below.  The
    chosen estimator is imported lazily, fitted on ``(input, label)`` and
    returned.
    """
    supported = set([
        'KNeighborsClassifier', 'SVC', 'DecisionTreeClassifier',
        'RandomForestClassifier', 'AdaBoostClassifier', 'GaussianNB', 'LDA',
        'QDA'
    ])
    assert classification_method in supported

    # One lazy factory per method so only the selected module is imported.
    def make_knn():
        from sklearn.neighbors import KNeighborsClassifier
        return KNeighborsClassifier(n_neighbors=10)

    def make_svc():
        from sklearn.svm import SVC
        return SVC(gamma=2, C=1)

    def make_tree():
        from sklearn.tree import DecisionTreeClassifier
        return DecisionTreeClassifier(max_depth=5)

    def make_adaboost():
        from sklearn.ensemble import AdaBoostClassifier
        return AdaBoostClassifier()

    def make_gaussian_nb():
        from sklearn.naive_bayes import GaussianNB
        return GaussianNB()

    def make_lda():
        from sklearn.lda import LDA
        return LDA()

    def make_qda():
        from sklearn.qda import QDA
        return QDA()

    def make_random_forest():
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(max_depth=5,
                                      n_estimators=10,
                                      max_features=1)

    factories = {
        'KNeighborsClassifier': make_knn,
        'SVC': make_svc,
        'DecisionTreeClassifier': make_tree,
        'AdaBoostClassifier': make_adaboost,
        'GaussianNB': make_gaussian_nb,
        'LDA': make_lda,
        'QDA': make_qda,
    }
    # Any name without its own factory uses the random forest.
    classifier = factories.get(classification_method, make_random_forest)()

    # Train classifier
    classifier.fit(input, label)
    return classifier
def checkeachClassfier(train_x, train_y, test_x, test_y):
    """Fit a fixed set of classifiers and print their train/test metrics.

    For each classifier the values returned by ``get_Accs`` are printed for
    the training and test sets.  A failure in one classifier is reported as
    ``"<title>: error"`` and the loop continues.
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(class_weight='auto'),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        DecisionTreeClassifier(class_weight='auto'),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        RandomForestClassifier(class_weight='auto'),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]
    classtitle = [
        "KNeighborsClassifier", "SVC", "SVC weighted", "SVC(gamma=2, C=1)",
        "DecisionTreeClassifier", "DecisionTreeClassifier weighted",
        "RandomForestClassifier", "RandomForestClassifier weighted",
        "AdaBoostClassifier", "GaussianNB", "LDA", "QDA"
    ]

    for ctitle, clf in zip(classtitle, classifiers):
        try:
            clf.fit(train_x, train_y)

            train_pdt = clf.predict(train_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
            print(ctitle + ":")
            print("MCC, Acc_p , Acc_n, Acc_all(train): ")
            print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))

            test_pdt = clf.predict(test_x)
            MCC, Acc_p, Acc_n, Acc_all = get_Accs(test_y, test_pdt)
            print("MCC, Acc_p , Acc_n, Acc_all(test): ")
            print("%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all)))
        except Exception:
            # Best effort per classifier, but Ctrl-C / SystemExit are no
            # longer swallowed as they were by the bare ``except``.
            print(ctitle + ": error")
        print("")
def main():
    """Fit several classifiers on the MAGIC data and plot their ROC curves.

    Features are standardised, 33% of the rows are held out, and each
    classifier's test accuracy is printed while its ROC curve is added to a
    single figure.
    """
    (X, Y, Ynames) = load_magic_data()
    X = StandardScaler().fit_transform(X)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33, random_state=None)

    C = 5.0
    classifiers = {
        'L1 logistic': LogisticRegression(C=C, penalty='l1'),
        'L2 logistic': LogisticRegression(C=C, penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=11),
        'NB': GaussianNB(),
        'RF5': RandomForestClassifier(n_estimators=5),
        'RF50': RandomForestClassifier(n_estimators=50),
        'AdaBoost': AdaBoostClassifier(),
        'LDA': LDA(),
        'QDA': QDA()
    }

    plt.figure(figsize=(8, 8))
    # ``items()`` works on both Python 2 and 3; ``iteritems()`` is
    # Python 2 only.
    for name, clf in classifiers.items():
        clf.fit(Xtrain, Ytrain)
        probs = clf.predict_proba(Xtest)
        # Column 1 of predict_proba is used as the ROC score.
        fpr, tpr, _ = roc_curve(Ytest, probs[:, 1])
        roc_auc = auc(fpr, tpr)
        print('For model %s accuracy = %s' % (name, clf.score(Xtest, Ytest)))
        plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (name, roc_auc))

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
def evaluate(data, targets):
    """Build a fixed list of models and run ``ModelComparison`` on them.

    Sparse ``data`` is converted to a dense array first.
    """
    print("Creating models...")
    # NOTE(review): KNeighborsRegressor is a regressor among classifiers --
    # confirm this is intended.
    models = [
        LinearSVC(),
        SVC(kernel='rbf'),
        GaussianNB(),
        LDA(),
        QDA(),
        LogisticRegression(),
        KNeighborsRegressor(),
        RandomForestClassifier(n_estimators=100,
                               criterion="entropy",
                               random_state=1234,
                               n_jobs=-1),
    ]

    dense_data = data.toarray() if sparse.issparse(data) else data

    comparison = ModelComparison(dense_data, targets, folds=10, numCV=3, models=models)
    comparison.evaluate()
def evaluate(data, targets):
    """Evaluate a fixed set of models with ten-fold cross validation.

    Class priors for LDA and QDA are the relative frequencies of the integer
    labels in ``targets``.  Mean and standard deviation of the class-averaged
    accuracy are printed per model, ROC evaluation is run, and the total
    running time is printed.
    """
    # The prior is derived from ``targets``; the earlier code read an
    # undefined name ``y`` here.
    label_counts = numpy.bincount(numpy.asarray(targets).astype(int))
    prior = label_counts / float(len(targets))

    models = [
        LDA(priors=prior),
        SVC(probability=True, class_weight="auto", kernel="linear"),
        LogisticRegression(class_weight="auto"),
        GaussianNB(),
        KNeighborsClassifier(),
        QDA(priors=prior),
        RandomForestClassifier(n_estimators=100,
                               criterion="entropy",
                               n_jobs=-1,
                               random_state=123456),
        SVC(probability=True, class_weight="auto")
    ]
    model_names = [
        "LDA", "Linear SVM", "Logistic Regression", "Naive Bayes", "k-NN",
        "QDA", "Random Forest", "SVM w/ RBF"
    ]

    # evaluate using ModelEvaluation class
    mevaluator = model_evaluation.TenFoldCrossValidation(
        data=data,
        targets=targets,
        models=models,
        model_names=model_names,
        scale=True)

    start = time.time()
    caa_eval = mevaluator.evaluate(metrics.class_averaged_accuracy_score)
    # ``items()`` works on both Python 2 and 3.
    for key, value in caa_eval.items():
        # Keys are cut at the first "(" to get a short model name.
        model_str = key.split("(")[0]
        mean_str = str(numpy.around(numpy.mean(value), decimals=3))
        std_str = str(numpy.around(numpy.std(value), decimals=3))
        print("%s %s (%s)" % (model_str, mean_str, std_str))

    mevaluator.evaluate_roc()
    print("Overall running time: %s" % (time.time() - start,))
def test_feature_splitter(size=2000):
    """Smoke-test FeatureSplitter, DumbSplitter and ChainClassifiers.

    A sample of ``size`` rows is generated, ``column0`` is converted to
    integers clipped to [-2, 2], and the test-set score of each estimator is
    printed.
    """
    X, y = commonutils.generate_sample(size, 10, distance=0.5)
    # ``numpy.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    X['column0'] = numpy.clip(
        numpy.array(X['column0']).astype(int), -2, 2)
    trainX, testX, trainY, testY = commonutils.train_test_split(X, y)

    base_estimators = {'rf': RandomForestClassifier()}
    splitter = FeatureSplitter('column0',
                               base_estimators=base_estimators,
                               final_estimator=RandomForestClassifier())
    splitter.fit(trainX, trainY)
    print(splitter.score(testX, testY))

    # Reference scores for comparison
    print(RandomForestClassifier().fit(trainX, trainY).score(testX, testY))
    print(
        DumbSplitter('column0', base_estimator=RandomForestClassifier()).fit(
            trainX, trainY).score(testX, testY))

    chain = OrderedDict()
    chain['QDA'] = QDA()
    chain['LDA'] = LDA()
    chain['RF'] = RandomForestClassifier()
    print(ChainClassifiers(chain).fit(trainX, trainY).score(testX, testY))
    print(LDA().fit(trainX, trainY).score(testX, testY))
def __init__(self, training_path, testing_path):
    """Store the data paths, reset all containers and build the classifiers.

    Finishes by calling ``get_training_image_list`` and
    ``get_testing_image_list``.
    """
    self.training_path = training_path
    self.testing_path = testing_path

    # Feature matrices, not populated here
    self.training_features = None
    self.testing_features = None

    # Image lists and labels start out empty
    self.training_image_list = []
    self.training_labels = []
    self.testing_image_list = []
    self.testing_labels = []
    self.predicted_testing_labels = []
    self.class_map = {}

    # NOTE(review): the class count comes from the fixed directory
    # ./data/training rather than from ``training_path`` -- confirm.
    training_dir = os.path.join('.', 'data', 'training')
    self.n_classes = len(os.listdir(training_dir))

    self.classifiers = {
        'knn': KNeighborsClassifier(3),
        'svm_linear': SVC(kernel="linear", C=0.025),
        'svm': SVC(gamma=2, C=1),
        'tree': DecisionTreeClassifier(max_depth=5),
        'rf': RandomForestClassifier(max_depth=5,
                                     n_estimators=10,
                                     max_features=1),
        'adb': AdaBoostClassifier(),
        'gauss': GaussianNB(),
        'lda': LDA(),
        'qda': QDA(),
        'ann': neuralNetwork(self.n_classes),
    }

    self.get_training_image_list()
    self.get_testing_image_list()
def random_methods(data_train1, target_train1):
    """Fit a fixed list of estimators and print score and report for each.

    Evaluation uses the module-level ``data_test`` and ``target_test``.
    """
    rng = np.random.RandomState(96235)
    names = ["SGD", "Nearest Neighbors", "ensembel", "Decision Tree",
             "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
    # NOTE(review): the "ensembel" entry is an AdaBoostRegressor, yet a
    # classification report is printed for it below -- confirm intended.
    classifiers = [
        SGDClassifier(loss='hinge', penalty='l2', alpha=0.0005, n_iter=200,
                      random_state=42, n_jobs=-1, average=True),
        KNeighborsClassifier(10),
        AdaBoostRegressor(DecisionTreeRegressor(max_depth=25),
                          n_estimators=300, random_state=rng),
        DecisionTreeClassifier(max_depth=11),
        RandomForestClassifier(max_depth=21, n_estimators=21, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA()
    ]

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        print("Fitting " + name + "...")
        clf.fit(data_train1, target_train1)
        print("Predicting...")
        score = clf.score(data_test, target_test)
        print(score)
        # Reuse the model fitted above: a second fit doubled the training
        # time and, for unseeded estimators, could make the report describe
        # a different model than the score.
        predicted_test = clf.predict(data_test)
        print(metrics.classification_report(target_test, predicted_test))
def BuildModel(self, data, labels):
    """Train an SQDA classifier on ``data``/``labels`` and return the fitted object."""
    model = SQDA()
    model.fit(data, labels)
    return model
# Filter out all the warnings warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.filterwarnings("ignore", category=RuntimeWarning) warnings.filterwarnings("ignore", category=FutureWarning) # All the ML algorithms we're going to try algorithms = [ (RandomForestClassifier(max_depth=5, n_jobs=-1, n_estimators=10, max_features=10), "Random Forest"), (GaussianNB(), "Gaussian Naive Bayes"), (LogisticRegression(), "Logistic Regression"), (LinearSVC(), "Support Vector Machine"), (DecisionTreeClassifier(max_depth=10), "Decision Tree"), (QDA(), "QDA"), (GradientBoostingClassifier(), "BOOSTING!!!"), (Pipeline(steps=[('rbm', BernoulliRBM()), ('logistic', LogisticRegression())]), "Bernoulli Neural Network Combo Logit") ] # Create dataset of (X, y) where X is a list of (answer, question), y is labels. # In this case, y is the label of whether the answer is correct for the question def generateDataset(datafile): generatedFile = "../data/cayman_distance_data/distanceDataset.pickle" # No need to generate twice. if (isfile(generatedFile)): return loadPickle(generatedFile)
from varplot import *
from sklearn.qda import QDA
import numpy as np
import pickle

# Training and test arrays are read from .npy files in the working directory.
data = np.load("sd.npy")
truth = np.load("truth.npy")
testdata = np.load("sd_test.npy")
testtruth = np.load("truth_test.npy")
print(len(data))

clf = QDA()
clf.fit(data, truth)

# Persist the fitted classifier; the context manager closes the file even if
# pickling raises.
with open("qda.pkl", 'wb') as output:
    pickle.dump(clf, output)

# Overall score on the training data, then on the test data.
print(clf.score(data, truth))
print(clf.score(testdata, testtruth))

# Row indices per label value: 2 -> s / st, 1 -> g / gt
# (train / test respectively).
s = np.where(truth == 2)[0]
st = np.where(testtruth == 2)[0]
g = np.where(truth == 1)[0]
gt = np.where(testtruth == 1)[0]

print("Stars")
print(clf.score(data[s], truth[s]))
class Classifier(BiPlot):
    '''
    To hold methods and data to support classification of measurements in a STOQS database.
    See http://scikit-learn.org/stable/auto_examples/plot_classifier_comparison.html
    '''
    # Estimators keyed by name; the instances are created once, when the
    # class body is executed.
    classifiers = {
        'Nearest_Neighbors': KNeighborsClassifier(3),
        'Linear_SVM': SVC(kernel="linear", C=0.025),
        'RBF_SVM': SVC(gamma=2, C=1),
        'Decision_Tree': DecisionTreeClassifier(max_depth=5),
        'Random_Forest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'AdaBoost': AdaBoostClassifier(),
        'Naive_Bayes': GaussianNB(),
        'LDA': LDA(),
        'QDA': QDA()
    }

    def getActivity(self, mpx, mpy):
        '''
        Return activity object which MeasuredParameters mpx and mpy belong to.
        Only the first matching Activity is returned; an Exception is raised
        when none matches.
        '''
        acts = Activity.objects.using(self.args.database).filter(
            instantpoint__measurement__measuredparameter__id__in=(mpx, mpy)).distinct()
        if not acts:
            print("acts = %s" % acts)
            raise Exception('Not exactly 1 activity returned with SQL = \n%s' % str(acts.query))

        return acts[0]

    def saveCommand(self, doOption):
        '''
        Save the command executed to a Resource and return it for the doXxxx() method to associate it with the resources it creates.
        The doOption argument is not used by the current implementation.
        '''
        rt, created = ResourceType.objects.using(self.args.database).get_or_create(
            name=LABEL, description='metadata')
        r, created = Resource.objects.using(self.args.database).get_or_create(
            name=COMMANDLINE, value=self.commandline, resourcetype=rt)

        return r

    def saveLabelSet(self, clResource, label, x_ids, y_ids, description, typeName, typeDescription):
        '''
        Save the set of labels in MeasuredParameterResource. Accepts 2 input vectors. (TODO: generalize to N input vectors);
        description is used to describe the criteria for assigning this label. The typeName and typeDecription may be used to
        refer to the grouping, and associate via the grouping the other labels made in the heuristic applied.
        '''
        try:
            # Label
            rt, created = ResourceType.objects.using(self.args.database).get_or_create(
                name=typeName, description=typeDescription)
            r, created = Resource.objects.using(self.args.database).get_or_create(
                name=LABEL, value=label, resourcetype=rt)
            # Label's description
            rdt, created = ResourceType.objects.using(self.args.database).get_or_create(
                name=LABEL, description='metadata')
            rd, created = Resource.objects.using(self.args.database).get_or_create(
                name=DESCRIPTION, value=description, resourcetype=rdt)
            rr = ResourceResource(fromresource=r, toresource=rd)
            rr.save(using=self.args.database)
            # Associate with commandlineResource
            ResourceResource.objects.using(self.args.database).get_or_create(
                fromresource=r, toresource=clResource)
        except IntegrityError as e:
            # Best effort: report the error and carry on with the associations below
            print(e)
            print("Ignoring")

        # Associate MeasuredParameters with Resource
        if self.args.verbose:
            print(" Saving %d values of '%s' with type '%s'" % (len(x_ids), label, typeName))
        for x_id, y_id in zip(x_ids, y_ids):
            a = self.getActivity(x_id, y_id)
            mp_x = MeasuredParameter.objects.using(self.args.database).get(pk=x_id)
            mp_y = MeasuredParameter.objects.using(self.args.database).get(pk=y_id)
            # Both members of the pair get the same label Resource
            mpr_x, created = MeasuredParameterResource.objects.using(self.args.database).get_or_create(
                activity=a, measuredparameter=mp_x, resource=r)
            mpr_y, created = MeasuredParameterResource.objects.using(self.args.database).get_or_create(
                activity=a, measuredparameter=mp_y, resource=r)

    def removeLabels(self, labeledGroupName, label=None, description=None, commandline=None):
        '''
        Delete labeled MeasuredParameterResources that have ResourceType.name=labeledGroupName (such as 'Labeled Plankton').
        Restrict deletion to the other passed in options, if specified: label is like 'diatom', description is like
        'Using Platform dorado, Parameter {'salinity': ('33.65', '33.70')} from 20130916T124035 to 20130919T233905' (commandline is too long
        to show in this doc string - see examples in usage note).

        Note: Some metadatda ResourceTypes will not be removed even though the Resources that use them will be removed.
        '''
        # Remove MeasuredParameter associations with Resource (Labeled data)
        mprs = MeasuredParameterResource.objects.using(self.args.database).filter(
            resource__resourcetype__name=labeledGroupName).select_related(depth=1)
        if label:
            mprs = mprs.filter(resource__name=LABEL, resource__value=label)

        if self.args.verbose > 1:
            print(" Removing MeasuredParameterResources with type = '%s' and label = %s" % (labeledGroupName, label))

        # Remember each Resource before its association row is deleted
        rs = []
        for mpr in mprs:
            rs.append(mpr.resource)
            mpr.delete(using=self.args.database)

        # Remove Resource associations with Resource (label metadata), make rs list distinct with set() before iterating on the delete()
        if label and description and commandline:
            rrs = ResourceResource.objects.using(self.args.database).filter(
                (Q(fromresource__name=LABEL) & Q(fromresource__value=label)) &
                ((Q(toresource__name=DESCRIPTION) & Q(toresource__value=description)) |
                 (Q(toresource__name=COMMANDLINE) & Q(toresource__value=commandline))))
            if self.args.verbose > 1:
                print(" Removing ResourceResources with fromresource__value = '%s' and toresource__value = '%s'" % (label, description))

            for rr in rrs:
                rr.delete(using=self.args.database)
        else:
            if self.args.verbose > 1:
                print(" Removing Resources associated with labeledGroupName = %s'" % labeledGroupName)

            for r in set(rs):
                r.delete(using=self.args.database)

    def createLabels(self, labeledGroupName):
        '''
        Using discriminator, mins, and maxes label MeasuredParameters in the database so that we can do supervised learning.
        A label whose data request raises NoPPDataException is reported and skipped.
        '''
        sdt = datetime.strptime(self.args.start, '%Y%m%dT%H%M%S')
        edt = datetime.strptime(self.args.end, '%Y%m%dT%H%M%S')

        commandlineResource = self.saveCommand('createLabels')

        for label, min_val, max_val in zip(self.args.labels, self.args.mins, self.args.maxes):
            # Multiple discriminators are possible...
            pvDict = {self.args.discriminator: (min_val, max_val)}
            if self.args.verbose:
                print("Making label '%s' with discriminator %s" % (label, pvDict))

            try:
                x_ids, y_ids, xx, yy, points = self._getPPData(
                    sdt, edt, self.args.platform, self.args.inputs[0], self.args.inputs[1],
                    pvDict, returnIDs=True, sampleFlag=False)
            except NoPPDataException as e:
                print(e)
                # Nothing was returned for this label: go on to the next one
                # rather than using undefined ids or the previous label's ids.
                continue

            if self.args.verbose:
                print(" (%d, %d) MeasuredParameters returned from database %s" % (len(x_ids), len(y_ids), self.args.database))

            description = 'Using Platform %s, Parameter %s from %s to %s' % (
                self.args.platform, pvDict, self.args.start, self.args.end)

            # With clobber set, labels matching this run are removed before saving
            if self.args.clobber:
                self.removeLabels(labeledGroupName, label, description, commandlineResource.value)

            self.saveLabelSet(commandlineResource, label, x_ids, y_ids, description, labeledGroupName,
                              'Labeled with %s as discriminator' % self.args.discriminator)
def cal_val_analysis(cols=None):
    '''Calibrates/validates classifiers from 1990 to 2009

    Calibrates on even years, validates on odd
    Trains all variants of the scikit.learn SGD classifier,
    as well as LDA/QDA. SGD pre-scales data
    Calc's FP/FN/TP/TN + sens, ppv and sens*ppv.

    :param cols: columns of the results frames used as features; the
        module-level COLS is used when this is None.

    An exception raised while processing one classifier is printed and the
    next classifier is tried.
    '''
    # Load the data.
    # Identity test: "== None" is evaluated element-wise on array-like cols.
    if cols is None:
        cols = COLS

    cal_years = range(1990, 2010, 2)
    val_years = range(1991, 2010, 2)
    cal_cfm = get_results(cal_years, settings.RESULTS)
    val_cfm = get_results(val_years, settings.RESULTS)

    # Set up classifers.
    # classifiers[k] is paired with scalers[k]; None means "use raw data".
    classifiers = []
    scalers = []
    for sgd_loss in SGD_LOSSES:
        for sgd_penalty in SGD_PENALTY:
            classifiers.append(SGDClassifier(loss=sgd_loss, penalty=sgd_penalty))
            scalers.append(StandardScaler())

    classifiers.append(LDA())
    scalers.append(None)

    classifiers.append(QDA())
    scalers.append(None)

    # Perform classification.
    for classifier, scaler in zip(classifiers, scalers):
        print('Analysing with classifier {}'.format(classifier))
        try:
            for cfm, is_cal in [(cal_cfm, True), (val_cfm, False)]:
                print('CAL' if is_cal else 'VAL')

                data = cfm[cols].values.astype(np.float32)
                are_hurr = ~cfm.bt_wind.isnull() & (cfm.is_hurr)

                if scaler is None:
                    scaled_data = data
                else:
                    # The scaler is fitted on calibration data only and then
                    # reused unchanged for validation.
                    if is_cal:
                        scaler.fit(data)
                    scaled_data = scaler.transform(data)

                if is_cal:
                    fit(classifier, scaled_data, are_hurr)
                else:
                    print(', '.join(cols))
                    predict(classifier, scaled_data, are_hurr)
            print('')
        except Exception as e:
            print('Error with classifier {}'.format(classifier))
            print(e)
import numpy as np
import pickle
import os
from preprocessing import *

# Estimators keyed by a short name.
classifiers = {
    'knn': KNeighborsClassifier(3),
    'svm_linear': SVC(kernel="linear", C=0.025),
    'svm': SVC(gamma=2, C=1),
    'tree': DecisionTreeClassifier(max_depth=5),
    'rf': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'adb': AdaBoostClassifier(),
    'etc': ExtraTreesClassifier(),
    'gauss': GaussianNB(),
    'lda': LDA(),
    'qda': QDA(),
    # 'ann': neuralNetwork( 16 )
}


def feature_selection(training_data, target_data, test_data):
    """Convert the inputs to float arrays and drop low-variance features.

    Only the part of the function visible in this excerpt is documented.

    :param training_data: training features; its ``columns`` are printed.
    :param target_data: training targets.
    :param test_data: test features, reduced with the selector fitted on the
        training features.
    """
    # The builtin ``float`` is what the removed ``np.float`` alias meant.
    X1 = np.array(training_data).astype(float)
    y = np.array(target_data).astype(float)
    X1_test = np.array(test_data).astype(float)

    features = training_data.columns
    print(features)
    X_index = np.arange(X1.shape[-1])

    # Variance Threshold: fitted on the training matrix, then applied
    # unchanged to the test matrix.
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X1 = sel.fit_transform(X1)
    X1_test = sel.transform(X1_test)
__author__ = 'chaouki'

from sklearn.qda import QDA
import numpy as np
from sklearn import cross_validation

# Each line of the feature file is comma separated: every field but the last
# is a float feature, the last field is the label.
X = []
y = []
with open(".\\File_Features.txt", "r") as g:
    for line in g:
        tmp = line.split(",")
        X.append([float(field) for field in tmp[:-1]])
        y.append(tmp[-1].replace("\n", ""))

X = np.array(X)  # Features matrix
y = np.array(y)  # Label vector
n_samples = len(X)

# Mean of the 10-fold weighted-F1 scores, as a percentage with two decimals.
fold_scores = cross_validation.cross_val_score(
    QDA(), X, y, cv=10, scoring='f1_weighted')
rate = "{0:.2f}".format(np.mean(fold_scores * 100))

print(" ".join([
    "Recognition Rate with Quadratic Discriminant Analysis (QDA) classifier :",
    rate,
    "%",
]))
from sklearn.lda import LDA from sklearn.qda import QDA from supervised_pca import SupervisedPCAClassifier total_range = 100 performances = {} names = [ "LDA", "QDA", "SuperPCA thres=0", "SuperPCA thres=0.3", "SuperPCA thres=0.7" ] ncomponents = {names[2]: [], names[3]: [], names[4]: []} classifiers = [ LDA(), QDA(), SupervisedPCAClassifier(threshold=0), SupervisedPCAClassifier(threshold=0.3), SupervisedPCAClassifier(threshold=0.7) ] for name in names: performances[name] = [] # iterate over classifiers for i in range(1, total_range): X, y = make_classification(n_features=i * 10, n_redundant=i * 5, n_informative=i, random_state=1,
ell.set_clip_box(splot.bbox) ell.set_alpha(0.5) splot.add_artist(ell) splot.set_xticks(()) splot.set_yticks(()) def plot_lda_cov(lda, splot): plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red') plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue') def plot_qda_cov(qda, splot): plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red') plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue') for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]): # LDA lda = LDA(solver='svd', store_covariance=True) y_pred = lda.fit(X, y).predict(X) splot = plot_data(lda, X, y, y_pred, fig_index = 2 * i + 1) plot_lda_cov(lda, splot) plt.axis('tight') # QDA qda = QDA() y_pred = qda.fit(X, y, store_covariances=True).predict(X) splot = plot_data(qda, X, y, y_pred, fig_index = 2 * i + 2) plot_qda_cov(qda, splot) plt.axis('tight') plt.suptitle('LDA vs QDA') plt.show()
# Sample-size bookkeeping.
N_tot = len(y)
N_st = np.sum(y == 0)  # number of entries of y equal to 0
N_rr = N_tot - N_st  # all remaining entries
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr

#----------------------------------------------------------------------
# perform QDA
# One classifier per nc = 1 .. X.shape[1]; each one is trained and evaluated
# on the first nc feature columns only.
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)

for nc in Ncolors:
    clf = QDA()
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])

    classifiers.append(clf)
    predictions.append(y_pred)

# One entry per value of nc, in the same order as Ncolors.
predictions = np.array(predictions)

completeness, contamination = completeness_contamination(predictions, y_test)

print "completeness", completeness
print "contamination", contamination

#------------------------------------------------------------
# Compute the decision boundary
# will hold all ids and all predictions all_ids = [] all_predictions_lda = [] all_predictions_qda = [] all_predictions_lr = [] all_predictions_avg = [] subsample = 100 num_subjects = 13 num_series = 9 human_labels = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase', 'LiftOff', 'Replace', 'BothReleased'] lr = LogisticRegression() qda = QDA() lda = LDA() for subject_id in range(1,num_subjects): y_raw = [] raw = [] # read in training data for series_id in range(1,num_series): data,labels = prepare_data_train(subject_id, series_id) raw.append(data) y_raw.append(labels) # concatanenate the data sets into one dataframe X = pd.concat(raw) y = pd.concat(y_raw)
# Sample-size bookkeeping.
N_tot = len(y)
N_st = np.sum(y == 0)  # number of entries of y equal to 0
N_rr = N_tot - N_st  # all remaining entries
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr

#----------------------------------------------------------------------
# perform QDA
# One classifier per nc = 1 .. X.shape[1]; each one is trained and evaluated
# on the first nc feature columns only.
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)

for nc in Ncolors:
    clf = QDA()
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])

    classifiers.append(clf)
    predictions.append(y_pred)

# One entry per value of nc, in the same order as Ncolors.
predictions = np.array(predictions)

completeness, contamination = completeness_contamination(
    predictions, y_test)

print "completeness", completeness
print "contamination", contamination

#------------------------------------------------------------
# Second sheet of the workbook: two feature columns, each reshaped to a
# column vector and joined side by side into an (n, 2) array.
df2 = pd.read_excel('feat.xlsx', sheetname=1, header=1)
x2 = np.array(df2[feature_x]).reshape(-1, 1)
y2 = np.array(df2[feature_y]).reshape(-1, 1)
normal_pt = np.hstack([x2, y2])

# In[48]:

#Sort given training data with corresponding labels
# Label 0 for every row of normal_pt, label 1 for every row of cancer_pt.
# NOTE(review): size / ndim equals the row count for normal_pt only because
# it is 2-D with exactly two columns; presumably cancer_pt has the same
# layout -- confirm (shape[0] would state the intent directly).
nor_n = np.zeros(int(normal_pt.size / normal_pt.ndim))
can_n = np.ones(int(cancer_pt.size / cancer_pt.ndim))
labels = np.hstack((nor_n, can_n))
train_data = np.vstack((normal_pt, cancer_pt))

# In[49]:

clf = QDA()
trained_clf = clf.fit(train_data, labels)
# Specificity: share of normal points predicted as 0.
# NOTE(review): if this runs under Python 2 without
# "from __future__ import division", the integer division below would
# truncate -- confirm the interpreter version.
normal_pred = trained_clf.predict(normal_pt)
trueneg_n = (normal_pred == 0).sum()
specificity = trueneg_n / int(normal_pt.size / normal_pt.ndim)

# In[50]:

# Sensitivity: share of cancer points predicted as 1 (same division caveat).
cancer_pred = trained_clf.predict(cancer_pt)
truepos_n = (cancer_pred == 1).sum()
sensitivity = truepos_n / int(cancer_pt.size / cancer_pt.ndim)

# In[51]:

#Generate grids for the entire plot
if inRedox:
from sklearn.qda import QDA

h = .02  # step size in the mesh

# names[k] is the display name of classifiers[k]; keep both lists in the
# same order.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()]

# Two-feature synthetic problem; uniform noise in [0, 2) is then added in
# place to every coordinate.
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

# Three datasets: moons, circles and the problem built above.
datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]

figure = pl.figure(figsize=(27, 9))
i = 1
# iterate over datasets
# QDA resembles LDA, but does not require the classes to share one covariance.
from sklearn.qda import QDA
import numpy as np
import matplotlib.pyplot as plt

# Six 2-D points: the first three carry label 1, the last three label 2.
X = np.array([[-1, -1], [-2, -1], [-3, -2],
              [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

# Visualization: one scatter call per class (label 1 green, label 2 blue).
plt.figure(1)
for class_label, class_color in ((1, 'g'), (2, 'b')):
    plt.scatter(X[y == class_label, 0], X[y == class_label, 1],
                color=class_color)
plt.title('X Data Set Visualization')

# Classification: fit on all points, then predict a single new point.
clf = QDA().fit(X, y)
print(clf.predict([[-0.8, -1]]))

plt.show()
h = .02 # step size in the mesh names = [ "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA" ] classifiers = [ KNeighborsClassifier(3), SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), AdaBoostClassifier(), GaussianNB(), LDA(), QDA() ] X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1) rng = np.random.RandomState(2) X += 2 * rng.uniform(size=X.shape) linearly_separable = (X, y) datasets = [ make_moons(noise=0.3, random_state=0), make_circles(noise=0.2, factor=0.5, random_state=1), linearly_separable ]
splot.set_yticks(()) def plot_lda_cov(lda, splot): plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red') plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue') def plot_qda_cov(qda, splot): plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red') plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue') ############################################################################### for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]): # LDA lda = LDA(solver="svd", store_covariance=True) y_pred = lda.fit(X, y).predict(X) splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1) plot_lda_cov(lda, splot) plt.axis('tight') # QDA qda = QDA() y_pred = qda.fit(X, y, store_covariances=True).predict(X) splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2) plot_qda_cov(qda, splot) plt.axis('tight') plt.suptitle('LDA vs QDA') plt.show() plt.savefig('image.png')
def QuadDA(X_train, Y_train):
    """Fit a QDA classifier on the training data and return the fitted object."""
    model = QDA()
    model.fit(X_train, Y_train)
    return model
def quadraticDiscriminantAnalysis(dataFile, outputFolder, regParam, parameters):
    """Run QDA on every train/test split listed in a YAML manifest.

    The manifest must provide 'training' and 'testing' (parallel lists of CSV
    paths), 'inputFile', 'label' (name of the target column) and
    'split_params'. Optional feature-selection, feature-extraction and
    preprocessing entries are copied to the summary when present.

    For split number i (1-based) the file ``result<i>.csv`` is written to
    ``outputFolder`` with the true label column plus a 'predictions' column.
    A summary is written to ``results.yaml`` in the same folder.

    :param dataFile: path of the YAML manifest.
    :param outputFolder: folder for the result files; created when missing.
    :param regParam: passed to QDA as ``reg_param``.
    :param parameters: dict recorded as 'algo_params'; when empty it is
        filled in place with ``{'parameter': 'default'}``.
    """
    # NOTE(review): yaml.load can construct arbitrary Python objects; prefer
    # yaml.safe_load if the manifest may come from an untrusted source.
    with open(dataFile) as manifest:
        inputData = yaml.load(manifest)
    trainingSet = inputData['training']
    testingSet = inputData['testing']
    inputFile = inputData['inputFile']
    label = inputData['label']
    resultSet = []

    if not os.path.exists(outputFolder):
        try:
            os.makedirs(outputFolder)
        except OSError as exc:
            # An already existing folder (EEXIST) is fine; anything else is not.
            if exc.errno != errno.EEXIST:
                raise

    for i, trainingFile in enumerate(trainingSet):
        train_df = pd.read_csv(trainingFile)
        train_labels = train_df[label]
        train_features = train_df.drop(label, axis=1)

        test_df = pd.read_csv(testingSet[i])
        # Start from the true labels so they sit next to the predictions.
        test_predictions = pd.DataFrame(test_df[label])
        test_features = test_df.drop(label, axis=1)

        qda = QDA(reg_param=regParam)
        qda.fit(train_features, train_labels)
        test_predictions['predictions'] = qda.predict(test_features)

        resultFile = outputFolder + '/result' + str(i + 1) + '.csv'
        test_predictions.to_csv(resultFile, index=False)
        resultSet.append(resultFile)

    resultDict = dict()
    resultDict['results'] = resultSet
    resultDict['label'] = label
    if not parameters:
        parameters['parameter'] = 'default'
    resultDict['algo_params'] = parameters
    resultDict['split_params'] = inputData['split_params']

    # Optional sections are passed through unchanged.
    if 'feature_selection_parameters' in inputData:
        resultDict['feature_selection_parameters'] = inputData['feature_selection_parameters']
        resultDict['feature_selection_algorithm'] = inputData['feature_selection_algorithm']
    if 'feature_extraction_parameters' in inputData:
        resultDict['feature_extraction_parameters'] = inputData['feature_extraction_parameters']
        resultDict['feature_extraction_algorithm'] = inputData['feature_extraction_algorithm']
    if 'preprocessing_params' in inputData:
        resultDict['preprocessing_params'] = inputData['preprocessing_params']

    resultDict['inputFile'] = inputFile
    resultDict['algorithm'] = "QuadraticDiscriminantAnalysis"

    with open(outputFolder + '/results.yaml', 'w') as summary:
        yaml.dump(resultDict, summary)


def main(args):
    """Parse the -i / -o / -p options and run the analysis.

    :param args: command-line arguments without the program name.

    Exits with status 2 after printing the usage line when option parsing
    fails.
    """
    inputFile = ''
    outputFolder = ''
    parameters = dict()
    # float; regularizes the covariance estimate as
    # [(1-reg_param)*Sigma + reg_param*np.eye(n_features)]
    regParam = 0.0

    try:
        opts, args = getopt.getopt(args, "i:o:p:", [])
    except getopt.GetoptError:
        print('QuadraticDiscriminantAnalysis.py -i <inputFile> -o <outputFolder> -p <regParam>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-i':
            inputFile = arg
        elif opt == '-o':
            outputFolder = arg
        elif opt == '-p':
            regParam = float(arg)
            # The raw option string (not the float) is what gets recorded.
            parameters['parameter.p'] = arg

    quadraticDiscriminantAnalysis(inputFile, outputFolder, regParam, parameters)


if __name__ == "__main__":
    main(sys.argv[1:])
########################### Instantiate Classifiers ############################ classifiers = { "Logistic":LogisticRegression(), "NearestNeighbors":KNeighborsClassifier(100), "LinearSVM":SVC(kernel="linear", C=0.025), "RBFSVM":SVC(gamma=2, C=1), "DecisionTree":DecisionTreeClassifier(max_depth=32), "RandomForest":RandomForestClassifier(max_depth=None, n_estimators=200, max_features="auto",random_state=0,n_jobs=4), "RandomForest2":RandomForestClassifier(max_depth=8, n_estimators=200, max_features="auto",random_state=0,n_jobs=4), "AdaBoost":AdaBoostClassifier(n_estimators=500,random_state=0), "GradientBoost":GradientBoostingClassifier(n_estimators=500, learning_rate=1.0,max_depth=None, random_state=0), "NaiveBayes":GaussianNB(), "LDA":LDA(), "QDA":QDA() } joblist=[ (classifiers["RandomForest"],'RandomForest_signal','model_var_list_signal.csv'), # suffix and varlist #(classifiers["RandomForest"],'RandomForest_tmxpayer','model_var_list_tmxpayer.csv'), #(classifiers["RandomForest"],'RandomForest_tmxpayee','model_var_list_tmxpayee.csv'), #(classifiers["RandomForest"],'RandomForest_signal_tmxpayer','model_var_list_signal_tmxpayer.csv'), #(classifiers["RandomForest"],'RandomForest_signal_tmxpayee','model_var_list_signal_tmxpayee.csv'), #(classifiers["RandomForest"],'RandomForest_tmxpayer_tmxpayee','model_var_list_tmxpayer_tmxpayee.csv'), #(classifiers["RandomForest"],'RandomForest_tmxpayerpayee_comp','model_var_list_tmxpayerpayee_comp.csv'), #(classifiers["RandomForest"],'RandomForest_signal_tmxboth','model_var_list_signal_tmxboth.csv'), #(classifiers["RandomForest"],'RandomForest_signal_tmxboth_120','model_var_list_signal_tmxboth_120.csv'), #(classifiers["RandomForest"],'RandomForest_signal_tmxboth_800','model_var_list_signal_tmxboth_800.csv'), #(classifiers["RandomForest2"],'RandomForest_signal_tmxboth_RF2','model_var_list_signal_tmxboth.csv'), #(classifiers["RandomForest"],'RandomForest_signal_107','model_var_list_signal_107.csv'),
def _fit_and_report_qda(X_train, Y_train, X_test, Y_test, labels):
    """Fit QDA on one activity subset and report how it does on the test subset.

    Calls ``common.checkAccuracy`` and ``common.createConfusionMatrix`` for
    the given labels and prints the f-score returned by the former.
    """
    clf = QDA()
    clf.fit(X_train, Y_train.flatten())

    # Predict once and reuse the result for both reports.
    predicted = clf.predict(X_test)
    precision, recall, fscore = common.checkAccuracy(predicted, Y_test, labels)
    common.createConfusionMatrix(predicted.flatten(), Y_test.flatten(), labels)
    print(fscore)


def QDA_onNonDynamicData():
    """Evaluate QDA on the UCI HAR data, separately per activity group.

    Despite the name, two groups are processed in turn: first the non-dynamic
    activities (labels 4, 5, 6), then the dynamic ones (labels 1, 2, 3).
    For each group a QDA model is fitted on the training subset and its
    f-score on the matching test subset is printed.
    """
    #Parsing Full training dataset
    XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt')

    #Parsing Full testing dataset
    XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt')

    #Getting the dataset associated with Non-Dynamic Activities on training
    X_NonDynamic, Y_NonDynamic = common.getDataSubset(XFull, YFull.flatten(), [4, 5, 6])
    #Getting the dataset associated with Non-Dynamic Activities on testing
    X_NonDynamicTest, Y_NonDynamicTest = common.getDataSubset(XFullTest, YFullTest.flatten(), [4, 5, 6])

    #Fitting data using QDA classifier
    _fit_and_report_qda(X_NonDynamic, Y_NonDynamic,
                        X_NonDynamicTest, Y_NonDynamicTest, [4, 5, 6])

    #Getting the dataset associated with Dynamic Activities on training
    X_Dynamic, Y_Dynamic = common.getDataSubset(XFull, YFull.flatten(), [1, 2, 3])
    #Getting the dataset associated with Dynamic Activities on testing
    X_DynamicTest, Y_DynamicTest = common.getDataSubset(XFullTest, YFullTest.flatten(), [1, 2, 3])
    print(len(X_DynamicTest), len(Y_DynamicTest))

    #Fitting data using QDA classifier
    _fit_and_report_qda(X_Dynamic, Y_Dynamic,
                        X_DynamicTest, Y_DynamicTest, [1, 2, 3])