def test_all_methods(self):
    """Exercise logistic regression, LDA, QDA and 1-NN on the Weekly data.

    Trains on the 1990-2008 rows with Lag2 as the sole predictor and
    evaluates each classifier on the post-2008 rows via tp.output_table.
    """
    x_cols = ["Lag2"]
    formula = "Direction~Lag2"
    # DataFrame.ix was removed in pandas 1.0; .loc is the label/boolean-mask
    # equivalent for these selections.
    train_data = self.df.loc[(self.df["Year"] >= 1990) & (self.df["Year"] <= 2008), :]
    # (d) logistic regression
    model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
    result = model.fit()
    test_data = self.df.loc[self.df["Year"] > 2008, :]
    probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
    # NOTE(review): this assumes the fitted probability is
    # P(Direction == "Down"); confirm against statsmodels' Binomial
    # response encoding -- the mapping looks inverted otherwise.
    pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
    tp.output_table(pred_values.values, test_data[self.y_col].values)
    train_X = train_data[x_cols].values
    train_y = train_data[self.y_col].values
    test_X = test_data[x_cols].values
    test_y = test_data[self.y_col].values
    # (e) LDA
    lda_res = LDA().fit(train_X, train_y)
    pred_y = lda_res.predict(test_X)
    tp.output_table(pred_y, test_y)
    # (f) QDA
    qda_res = QDA().fit(train_X, train_y)
    pred_y = qda_res.predict(test_X)
    tp.output_table(pred_y, test_y)
    # (g) KNN with K=1
    clf = neighbors.KNeighborsClassifier(1, weights="uniform")
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    tp.output_table(pred_y, test_y)
    # (h) logistic and LDA
    # (i) Is the purpose of the last question going through all methods
    # with no direction?
class SNPForecastingStrategy(Strategy):
    """QDA-based direction forecaster for the S&P500.

    symbol -- ticker used to build the lagged return series.
    bars   -- DataFrame of bars for that symbol (date-indexed).
    """

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Define the training start, test start and end dates."""
        self.start_train = datetime.datetime(2001, 1, 10)
        self.start_test = datetime.datetime(2005, 1, 1)
        self.end_period = datetime.datetime(2005, 12, 31)

    def fit_model(self):
        """Fit a QDA model of market direction on the two lagged returns."""
        snpret = create_lagged_series(self.symbol, self.start_train,
                                      self.end_period, lags=5)
        X = snpret[['Lag1', 'Lag2']]
        Y = snpret['Direction']
        X_train = X[X.index < self.start_test]
        Y_train = Y[Y.index < self.start_test]
        # Out-of-sample rows, consumed later by generate_signals().
        self.predictors = X[X.index >= self.start_test]
        self.model = QDA()
        self.model.fit(X_train, Y_train)

    def generate_signals(self):
        """Return a DataFrame with 'signal' and 'positions' columns."""
        signals = pd.DataFrame(index=self.bars.index)
        signals['signal'] = 0.0
        signals['signal'] = self.model.predict(self.predictors)
        # BUG FIX: the original chained assignment
        # signals['signal'][0:5] = 0.0 is unreliable/deprecated in modern
        # pandas; write through a single .iloc indexer instead.
        signals.iloc[0:5, signals.columns.get_loc('signal')] = 0.0
        signals['positions'] = signals['signal'].diff()
        return signals
class RegularizedQDA:
    """QDA wrapper with shrinkage of the per-class covariance estimates.

    Three types of regularization are possible:
    - regularize the covariance of a class toward the average variance
      within that class
    - regularize the covariance of a class toward the pooled covariance
      across all classes
    - add some constant amount of variance to each feature
    """

    def __init__(self, avg_weight=0.1, pooled_weight=0, extra_variance=0):
        # Mixing weights for the three regularizers; the remainder
        # (1 - avg_weight - pooled_weight) is kept from the raw estimate.
        self.avg_weight = avg_weight
        self.pooled_weight = pooled_weight
        self.extra_variance = extra_variance
        self.model = QDA()

    def fit(self, X, Y):
        """Fit the underlying QDA model, then shrink its per-class matrices.

        Returns self, following the sklearn estimator convention so calls
        can be chained (this was previously missing).
        """
        self.model.fit(X, Y)
        I = np.eye(X.shape[1])
        a = self.avg_weight
        p = self.pooled_weight
        ev = self.extra_variance
        original_weight = 1.0 - a - p
        # Pooled covariance over all samples, pre-scaled by its weight.
        scaled_pooled_cov = p * np.cov(X.T)
        assert scaled_pooled_cov.shape == I.shape
        assert all([C.shape == I.shape for C in self.model.rotations])
        self.model.rotations = \
            [original_weight * C +
             a * np.mean(np.diag(C)) * I +
             scaled_pooled_cov + ev * I
             for C in self.model.rotations]
        return self

    def predict(self, X):
        """Delegate prediction to the (regularized) underlying model."""
        return self.model.predict(X)
def QDA_onNonDynamicData(): #Parsing Full training dataset XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt') YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt') #Parsing Full testing dataset XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt') YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt') #Getting the dataset associated with Non-Dynamic Activities on training X_NonDynamic, Y_NonDynamic = common.getDataSubset(XFull, YFull.flatten(), [4, 5, 6]) #Getting the dataset associated with Non-Dynamic Activities on testing X_NonDynamicTest, Y_NonDynamicTest = common.getDataSubset( XFullTest, YFullTest.flatten(), [4, 5, 6]) #Fitting data using QDA classifier clf = QDA() clf.fit(X_NonDynamic, Y_NonDynamic.flatten()) precision, recall, fscore = common.checkAccuracy( clf.predict(X_NonDynamicTest), Y_NonDynamicTest, [4, 5, 6]) common.createConfusionMatrix( clf.predict(X_NonDynamicTest).flatten(), Y_NonDynamicTest.flatten(), [4, 5, 6]) print fscore #Getting the dataset associated with Dynamic Activities on training X_Dynamic, Y_Dynamic = common.getDataSubset(XFull, YFull.flatten(), [1, 2, 3]) #Getting the dataset associated with Dynamic Activities on testing X_DynamicTest, Y_DynamicTest = common.getDataSubset( XFullTest, YFullTest.flatten(), [1, 2, 3]) print len(X_DynamicTest), len(Y_DynamicTest) #Fitting data using QDA classifier clf = QDA() clf.fit(X_Dynamic, Y_Dynamic.flatten()) precision, recall, fscore = common.checkAccuracy( clf.predict(X_DynamicTest), Y_DynamicTest, [1, 2, 3]) common.createConfusionMatrix( clf.predict(X_DynamicTest).flatten(), Y_DynamicTest.flatten(), [1, 2, 3]) print fscore
class SNPForecastingStrategy(Strategy):
    """ Requires:
    symbol - A stock symbol on which to form a strategy on.
    bars - A DataFrame of bars for the above symbol."""

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Create training/test periods."""
        self.start_train = datetime.datetime(2001, 1, 10)
        self.start_test = datetime.datetime(2005, 1, 1)
        self.end_period = datetime.datetime(2005, 12, 31)

    def fit_model(self):
        """Fits a Quadratic Discriminant Analyser to the US stock market
        index (^GPSC in Yahoo)."""
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(self.symbol, self.start_train,
                                      self.end_period, lags=5)
        # Use the prior two days of returns as
        # predictor values, with direction as the response
        X = snpret[["Lag1", "Lag2"]]
        y = snpret["Direction"]
        # Create training and test sets
        X_train = X[X.index < self.start_test]
        y_train = y[y.index < self.start_test]
        # Create the predicting factors for use
        # in direction forecasting
        self.predictors = X[X.index >= self.start_test]
        # Create the Quadratic Discriminant Analysis model
        # and the forecasting strategy
        self.model = QDA()
        self.model.fit(X_train, y_train)

    def generate_signals(self):
        """Returns the DataFrame of symbols containing the signals
        to go long, short or hold (1, -1 or 0)."""
        signals = pd.DataFrame(index=self.bars.index)
        signals['signal'] = 0.0
        # Predict the subsequent period with the QDA model
        signals['signal'] = self.model.predict(self.predictors)
        # Remove the first five signal entries to eliminate NaN issues.
        # BUG FIX: the original chained assignment
        # signals['signal'][0:5] = 0.0 is unreliable/deprecated in modern
        # pandas; write through a single .iloc indexer instead.
        signals.iloc[0:5, signals.columns.get_loc('signal')] = 0.0
        signals['positions'] = signals['signal'].diff()
        return signals
def qda(data, labels, n, v_type):
    """Train and evaluate a QDA classifier on one train/test split.

    Returns (accuracy, classification report, predictions, test labels,
    test data, fitted model, confusion matrix, "QDA").
    """
    train_data, train_labels, test_data, test_labels = split_data(data, labels, v_type)
    clf = QDA()
    clf.fit(train_data, train_labels)
    y_pred = clf.predict(test_data)
    # Fraction of test samples predicted correctly.
    pure_accuracy_rate = len([y_pred[x] for x in range(len(y_pred))
                              if y_pred[x] == test_labels[x]]) / float(len(test_labels))
    # BUG FIX: classification_report expects (y_true, y_pred); the arguments
    # were swapped, which silently exchanged precision and recall in the
    # report (the confusion_matrix call below already had the right order).
    report = classification_report(test_labels, y_pred, target_names=rock_names)
    cm = confusion_matrix(test_labels, y_pred)
    return pure_accuracy_rate, report, y_pred, test_labels, test_data, clf, cm, "QDA"
def QDA_onNonDynamicData(): #Parsing Full training dataset XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt') YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt') #Parsing Full testing dataset XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt') YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt') #Getting the dataset associated with Non-Dynamic Activities on training X_NonDynamic,Y_NonDynamic = common.getDataSubset(XFull,YFull.flatten(),[4,5,6]) #Getting the dataset associated with Non-Dynamic Activities on testing X_NonDynamicTest,Y_NonDynamicTest = common.getDataSubset(XFullTest,YFullTest.flatten(),[4,5,6]) #Fitting data using QDA classifier clf = QDA() clf.fit(X_NonDynamic, Y_NonDynamic.flatten()) precision,recall,fscore = common.checkAccuracy(clf.predict(X_NonDynamicTest),Y_NonDynamicTest,[4,5,6]) common.createConfusionMatrix(clf.predict(X_NonDynamicTest).flatten(),Y_NonDynamicTest.flatten(),[4,5,6]) print fscore #Getting the dataset associated with Dynamic Activities on training X_Dynamic,Y_Dynamic = common.getDataSubset(XFull,YFull.flatten(),[1,2,3]) #Getting the dataset associated with Dynamic Activities on testing X_DynamicTest,Y_DynamicTest = common.getDataSubset(XFullTest,YFullTest.flatten(),[1,2,3]) print len(X_DynamicTest),len(Y_DynamicTest) #Fitting data using QDA classifier clf = QDA() clf.fit(X_Dynamic, Y_Dynamic.flatten()) precision,recall,fscore = common.checkAccuracy(clf.predict(X_DynamicTest),Y_DynamicTest,[1,2,3]) common.createConfusionMatrix(clf.predict(X_DynamicTest).flatten(),Y_DynamicTest.flatten(),[1,2,3]) print fscore
def QDA_onFullDataset():
    """Fit QDA on the full UCI HAR training set and print the test f-score
    over all six activity labels."""
    featTrain = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    labelTrain = common.parseFile('../UCI HAR Dataset/train/y_train.txt')
    featTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    labelTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt')

    model = QDA()
    model.fit(featTrain, labelTrain.flatten())

    # Score the held-out test set on all six activity classes.
    precision, recall, fscore = common.checkAccuracy(
        model.predict(featTest), labelTest, [1, 2, 3, 4, 5, 6])
    print(fscore)
def QDA_onFullDataset():
    """Fit QDA on the full UCI HAR training set and print the test f-score
    over all six activity labels."""
    #Parsing Full training dataset
    XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt')
    #Parsing Full testing dataset
    XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt')
    #Fitting data using QDA classifier
    clf = QDA()
    clf.fit(XFull, YFull.flatten())
    #Testing the results
    precision, recall, fscore = common.checkAccuracy(
        clf.predict(XFullTest), YFullTest, [1, 2, 3, 4, 5, 6])
    # CONSISTENCY FIX: use the function form of print, matching the sibling
    # QDA_onFullDataset variant; a single parenthesized argument behaves
    # identically under Python 2 and 3.
    print(fscore)
def runQDA(fileNamaParam, trainizingSizeParam):
    """Train a QDA model on a fraction of the data and evaluate it on the
    held-out remainder."""
    # Everything not used for training is held out for testing.
    testSplitSize = 1.0 - trainizingSizeParam
    bundle = IO_.giveTestAndTrainingData(fileNamaParam)
    trainData = bundle[0]
    testData = bundle[1]
    ## split into training and evaluation sets
    xTrain, xEval, yTrain, yEval = cross_validation.train_test_split(
        trainData, testData, test_size=testSplitSize, random_state=0)
    ## fit the model and score the held-out rows
    model = QDA()
    model.fit(xTrain, yTrain)
    predictions = model.predict(xEval)
    evalClassifier(yEval, predictions)
def runQDA(fileNamaParam, trainizingSizeParam):
    """Run QDA classification: split the data, fit, predict, evaluate."""
    # Fraction of rows reserved for evaluation.
    holdoutFraction = 1.0 - trainizingSizeParam
    allData = IO_.giveTestAndTrainingData(fileNamaParam)
    features = allData[0]
    targets = allData[1]
    # Partition into train/eval subsets with a fixed seed.
    featTrain, featEval, targetTrain, targetEval = \
        cross_validation.train_test_split(
            features, targets, test_size=holdoutFraction, random_state=0)
    qdaModel = QDA()
    qdaModel.fit(featTrain, targetTrain)
    predicted = qdaModel.predict(featEval)
    evalClassifier(targetEval, predicted)
def qda(input_file,Output): lvltrace.lvltrace("LVLEntree dans qda") try: ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape qda=QDA() qda.fit(X,y) y_pred = qda.predict(X) print "#########################################################################################################\n" print "Quadratic Discriminant Analysis Accuracy " print "classification accuracy:", metrics.accuracy_score(y, y_pred) print "precision:", metrics.precision_score(y, y_pred) print "recall:", metrics.recall_score(y, y_pred) print "f1 score:", metrics.f1_score(y, y_pred) print "\n" print "#########################################################################################################\n" results = Output+"QDA_metrics.txt" file = open(results, "w") file.write("Quadratic Discriminant Analaysis estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y)): file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1))) file.close() title = "QDA" save = Output + "QDA_confusion_matrix.png" plot_confusion_matrix(y, y_pred,title,save) except (AttributeError): if configuration.normalization == 'normalize': results = Output+"Multinomial_NB_metrics.txt" file = open(results, "w") file.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n") file.close() lvltrace.lvltrace("LVLSortie dans qda")
def qda(input_file,Output,test_size): lvltrace.lvltrace("LVLEntree dans qda split_test") try: ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size) print X_train.shape, X_test.shape lda=QDA() lda.fit(X_train,y_train) y_pred = lda.predict(X_test) print "Quadratic Discriminant Analysis Accuracy " print "classification accuracy:", metrics.accuracy_score(y_test, y_pred) print "precision:", metrics.precision_score(y_test, y_pred) print "recall:", metrics.recall_score(y_test, y_pred) print "f1 score:", metrics.f1_score(y_test, y_pred) #LVLprint "\n" results = Output+"QDA_metrics_test.txt" file = open(results, "w") file.write("Quadratic Discriminant Analaysis estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y_test)): file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1))) file.close() title = "QDA %f"%test_size save = Output + "QDA_confusion_matrix"+"_%s.png"%test_size plot_confusion_matrix(y_test, y_pred,title,save) except (AttributeError): if configuration.normalization == 'normalize': results = Output+"Multinomial_NB_metrics_test.txt" file = open(results, "w") file.write("In configuration.py file, normalization='normalize' -- Input Values must be superior to 0\n") file.close() lvltrace.lvltrace("LVLSortie dans qda split_test")
class QDAClassifier(Classifier):
    """Classifier backed by sklearn's quadratic discriminant analysis."""

    def __init__(self):
        super(QDAClassifier, self).__init__()
        self.fig = 20
        self.is_trainable = True
        self.is_trained = False

    def train(self, classification_data, indices=None, settings_name=None, **kwargs):
        """Fit the underlying QDA model on the selected feature columns."""
        super(QDAClassifier, self).train(classification_data, indices,
                                         settings_name, **kwargs)
        # Column selection comes from the settings populated by the base class.
        cols = self.settings['indices']
        self.qda = QDA(**self.classifier_kwargs)
        self.qda.fit(classification_data.data[:, cols],
                     classification_data.are_hurr_actual)
        return self

    def classify(self, classification_data):
        """Predict hurricane labels for the given data with the fitted model."""
        super(QDAClassifier, self).classify(classification_data)
        cols = self.settings['indices']
        self.are_hurr_pred = self.qda.predict(classification_data.data[:, cols])
        return self.are_hurr_pred
# Quadratic Discriminant Analysis demo on the iris dataset.
from sklearn import datasets
from sklearn import metrics
# FIX: the sklearn.qda module was removed in scikit-learn 0.19; the class
# now lives in sklearn.discriminant_analysis.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
# load the iris datasets
dataset = datasets.load_iris()
# fit a QDA model to the data
model = QDA()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions (on the training data itself -- resubstitution)
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# Preprocessing for ind in range(X.shape[0]): X[ind, :] = where(X[ind, :] > mean(X[ind, :]), 0., 1.) # PCA pca = PCA(50) pca.fit(X) X50 = pca.transform(X) # QDA qda = QDA() qda.fit(X50, y) # Accuracy score p = qda.predict(X50) print 'Accuracy score = ', accuracy_score(p, y) # Read in test data test = pd.read_csv('test.csv') test = test.values # Preprocessing for ind in range(test.shape[0]): test[ind, :] = where(test[ind, :] > mean(test[ind, :]), 0., 1.) # Transform data test = pca.transform(test) # Predict p = qda.predict(test)
E_neigh = accuracy_score(y_test, yhat_neigh, normalize=True)
print E_neigh
# We obtain E_neigh ~ 0.95

# II/3 prediction using Linear Discriminant Analysis (LDA)
# NOTE(review): 'LDA = LDA()' rebinds the class name to a fitted instance,
# so the LDA class can no longer be constructed afterwards (same pattern
# for QDA below) -- consider renaming these variables.
LDA = LDA()
LDA.fit(X_train, y_train)
yhat_LDA = LDA.predict(X_test)
E_LDA = accuracy_score(y_test, yhat_LDA, normalize=True)
print E_LDA
# KNN better than LDA (E_LDA ~ 0.86)

# II/3 prediction using Quadratic Discriminant Analysis (QDA)
QDA = QDA()
QDA.fit(X_train, y_train)
yhat_QDA = QDA.predict(X_test)
E_QDA = accuracy_score(y_test, yhat_QDA, normalize=True)
print E_QDA
# see the variable-collinearity problem

# II/4 prediction using Random Forests (RF)
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
yhat_RF = RF.predict(X_test)
E_RF = accuracy_score(y_test, yhat_RF, normalize=True)
print E_RF
# We obtain E_RF ~ 0.91
# Let us analyse the RF parameters more finely to improve this result

''' N.B. : pour analyser vraiment finement les résultats, avoir des résultats robustes, il faudrait faire des tirages aléatoires et répéter les étapes au moins 200 fois et en faire une moyenne'''
# classifier regularization parameter p_best = 0 count_best = 0 # begin training model print 'Training model in progress...' for j in range(200): p = 0.02*j clf = QDA(reg_param=p) clf.fit(X_train, y_train) count = 0 # fit in the test set for i in range(len(y_cross)): a = clf.predict(X_cross[i]) b = y_cross[i] if (a == b): count += 1 # update the regularization parameter if count > count_best: count_best = count p_best = p print "Progress at %.1f%%" %(j/2) print 'Training model completed' # test the model clf = QDA(reg_param=p_best)
# NOTE(review): this fragment continues an LDA section begun above; 'lda'
# and the weekly train/test arrays are defined earlier in the file.
lda.fit(train_weekly_x, train_weekly_y)
lda_preds = lda.predict(test_weekly_x)
lda_score = lda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, lda_preds)
print "\nLDA Results"
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(lda_score)

#%% QDA using sklearn
from sklearn.qda import QDA
qda = QDA()
qda.fit(train_weekly_x, train_weekly_y)
qda_preds = qda.predict(test_weekly_x)
qda_score = qda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, qda_preds)
print "\nQDA Results"
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(qda_score)

#%% KNN using sklearn (K=1)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train_weekly_x, train_weekly_y)
knn_preds = knn.predict(test_weekly_x)
knn_score = knn.score(test_weekly_x, test_weekly_y)
# Accumulate the features flagged as redundant at this correlation threshold.
remove = remove.union(redundant)
print("For correlation coefficient = ", coefficient)
#print(remove)
#print(add)
# NOTE(review): 'df.columns - remove' relies on the old Index set-difference
# operator, removed in modern pandas (df.columns.difference(remove) is the
# replacement) -- confirm the pinned pandas version before changing.
train_data = pd.DataFrame(data=train_data_g, columns = df.columns)[df.columns- remove].values
test_data = pd.DataFrame(data=test_data_g, columns = df.columns)[df.columns- remove].values
print("num of featurs = ", train_data.shape[1])
clf = QDA();
# '%time' is an IPython magic -- this cell only runs inside an IPython shell.
# This gets the time in ipython shell.
print("Modelling time:")
%time clf.fit(train_data, train_labels)
print("Modelling time ends")
print("prediction time starts:")
%time predicted_labels = clf.predict(test_data)
print("prediction time ends")
#print(classification_report(test_labels, clf.predict(test_data)))
print(classification_report(test_labels, predicted_labels))
print("num of featurs = ", train_data.shape[1])
y_true = test_labels;
# Probability of the positive class feeds the ROC curve below.
y_pred_proba = clf.predict_proba(test_data);
fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba[:, 1])
roc_auc = auc(fpr, tpr)
print("ROC AUC =", roc_auc)
print("\n\n\n")
error(y_test, y_predict_g, 'Gaussian Naive Bayes')
error6(y_test, y_predict_g, 'Gaussian Naive Bayes')

###############################################################################
# 3 LDA
# NOTE(review): assigning the fitted model to the name LDA shadows the class,
# which can no longer be constructed afterwards (likewise QDA below).
LDA = LDA()
LDA.fit(X_train, y_train)
y_predict_lda = LDA.predict(X_test)
error(y_test, y_predict_lda, 'LDA')
error6(y_test, y_predict_lda, 'LDA')

###############################################################################
# 4 QDA
QDA = QDA()
QDA.fit(X_train, y_train)
y_predict_qda = QDA.predict(X_test)
error(y_test, y_predict_qda, 'QDA')
error6(y_test, y_predict_qda, 'QDA')

###############################################################################
# 5 Logistic Regression
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_predict_lr = LR.predict(X_test)
error(y_test, y_predict_lr, 'Logistic Regression')
error6(y_test, y_predict_lr, 'Logistic Regression')

###############################################################################
# 6 K-Neighbors Classifier (K=8)
KNC = KNeighborsClassifier(8)
KNC.fit(X_train, y_train)
# Remaining audio-feature column names (the col_input list opens before
# this fragment).
    'PitchesVar[6]','PitchesVar[7]','PitchesVar[8]','PitchesVar[9]','PitchesVar[10]','PitchesVar[11]',
    'TimbreMean[0]','TimbreMean[1]','TimbreMean[2]','TimbreMean[3]','TimbreMean[4]','TimbreMean[5]',
    'TimbreMean[6]','TimbreMean[7]','TimbreMean[8]','TimbreMean[9]','TimbreMean[10]','TimbreMean[11]',
    'TimbreVar[0]','TimbreVar[1]','TimbreVar[2]','TimbreVar[3]','TimbreVar[4]','TimbreVar[5]',
    'TimbreVar[6]','TimbreVar[7]','TimbreVar[8]','TimbreVar[9]','TimbreVar[10]','TimbreVar[11]']

df_input = pandas.read_csv('pandas_merged_output_cleaned_None.csv', header=None, delimiter="|", names=col_input)
df_input = df_input.dropna()
#df_input = df_input[df_input['Year'] != 0][df_input['genre'] != 'CLASSICAL']
#df_input = df_input[df_input['Year'] != 0][df_input['Year'] < 1992][df_input['genre'] != 'CLASSICAL']
# Keep songs from 1992 onwards, excluding classical.
# NOTE(review): chained boolean indexing df[a][b][c] can trigger
# SettingWithCopyWarning; a single combined mask would be safer -- confirm
# before changing.
df_input = df_input[df_input['Year'] != 0][df_input['Year'] >= 1992][df_input['genre'] != 'CLASSICAL']
# Column 0 is the target; columns 1-69 are the features.
# NOTE(review): DataFrame.as_matrix was removed in pandas 1.0 (.values /
# .to_numpy() is the modern equivalent) -- depends on the pinned version.
df_input_target = df_input[list(range(0, 1))].as_matrix()
df_input_data = df_input[list(range(1, 70))].as_matrix()

# splitting the data into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_input_data, df_input_target.tolist())

# Start QDA Classification
from sklearn.qda import QDA
clf = QDA(priors=None, reg_param=0.001).fit(X_train, np.ravel(y_train[:]))
predicted = clf.predict(X_test)
matches = (predicted == [item for sublist in y_test for item in sublist])
print "Accuracy : ", (matches.sum() / float(len(matches)))
# NOTE(review): this fragment begins inside a data-building loop whose
# header precedes it; the indentation of the first lines is reconstructed.
    else:
        y_home.append(1)
    all_data.append(data_instance)

# --- QDA Classification ---
print 'Fitting data'
home_qda = QDA()
home_qda.fit(all_data, y_home)
correct = 0
print 'Predicting results: '
# Walk the held-out samples in pairs (step 2).
for i in range(training_size, size, 2):
    inst = x[i].reshape(1, -1)
    ph = home_qda.predict(inst)
    # NOTE(review): the meaning of labels 0 / 2 / other is inferred from the
    # comparisons below -- confirm against how y_home is built above.
    if ph == 0:
        if y[i] > y[i + 1]:
            correct += 1
    elif ph == 2:
        if y[i] == y[i + 1]:
            correct += 1
    else:
        if y[i] < y[i + 1]:
            correct += 1
# NOTE(review): despite the name, this is the success (accuracy) rate, as
# the print label below says.
error_rate = (correct * 1.0) / ((size - training_size) / 2)
print 'Success rate for QDA: ', error_rate
#np.concatenate, np.hstack and np.insert didn't work, because axis has to larger than 1. #if use np.concatenate, np.hstack and np.insert, np.expand_dims should be used firstly. X = np.column_stack((X, training_zeros)) lda = LDA(n_components=1) lda = lda.fit(X, y) predict_LDA= lda.predict(testSet) res = pd.crosstab(predict_LDA, actual_testSet) print(res) correct_rate= np.mean(actual_testSet == predict_LDA) print('overall fraction of correct predictions: {correct_rate}'.format(correct_rate=correct_rate)) #Question F print('(f) ') qda = QDA() qda = qda.fit(X, y) predict_QDA = qda.predict(testSet) res = pd.crosstab(predict_QDA, actual_testSet) print(res) correct_rate= np.mean(actual_testSet == predict_QDA) print('overall fraction of correct predictions: {correct_rate}'.format(correct_rate=correct_rate)) #Question G print('(g) ') knn = KNeighborsClassifier(n_neighbors=1) knn = knn.fit(X, y) predict_KNN = knn.predict(testSet) res = pd.crosstab(predict_KNN, actual_testSet) print(res) correct_rate= np.mean(actual_testSet == predict_KNN) print('overall fraction of correct predictions: {correct_rate}'.format(correct_rate=correct_rate))
# QDA demo on a toy two-class data set.
# Similar to LDA, but it does not assume the classes share one covariance.
# FIX: the sklearn.qda module was removed in scikit-learn 0.19; the class
# now lives in sklearn.discriminant_analysis.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

# Visualization
import matplotlib.pyplot as plt
plt.figure(1)
plt.scatter(X[y == 1, 0], X[y == 1, 1], color='g')
plt.scatter(X[y == 2, 0], X[y == 2, 1], color='b')
plt.title('X Data Set Visualization')

# Classification
clf = QDA()
clf = clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))
plt.show()
# NOTE(review): this fragment begins inside InitData's first parsing loop;
# the leading 'test.append' closes that loop.
        test.append(temp)
    for line in trainData:
        temp = []
        line = line.split(',')
        if line[0] == 'x.1':
            continue
        for i in range(0, len(line)):
            # NOTE(review): 'i < 0' is never true, so trainTarget is never
            # filled here -- looks like a bug (perhaps 'i == 0' was meant);
            # confirm against how testTarget is built above this fragment.
            if i < 0:
                trainTarget.append(float(line[i].strip()))
            else:
                temp.append(float(line[i].strip()))
        train.append(temp)
    return train, test, trainTarget, testTarget


if __name__ == '__main__':
    fn1 = './test.data'
    fn2 = './train.data'
    fn3 = './testTarget.data'
    fn4 = './trainTarget.data'
    train, test, trainTarget, testTarget = InitData(fn1, fn2, fn3, fn4)
    clf = QDA()
    clf.fit(train, trainTarget)
    y_pred = clf.predict(test)
    # Misclassification rate on the test set.
    print "error"
    print 1 - np.mean(y_pred == testTarget)
def classifierTrainTest(score, diagn, real_art, cvPartition, classifier, subjIndex, preAccMatrix, preInstOrder):
    """Run a cross-validated train/test loop with the chosen classifier.

    For each (train, test) split in cvPartition, builds the split matrices,
    fits the classifier named by `classifier` ('lda'/'qda'/'tree'/'svm'),
    predicts the test fold, and appends the (true, predicted, real_art)
    rows into preAccMatrix / preInstOrder. Returns a dict of the final
    fold's arrays plus the accumulated matrices.
    """
    x = 0
    iteration = 0
    idx = 0
    # Number of principal-component features per sample.
    PCNo = len(score[0])
    subAccMatrix = 0
    # FIX: what is test->matlab function within cvpartition class
    #idx = numpy.random.rand(cvPartition, iteration)
    #idx_test = numpy.where(idx == 1)
    #idx_train = numpy.where(idx != 1)
    print("cvPartition:")
    print(cvPartition)
    #QUESTION: cv partition not scalar ,how works
    #iteration must be atleast 2
    for idx_train, idx_test in cvPartition:
        #change idx to boolean array (True marks a test-fold row)
        idx = numpy.zeros((len(score), 1), dtype=bool)
        for index in idx_test:
            idx[index] = True
        #for testing purposes
        #idx = numpy.zeros((len(score), 1), dtype=bool)
        #idx[47] = True
        #idx is all training in MATLAB implementation?
        cvTEST = numpy.zeros((sum(idx), PCNo))
        diagnTEST = numpy.zeros((sum(idx), 1))
        real_artTEST = numpy.zeros((sum(idx), 1))
        instIndexTEST = numpy.zeros((sum(idx), 1))
        cvTRAIN = numpy.zeros((len(idx) - sum(idx), PCNo))
        diagnTRAIN = numpy.zeros((len(idx) - sum(idx), 1))
        real_artTRAIN = numpy.zeros((len(idx) - sum(idx), 1))
        # Scatter every row into either the TEST or TRAIN matrices.
        k = 0
        m = 0
        for j in range(len(idx)):
            if idx[j] == 1:
                cvTEST[k, :] = score[j, :]
                diagnTEST[k] = diagn[j]
                real_artTEST[k] = real_art[j]
                instIndexTEST[k] = subjIndex[j]
                k = k + 1
            else:
                cvTRAIN[m, :] = score[j, :]
                diagnTRAIN[m] = diagn[j]
                real_artTRAIN[m] = real_art[j]
                m = m + 1
        # FIX: use scikit-learn for classifiers and predictions
        if classifier == "lda":
            #ldaModel = LDA()
            priorsArrays = numpy.array((.5, .5))
            ldaModel = LDA(solver='eigen', priors=priorsArrays, shrinkage=1.00)
            #ldaModel = LDA()
            ldaModel.fit(cvTRAIN, diagnTRAIN)
            label = ldaModel.predict(cvTEST)
        elif classifier == 'qda':
            # training a quadratic discriminant classifier to the data
            #qdaModel = QDA()
            priorsArrays = numpy.array((.5, .5))
            # NOTE(review): sklearn's QDA takes no 'solver' or 'shrinkage'
            # keyword arguments (those belong to LDA) -- this call looks
            # copied from the LDA branch and would raise a TypeError;
            # confirm against the installed sklearn version.
            qdaModel = QDA(solver='eigen', priors=priorsArrays, shrinkage=1.00)
            qdaModel.fit(cvTRAIN, diagnTRAIN)
            label = qdaModel.predict(cvTEST)
        elif classifier == 'tree':
            # training a decision tree to the data
            treeModel = tree()
            treeModel.fit(cvTRAIN, diagnTRAIN)
            label = treeModel.predict(cvTEST)
        elif classifier == 'svm':
            # training a support vector machine to the data
            svmModel = SVC()
            svmModel.fit(cvTRAIN, diagnTRAIN)
            label = svmModel.predict(cvTEST)
        trueClassLabel = diagnTEST
        predictedClassLabel = label
        #from former loop
        subAccMatrix = numpy.column_stack(
            (trueClassLabel, predictedClassLabel, real_artTEST))
        # Append this fold's rows into the pre-allocated accumulators.
        preAccMatrix[x:x + len(subAccMatrix[:, 0]), :] = subAccMatrix
        preInstOrder[x:x + len(instIndexTEST[:, 0])] = instIndexTEST
        x = x + len(subAccMatrix[:, 0])
        #for testing purposes
        #break
    # create dictionary for return values
    return {
        'cvTEST': cvTEST,
        'diagnTEST': diagnTEST,
        'real_artTEST': real_artTEST,
        'instIndexTEST': instIndexTEST,
        'cvTRAIN': cvTRAIN,
        'diagnTRAIN': diagnTRAIN,
        'real_artTRAIN': real_artTRAIN,
        'trueClassLabel': trueClassLabel,
        'predictedClassLabel': predictedClassLabel,
        'idx': idx,
        'subAccMatrix': subAccMatrix,
        'preAccMatrix': preAccMatrix,
        'preInstOrder': preInstOrder
    }
# classifier regularization parameter p_best = 0 count_best = 0 # begin training model print 'Training model in progress...' for j in range(200): p = 0.02 * j clf = QDA(reg_param=p) clf.fit(X_train, y_train) count = 0 # fit in the test set for i in range(len(y_cross)): a = clf.predict(X_cross[i]) b = y_cross[i] if (a == b): count += 1 # update the regularization parameter if count > count_best: count_best = count p_best = p print "Progress at %.1f%%" % (j / 2) print 'Training model completed' # test the model clf = QDA(reg_param=p_best)
# NOTE(review): this fragment begins inside a parsing loop/try whose header
# precedes it; the indentation of the first lines is reconstructed.
            Test.append(1)
        if (i.split(',')[4] == 'Iris-virginica'):
            Test.append(2)
    # NOTE(review): bare 'except: pass' silently swallows every error from
    # the parsing loop above -- confirm this best-effort behavior is wanted.
    except:
        pass

# mesh step size for the decision-surface plot
h = 0.02
Y = Test
X = numpy.transpose(Data)
#clf=LDA()
#clf=QDA()
clf = QDA(reg_param=0.3)
x = clf.fit(Data, Test)
# Bounding box of the first two features, padded by 0.5.
x_min, x_max = X[0].min() - .5, X[0].max() + .5
y_min, y_max = X[1].min() - .5, X[1].max() + .5
xx, yy = numpy.meshgrid(numpy.arange(x_min, x_max, h), numpy.arange(y_min, y_max, h))
Z = clf.predict(numpy.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
# Plot also the training points
plt.scatter(X[0], X[1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
import numpy as np
import pandas as pd
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# model_selection is the replacement module.
from sklearn.model_selection import train_test_split

data_nonLinear = np.genfromtxt('exp2d2.txt', delimiter=',')
temp = pd.DataFrame(data_nonLinear)
# Data Visualization
temp.head()
# First two columns are the features, third is the class label.
x = data_nonLinear[:, 0:2]
y = data_nonLinear[:, np.newaxis, 2]
y = y.ravel()
x_train, x_test, y_train, y_test = train_test_split(x, y)
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# FIX: sklearn.qda was removed in scikit-learn 0.19; import the class from
# sklearn.discriminant_analysis instead.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
qdd = QDA()
#ldd=LinearDiscriminantAnalysis()
qdd.fit(x_train, y_train)
y_pred = qdd.predict(x_test)
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))

# Plotting: color each test point by its true class.
import matplotlib.pyplot as plt
x1 = x_test[:, 0]
x2 = x_test[:, 1]
for i in range(0, len(y_test)):
    c = ['red' if y_test[i] == 1 else 'blue']
    plt.scatter(x1[i], x2[i], color=c)
# FIX: plt.hold was deprecated and removed in matplotlib 3.0; repeated
# scatter calls already draw on the same axes, so the hold() calls are
# simply dropped.
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
# Sweep the number of PCA components from 1 to 10 and record the train/test
# zero-one losses of LDA, QDA and 7-NN at each dimensionality.
for i in range(1, 11):
    sklearn_pca = PCA(n_components=i)
    Xred_pca = sklearn_pca.fit_transform(X_std)
    # Apply the training-set projection to the test data.
    Xred_pca_test = sklearn_pca.transform(X_std_test)

    lda_model = LDA()
    lda_model.fit(Xred_pca, y)
    yhat_train = lda_model.predict(Xred_pca)
    lda_train.append(zero_one_loss(y, yhat_train))
    yhat_test = lda_model.predict(Xred_pca_test)
    lda_test.append(zero_one_loss(ytest, yhat_test))

    qda_model = QDA()
    qda_model.fit(Xred_pca, y)
    yhat_train = qda_model.predict(Xred_pca)
    qda_train.append(zero_one_loss(y, yhat_train))
    yhat_test = qda_model.predict(Xred_pca_test)
    qda_test.append(zero_one_loss(ytest, yhat_test))

    knn_model = KNeighborsClassifier(n_neighbors=7)
    knn_model.fit(Xred_pca, y)
    yhat_train = knn_model.predict(Xred_pca)
    knn_train.append(zero_one_loss(y, yhat_train))
    yhat_test = knn_model.predict(Xred_pca_test)
    knn_test.append(zero_one_loss(ytest, yhat_test))

# Plot the LDA losses versus the number of components.
plt.figure(figsize=(12, 8))
plt.plot(lda_train, label="Training set")
plt.plot(lda_test, label="Test set")
class SNPForecastingStrategy(Strategy):
    """Forecast S&P500 direction with a QDA model.

    Requires:
    symbol - A stock symbol to form a strategy.
    bars - A df of bars for the above symbol."""

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Create training/test periods."""
        self.start_train = datetime.datetime(2001, 1, 10)
        self.start_test = datetime.datetime(2005, 1, 1)
        # BUG FIX: was 'datetimme.datetime' -- a NameError at runtime.
        self.end_period = datetime.datetime(2005, 12, 31)

    def fit_model(self):
        """Fit a Quadratic Discriminant Analyser to the US stock
        market index (^GSPC in Yahoo)."""
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(self.symbol, self.start_train,
                                      self.end_period, lags=5)
        # Use the prior two days of returns as predictor values,
        # with direction as the response
        X = snpret[["Lag1", "Lag2"]]
        y = snpret["Direction"]
        # Create training and test sets
        X_train = X[X.index < self.start_test]
        y_train = y[y.index < self.start_test]
        # Predicting factors used for direction forecasting
        self.predictors = X[X.index >= self.start_test]
        # Create and fit the Quadratic Discriminant Analysis model
        self.model = QDA()
        self.model.fit(X_train, y_train)

    def generate_signals(self):
        """Return the df of symbols containing the signals to go long,
        short, or hold (1, -1, 0)."""
        signals = pd.DataFrame(index=self.bars.index)
        signals['signal'] = 0.0
        # Predict the subsequent period with the QDA model
        signals['signal'] = self.model.predict(self.predictors)
        # Remove the first five signal entries to eliminate NaN issues
        # with the signal df.
        # BUG FIX: was signals[''signal'] -- a syntax error.
        signals['signal'][0:5] = 0.0
        signals['positions'] = signals['signal'].diff()
        return signals


class MarketingIntradayPortfolio(Portfolio):
    """Buys or sells 500 shares of an asset at the opening price of
    every bar, depending upon the direction of the forecast, closing
    out the trade at the close of the bar.

    Requires:
    symbol - A stock symbol which forms the basis of the portfolio.
    bars - A df of bars for a symbol set.
    signals - A df of signals (1, 0, -1) for each symbol.
    initial_capital - The amount in cash at the start of the portfolio."""

    def __init__(self, symbol, bars, signals, initial_capital=100000.0):
        self.symbol = symbol
        self.bars = bars
        self.signals = signals
        self.initial_capital = float(initial_capital)
        self.positions = self.generate_positions()

    def generate_positions(self):
        """Generate the positions df, based on the signals provided
        by the 'signals' df."""
        positions = pd.DataFrame(index=self.signals.index).fillna(0.0)
        # Long or short 500 shares of SPY based on the directional
        # signal every day.
        positions[self.symbol] = 500 * self.signals['signal']
        return positions

    def backtest_portfolio(self):
        """Backtest the portfolio and return a df containing the
        equity curve and the percentage returns."""
        # BUG FIXES: the docstring above was unterminated ("" instead
        # of triple quotes) and 'pd.DateFrame' was a typo; the unused
        # local 'pos_diff' has been dropped.
        portfolio = pd.DataFrame(index=self.positions.index)
        # Intraday profit: the open-to-close move times the position
        # held -- long on predicted up days, short on down days. The
        # first five bars are zeroed to match the zeroed signals.
        portfolio['price_diff'] = self.bars['Close'] - self.bars['Open']
        portfolio['price_diff'][0:5] = 0.0
        portfolio['profit'] = self.positions[self.symbol] * portfolio['price_diff']
        # Generate the equity curve and percentage returns
        portfolio['total'] = self.initial_capital + portfolio['profit'].cumsum()
        portfolio['returns'] = portfolio['total'].pct_change()
        return portfolio
# Metrics for the preceding LDA run. pre_rec / overall_accuracy /
# total_count are project helpers defined outside this view.
# NOTE(review): recall is derived from true-label counts and precision
# from predicted-label counts -- confirm against pre_rec's definition.
recall_lda_50 = pre_rec(cmatrix_lda_50, y_test_count)
precision_lda_50 = pre_rec(cmatrix_lda_50, y_pred_count_lda_50)
accuracy_lda_50 = overall_accuracy(cmatrix_lda_50, y_test)
# Python 2 print statements
print precision_lda_50
print recall_lda_50
print accuracy_lda_50
print "####################################################################"
###############################################################################
###############################################################################
# 3 QDA -- fit on the training split, evaluate on the held-out test split
qda = QDA()
qda.fit(X_train, y_train)
y_predict_qda = qda.predict(X_test)
y_pred_count_qda = total_count(y_predict_qda)
cmatrix_qda = confusion_matrix(y_test, y_predict_qda)
print "\nQDA:"
print cmatrix_qda
print ""
# Precision/recall/accuracy derived from the QDA confusion matrix
recall_qda = pre_rec(cmatrix_qda, y_test_count)
precision_qda = pre_rec(cmatrix_qda, y_pred_count_qda)
accuracy_qda = overall_accuracy(cmatrix_qda, y_test)
print precision_qda
print recall_qda
print accuracy_qda
# Question D: Linear Discriminant Analysis
print('(d) ')
lda = LDA().fit(trainingData, trainingResponse)
predict_LDA = lda.predict(testData)
res = pd.crosstab(predict_LDA, testResponse.values)
print(res)
# Test error = fraction of mismatched predictions
error_rate = (predict_LDA != testResponse.values).mean()
print('LDA test error is {}'.format(error_rate))

# Question E: Quadratic Discriminant Analysis
print('(e) ')
qda = QDA().fit(trainingData, trainingResponse)
predict_QDA = qda.predict(testData)
res = pd.crosstab(predict_QDA, testResponse.values)
print(res)
error_rate = (predict_QDA != testResponse.values).mean()
print('QDA test error is {}'.format(error_rate))

# Question F: Logistic Regression
print('(f) ')
lr = LogisticRegression().fit(trainingData, trainingResponse)
predict_LR = lr.predict(testData)
res = pd.crosstab(predict_LR, testResponse.values)
print(res)
error_rate = (predict_LR != testResponse.values).mean()
print('Logistic Regression test error is {}'.format(error_rate))
# *****************************************************************************
# Quadratic Discriminant Analysis on the iris data set: fit on the full data
# and report the in-sample classification metrics.
from sklearn import datasets
from sklearn import metrics
# FIX: the sklearn.qda module was removed in scikit-learn 0.20; use the
# modern location, falling back only for very old installs.
try:
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
except ImportError:  # scikit-learn < 0.17
    from sklearn.qda import QDA

# load the iris dataset
dataset = datasets.load_iris()
# fit a QDA model to the data
model = QDA()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions (in-sample: the same data used for training)
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# Similar to LDA, but QDA does not assume the classes share one covariance
# matrix.
# FIX: the sklearn.qda module was removed in scikit-learn 0.20; use the
# modern location, falling back only for very old installs.
try:
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
except ImportError:  # scikit-learn < 0.17
    from sklearn.qda import QDA
import numpy as np

# Two tiny, linearly separable classes in 2-D.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

# Visualization: class 1 in green, class 2 in blue.
import matplotlib.pyplot as plt
plt.figure(1)
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="g")
plt.scatter(X[y == 2, 0], X[y == 2, 1], color="b")
plt.title("X Data Set Visualization")

# Classification: predict the class of one probe point.
clf = QDA()
clf = clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))
plt.show()
def _ensure_dir(path):
    """Create *path* (and parents) if missing, tolerating a concurrent-creation race."""
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError as exc:
            # Another process may have created it between the existence
            # check and makedirs; only re-raise real failures. A bare
            # 'raise' preserves the original traceback.
            if exc.errno != errno.EEXIST:
                raise


def quadraticDiscriminantAnalysis(dataFile, outputFolder, regParam, parameters):
    """Train and evaluate QDA on each train/test CSV pair listed in the
    YAML config *dataFile*; write per-split prediction CSVs, pickled
    models, feature-importance files and a summary results.yaml under
    *outputFolder*.

    regParam   -- QDA covariance regularization (reg_param).
    parameters -- dict of algorithm parameters echoed into results.yaml;
                  mutated in place to {'parameter': 'default'} when empty.
    """
    # FIX: safe_load instead of yaml.load -- the config is plain data and
    # yaml.load can instantiate arbitrary Python objects; also close the
    # handle (the original open() leaked it).
    with open(dataFile) as config_fd:
        inputData = yaml.safe_load(config_fd)
    trainingSet = inputData['training']
    testingSet = inputData['testing']
    inputFile = inputData['inputFile']
    label = inputData['label']
    resultSet = []
    modelset = []
    importanceset = []
    # Output layout (triplicated try/except boilerplate factored out).
    _ensure_dir(outputFolder)
    modelsfolder = outputFolder + "/models/"
    _ensure_dir(modelsfolder)
    importancefolder = outputFolder + "/FeatureImportance/"
    _ensure_dir(importancefolder)
    for i in range(len(trainingSet)):
        train_df = pd.read_csv(trainingSet[i])
        train_labels = train_df[label]
        train_features = train_df.drop(label, axis=1)
        test_df = pd.read_csv(testingSet[i])
        # Keep the true labels alongside the predictions in the output.
        test_predictions = pd.DataFrame(test_df[label])
        test_features = test_df.drop(label, axis=1)
        qda = QDA(reg_param=regParam)
        qda.fit(train_features, train_labels)
        modelFile = modelsfolder + "QuadraticDiscriminantAnalysisModel" + str(i + 1) + ".pkl"
        # 'with' closes the file; the original's extra fd.close() was redundant.
        with open(modelFile, 'wb') as fd:
            pickle.dump(qda, fd)
        modelset.append(modelFile)
        importanceFile = calculateFeatureImportance(train_features, qda, importancefolder, i)
        importanceset.append(importanceFile)
        test_predictions['predictions'] = qda.predict(test_features)
        resultFile = outputFolder + '/result' + str(i + 1) + '.csv'
        test_predictions.to_csv(resultFile, index=False)
        resultSet.append(resultFile)
    resultDict = dict()
    resultDict['results'] = resultSet
    resultDict['models'] = modelset
    resultDict['featureimportance'] = importanceset
    resultDict['label'] = label
    if not parameters:
        parameters['parameter'] = 'default'
    resultDict['algo_params'] = parameters
    resultDict['split_params'] = inputData['split_params']
    # Optional upstream-pipeline metadata, echoed through when present.
    if 'feature_selection_parameters' in inputData:
        resultDict['feature_selection_parameters'] = inputData['feature_selection_parameters']
        resultDict['feature_selection_algorithm'] = inputData['feature_selection_algorithm']
    if 'feature_extraction_parameters' in inputData:
        resultDict['feature_extraction_parameters'] = inputData['feature_extraction_parameters']
        resultDict['feature_extraction_algorithm'] = inputData['feature_extraction_algorithm']
    if 'preprocessing_params' in inputData:
        resultDict['preprocessing_params'] = inputData['preprocessing_params']
    resultDict['inputFile'] = inputFile
    resultDict['algorithm'] = "QuadraticDiscriminantAnalysis"
    # FIX: write through a context manager; the original leaked the handle.
    with open(outputFolder + '/results.yaml', 'w') as out_fd:
        yaml.dump(resultDict, out_fd)
# Standardize column 0 ('mean' is computed earlier, outside this view).
std = np.std(dataset[:,0])
dataset[:,0] = (dataset[:,0] - mean) / std
# Remap class label 2 -> 0 in column 10.
# NOTE(review): the label is written in column 10 but read below as the
# last column, and features take the first 14 columns -- confirm the
# dataset width, otherwise these refer to different columns.
for i in range(len(dataset)):
    if dataset[i,10] == 2:
        dataset[i, 10] = 0
target = dataset[:,-1]
data = dataset[:,:14]
expected = target
#QDA -- in-sample fit/predict; 'predicted' is overwritten by the LDA block below
clf = QDA()
clf.fit(data, target)
predicted = clf.predict(data)
#LDA -- reuses the 'clf' name; the QDA model is discarded
clf = LDA()
clf.fit(data, target)
predicted = clf.predict(data)
#Gaussian naive Bayes
model = GaussianNB()
model.fit(data, target)
print(model)
#data for prediction:
# KNN knn1 = KNeighborsClassifier(n_neighbors=2) knn1 = knn1.fit(X_train, y_train) knn1.score(X_train, y_train) knnpredict = knn1.predict(X_test) print knnpredict confusion_matrix(y_test, knnpredict) print metrics.accuracy_score(y_test, knnpredict) knn2 = KNeighborsClassifier(n_neighbors=10) knn2 = knn2.fit(X_train, y_train) knn2.score(X_train, y_train) knnpredict1 = knn2.predict(X_test) print knnpredict1 confusion_matrix(y_test, knnpredict1) print metrics.accuracy_score(y_test, knnpredict1) # QDA qda1 = QDA() qda1 = qda1.fit(X_train, y_train) qda1.score(X_train, y_train) qdapredict = qda1.predict(X_test) print qdapredict confusion_matrix(y_test, qdapredict) print metrics.accuracy_score(y_test, qdapredict) # Strategies to improve the test accuracy #We can do algorithm tuning Since machine learning algorithms are driven by parameters, these parameters will influence the outcome of learning.Objective of this is to find optimum value for each parameter to improve accuracy of the model. We should check the impact of each of these parameters on the model. # By applying ensemble methods such as bagging and boosting we can improve the accuracy of the model
def qda_fit(self):
    """Fit QDA on the stored training split and tabulate the predictions
    against the held-out test labels."""
    model = QDA()
    model.fit(self.train_X.values, self.train_y.values)
    predictions = model.predict(self.test_X.values)
    tp.output_table(predictions, self.test_y.values)
def quadraticDiscriminantAnalysis(dataFile, outputFolder, regParam,parameters): inputData = yaml.load(open(dataFile)) trainingSet = inputData['training'] testingSet = inputData['testing'] inputFile = inputData['inputFile'] label = inputData['label'] resultSet = [] if not os.path.exists(outputFolder): try: os.makedirs(outputFolder) except OSError as exc: if exc.errno != errno.EEXIST: raise exc pass for i in range(len(trainingSet)): """testPredictions = [] trainLabels = [] trainFeatures = [] trainDataSet = arff.load(trainingSet[i]) for row in trainDataSet: content = list(row) trainFeatures.append(content[0:len(content)-1]) trainLabels.append(content[len(content)-1]) testFeatures = [] testLabels = [] testDataSet = arff.load(testingSet[i]) for row in testDataSet: content = list(row) testFeatures.append(content[0:len(content)-1]) testLabels.append(content[len(content)-1])""" train_df = pd.read_csv(trainingSet[i]) train_labels = train_df[label] train_features = train_df.drop(label,axis=1) test_df = pd.read_csv(testingSet[i]) test_predictions = pd.DataFrame(test_df[label]) test_features = test_df.drop(label,axis=1) qda = QDA(reg_param=regParam) qda.fit(train_features, train_labels) test_predictions['predictions'] = qda.predict(test_features) #testPredictions = np.array(qda.predict(testFeatures)).tolist() resultFile = outputFolder + '/result' + str(i + 1) + '.csv' """with open(resultFile,'w') as outfile: outfile.write('predictions:\n') outfile.write(yaml.dump(testPredictions, default_flow_style=False)) outfile.write('true_labels:\n') outfile.write(yaml.dump(testLabels, default_flow_style=False))""" test_predictions.to_csv(resultFile,index=False) resultSet.append(resultFile) resultDict = dict() #parameters = dict() resultDict['results'] = resultSet resultDict['label'] = label #parameters['parameter.p'] = regParam if not parameters: parameters['parameter']='default' resultDict['algo_params'] = parameters resultDict['split_params'] = inputData['split_params'] if 
'feature_selection_parameters' in inputData: resultDict['feature_selection_parameters'] = inputData['feature_selection_parameters'] resultDict['feature_selection_algorithm'] = inputData['feature_selection_algorithm'] if 'feature_extraction_parameters' in inputData: resultDict['feature_extraction_parameters'] = inputData['feature_extraction_parameters'] resultDict['feature_extraction_algorithm'] = inputData['feature_extraction_algorithm'] if 'preprocessing_params' in inputData: resultDict['preprocessing_params'] = inputData['preprocessing_params'] resultDict['inputFile'] = inputFile resultDict['algorithm'] = "QuadraticDiscriminantAnalysis" yaml.dump(resultDict, open(outputFolder + '/results.yaml', 'w')) def main(args): inputFile = '' outputFolder = '' parameters=dict() regParam = 0.0 #float; regularizes the covariance estimate as [(1-reg_param)*Sigma + reg_param*np.eye(n_features)] try: opts,args = getopt.getopt(args, "i:o:p:", []) except getopt.GetoptError: print 'QuadraticDiscriminantAnalysis.py -i <inputFile> -o <outputFolder> -p <regParam>' sys.exit(2) for opt,arg in opts: if opt == '-i': inputFile = arg elif opt == '-o': outputFolder = arg elif opt == '-p': regParam = float(arg) parameters['parameter.p']=arg quadraticDiscriminantAnalysis(inputFile, outputFolder, regParam,parameters) if __name__ == "__main__": main(sys.argv[1:])
# Metrics for the preceding LDA run. pre_rec / overall_accuracy /
# total_count are project helpers defined outside this view.
# NOTE(review): recall is derived from true-label counts and precision
# from predicted-label counts -- confirm against pre_rec's definition.
recall_lda = pre_rec(cmatrix_lda, y_test_count)
precision_lda = pre_rec(cmatrix_lda, y_pred_count_lda)
accuracy_lda = overall_accuracy(cmatrix_lda, y_test)
# Python 2 print statements
print precision_lda
print recall_lda
print accuracy_lda
###############################################################################
###############################################################################
# 3 QDA -- fit on the training split, evaluate on the held-out test split
qda = QDA()
qda.fit(X_train, y_train)
y_predict_qda = qda.predict(X_test)
y_pred_count_qda = total_count(y_predict_qda)
cmatrix_qda = confusion_matrix(y_test, y_predict_qda)
print "\nQDA:"
print cmatrix_qda
print ""
# Precision/recall/accuracy derived from the QDA confusion matrix
recall_qda = pre_rec(cmatrix_qda, y_test_count)
precision_qda = pre_rec(cmatrix_qda, y_pred_count_qda)
accuracy_qda = overall_accuracy(cmatrix_qda, y_test)
print precision_qda
print recall_qda
print accuracy_qda
# Class counts: y == 0 objects versus the rest.
# NOTE(review): N_tot, X, y and the train/test splits are defined outside
# this view; naming suggests 'standard' vs 'RR' classes -- confirm upstream.
N_st = np.sum(y == 0)
N_rr = N_tot - N_st
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr
#----------------------------------------------------------------------
# perform QDA: one classifier per feature-count prefix (first nc columns)
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)
for nc in Ncolors:
    clf = QDA()
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])
    classifiers.append(clf)
    predictions.append(y_pred)
# rows of 'predictions' correspond to feature counts 1..n_features
predictions = np.array(predictions)
completeness, contamination = completeness_contamination(
    predictions, y_test)
# Python 2 print statements
print "completeness", completeness
print "contamination", contamination
#------------------------------------------------------------
# Compute the decision boundary using the classifier trained on the
# first two features (index 1 -> nc = 2)
clf = classifiers[1]
# Three probe points whose class posteriors get annotated on the plot.
display_1 = [2, 2]
display_2 = [3, 1]
display_3 = [2.5, 2.5]
# FIX: predict_log_proba expects a 2-D (n_samples, n_features) array; the
# original passed bare 1-D points, which modern scikit-learn rejects.
# Wrapping each point keeps the [0] row-extraction below unchanged.
values_proba_qda_1 = np.exp(clf.predict_log_proba([display_1]))[0]
values_proba_qda_2 = np.exp(clf.predict_log_proba([display_2]))[0]
values_proba_qda_3 = np.exp(clf.predict_log_proba([display_3]))[0]

fig3 = plt.figure()
plot_2d(X, y)
resolution_param = 500  # 500 for nice plotting, 50 for fast version
color_text = '#ff8101'
# Draw the classifier's decision regions over the data extent.
frontiere(lambda xx: clf.predict(xx), X, step=resolution_param)
# Annotate each probe point with its three class probabilities.
plt.annotate(r'' + '(%.2f' % values_proba_qda_1[0]
             + ', %.2f' % values_proba_qda_1[1]
             + ', %.2f)' % values_proba_qda_1[2],
             xy=(display_1[0], display_1[1]), xycoords='data',
             color=color_text, xytext=(-150, +100),
             textcoords='offset points', fontsize=12,
             arrowprops=dict(arrowstyle="->",
                             connectionstyle="arc3,rad=.2",
                             color=color_text))
plt.plot(display_1[0], display_1[1], 'o', color=color_text, markersize=12)
plt.annotate(r'' + '(%.2f' % values_proba_qda_2[0]
             + ', %.2f' % values_proba_qda_2[1]
             + ', %.2f)' % values_proba_qda_2[2],
             xy=(display_2[0], display_2[1]), xycoords='data',
             color=color_text, xytext=(-150, -40),
             textcoords='offset points', fontsize=12,
             arrowprops=dict(arrowstyle="->",
                             connectionstyle="arc3,rad=.2",
                             color=color_text))
# Class counts: y == 0 objects versus the rest.
# NOTE(review): N_tot, X, y and the train/test splits are defined outside
# this view; naming suggests 'standard' vs 'RR' classes -- confirm upstream.
N_st = np.sum(y == 0)
N_rr = N_tot - N_st
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr
#----------------------------------------------------------------------
# perform QDA: one classifier per feature-count prefix (first nc columns)
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)
for nc in Ncolors:
    clf = QDA()
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])
    classifiers.append(clf)
    predictions.append(y_pred)
# rows of 'predictions' correspond to feature counts 1..n_features
predictions = np.array(predictions)
completeness, contamination = completeness_contamination(predictions, y_test)
# Python 2 print statements
print "completeness", completeness
print "contamination", contamination
#------------------------------------------------------------
# Compute the decision boundary using the classifier trained on the
# first two features (index 1 -> nc = 2)
clf = classifiers[1]
xlim = (0.7, 1.35)
# Multinomial logistic regression on features extended with the x0*x1
# cross term.
model = LogisticRegression(multi_class='multinomial', solver='newton-cg', C=100)
# create extended features (append the pairwise product column)
xfeatures = np.concatenate((features, features[:, 0:1] * features[:, 1:2]), axis=1)
model.fit(xfeatures, labels[:, 0])
y_pred = model.predict(xfeatures)
y_pred = y_pred.reshape(-1, 1)
# Per-column misclassification rate against the label matrix.
aux = (y_pred != labels).astype(float)
aux = aux.sum(axis=0)
misclassificationRate = aux / labels.size
print(misclassificationRate)

print('Nearest Neighbour')
model = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
model.fit(features, labels[:, 0])
y_pred = model.predict(features)
y_pred = y_pred.reshape(-1, 1)
aux = (y_pred != labels).astype(float)
aux = aux.sum(axis=0)
misclassificationRate = aux / labels.size
print(misclassificationRate)

print('Support Vector Machine')
model = SVC(kernel='poly', degree=2, coef0=1.0, C=100)
model.fit(features, labels[:, 0])
y_pred = model.predict(features)
y_pred = y_pred.reshape(-1, 1)
aux = (y_pred != labels).astype(float)
aux = aux.sum(axis=0)
misclassificationRate = aux / labels.size
print(misclassificationRate)