def create_symbol_forecast_model(self):
    """Fit a QDA model that forecasts next-day market direction from
    the two most recent lagged S&P500 returns; prints the held-out
    error rate and returns the fitted model."""
    # Lagged S&P500 return series over the configured modelling window.
    lagged = create_lagged_series(
        self.symbol_list[0], self.model_start_date, self.model_end_date, lags=5
    )

    # Prior two days of returns predict the next day's direction.
    features = lagged[["Lag1", "Lag2"]]
    target = lagged["Direction"]

    # Chronological split: everything before the test-start date trains.
    in_train = features.index < self.model_start_test_date
    x_train, y_train = features[in_train], target[in_train]
    x_test, y_test = features[~in_train], target[~in_train]

    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)

    # Report the held-out misclassification rate.
    wrong = (y_test != model.predict(x_test)).sum()
    print("Error Rate is {0}".format(wrong * 1. / len(y_test)))
    return model
def train(self):
    """Instantiate the classifier named by self._model_selection, fit it
    on the training split and print a classification report for the
    validation split."""
    if self._model_selection == "svm":
        # selected the svc in svm
        self._classifier = svm.SVC()
    elif self._model_selection == "nb":
        self._classifier = GaussianNB()
    elif self._model_selection == "knn":
        # parameter n_jobs can be set to -1 to enable parallel calculating
        self._classifier = KNeighborsClassifier(n_neighbors=7)
    elif self._model_selection == "ada":
        # Bunch of parameters, n_estimators, learning_rate
        self._classifier = AdaBoostClassifier()
    elif self._model_selection == "rf":
        # many parameters including n_jobs
        self._classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    elif self._model_selection == "qda":
        # complicated array like parameters, perhaps leave it default
        self._classifier = QuadraticDiscriminantAnalysis()
    else:
        # BUG FIX: this was a Python 2 print statement (`print "..."`),
        # a SyntaxError under Python 3 where the rest of this block
        # already uses the print() function.
        print("Please refer to one classifier")
    self._classifier.fit(self._train_data, self._train_targets)

    # predict on valid data
    prediction_valid = self._classifier.predict(self._valid_data)

    # print validation result for selected model.
    print(
        "Classification report for classifier %s on valid_data:\n%s\n"
        % (self._model_selection, metrics.classification_report(self._valid_targets, prediction_valid))
    )
class QuadraticDiscriminantAnalysiscls(object):
    """Thin wrapper around sklearn's QuadraticDiscriminantAnalysis that
    caches its train/test inputs and logs (rather than raises) errors."""

    def __init__(self):
        self.qda_cls = QuadraticDiscriminantAnalysis()
        self.prediction = None
        self.train_x = None
        self.train_y = None
        # BUG FIX: test_x used to be created only inside predict(), so
        # calling accuracy_score() first raised AttributeError instead of
        # the intended logged error.
        self.test_x = None

    def train_model(self, train_x, train_y):
        """Fit the QDA model; failures are printed, not raised."""
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.qda_cls.fit(train_x, train_y)
        except Exception:
            # BUG FIX: bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; narrowed to Exception.
            print(traceback.format_exc())

    def predict(self, test_x):
        """Predict labels for test_x, caching both input and output."""
        try:
            self.test_x = test_x
            self.prediction = self.qda_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        """Mean accuracy on the most recently predicted test set."""
        try:
            return self.qda_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
class SNPForecastingStrategy(Strategy):
    """
    Requires:
    symbol - A stock symbol on which to form a strategy on.
    bars - A DataFrame of bars for the above symbol."""

    def __init__(self, symbol, bars):
        self.symbol = symbol
        self.bars = bars
        self.create_periods()
        self.fit_model()

    def create_periods(self):
        """Create training/test periods."""
        self.start_train = datetime.datetime(2001, 1, 10)
        self.start_test = datetime.datetime(2005, 1, 1)
        self.end_period = datetime.datetime(2005, 12, 31)

    def fit_model(self):
        """Fits a Quadratic Discriminant Analyser to the US stock
        market index (^GPSC in Yahoo)."""
        # Create a lagged series of the S&P500 US stock market index
        snpret = create_lagged_series(self.symbol, self.start_train, self.end_period, lags=5)

        # Use the prior two days of returns as
        # predictor values, with direction as the response
        X = snpret[["Lag1", "Lag2"]]
        y = snpret["Direction"]

        # Create training and test sets
        X_train = X[X.index < self.start_test]
        y_train = y[y.index < self.start_test]

        # Create the predicting factors for use
        # in direction forecasting
        self.predictors = X[X.index >= self.start_test]

        # Create the Quadratic Discriminant Analysis model
        # and the forecasting strategy
        self.model = QuadraticDiscriminantAnalysis()
        self.model.fit(X_train, y_train)

    def generate_signals(self):
        """Returns the DataFrame of symbols containing the signals
        to go long, short or hold (1, -1 or 0)."""
        signals = pd.DataFrame(index=self.bars.index)
        signals['signal'] = 0.0

        # Predict the subsequent period with the QDA model
        signals['signal'] = self.model.predict(self.predictors)

        # Remove the first five signal entries to eliminate NaN issues.
        # BUG FIX: `signals['signal'][0:5] = 0.0` is chained assignment,
        # which pandas may apply to a temporary copy
        # (SettingWithCopyWarning); .loc writes to the frame itself.
        signals.loc[signals.index[0:5], 'signal'] = 0.0
        signals['positions'] = signals['signal'].diff()
        return signals
def doQDA(x, digits, s):
    """Classify the test digits with an LDA model trained in the first
    `s` PCA coordinates and return the classification error rate.

    x      - center_matrix_SVD-style object exposing PCA scores,
             column centers and the right singular vectors V
    digits - dataset with train/test images and labels
    s      - number of principal components to keep
    """
    myLDA = LDA()
    myLDA.fit(x.PCA[:, :s], digits.train_Labels)
    # Project the test images into the same s-dimensional PCA basis.
    # NOTE(review): this line was corrupted in the original source
    # ("x.centers [email protected](...)" — an email-scrubber artifact);
    # reconstructed as (test - centers) @ V[:s].T to mirror how the
    # training PCA scores are formed — confirm against center_matrix_SVD.
    newtest = (digits.test_Images - x.centers) @ np.transpose(x.V[:s, :])
    labels = myLDA.predict(newtest)
    errors = class_error_rate(labels.reshape(1, labels.shape[0]), digits.test_Labels)
    return errors
def confusion(digits):
    """Print the confusion matrix for LDA trained on the first 50 PCA
    coordinates of the training digits."""
    myLDA = LDA()
    x = center_matrix_SVD(digits.train_Images)
    myLDA.fit(x.PCA[:, :50], digits.train_Labels)
    # Project the test images into the 50-dimensional PCA basis.
    # NOTE(review): reconstructed from a scrubber-corrupted line
    # ("x.centers [email protected](...)") — confirm against doQDA.
    newtest = (digits.test_Images - x.centers) @ np.transpose(x.V[:50, :])
    labels = myLDA.predict(newtest)
    import sklearn.metrics as f
    print(f.confusion_matrix(digits.test_Labels, labels))
def test_qda_priors():
    """Pushing almost all prior mass onto class 2 should increase the
    number of samples predicted as class 2."""
    baseline = QuadraticDiscriminantAnalysis()
    n_pos = np.sum(baseline.fit(X6, y6).predict(X6) == 2)

    eps = 1e-10
    biased = QuadraticDiscriminantAnalysis(priors=np.array([eps, 1 - eps]))
    n_pos2 = np.sum(biased.fit(X6, y6).predict(X6) == 2)

    assert_greater(n_pos2, n_pos)
def get_QDA(Xtrain, Ytrain, Xtest=None, Ytest=None, verbose=0):
    """Fit a QDA classifier on (Xtrain, Ytrain) and return it.

    With verbose == 1 the train (and, when test data is supplied, test)
    accuracies are printed.
    """
    qda = QDA()
    qda.fit(Xtrain, Ytrain)
    scores = np.empty((2))
    if verbose == 1:
        scores[0] = qda.score(Xtrain, Ytrain)
        print('QDA, train: {0:.02f}% '.format(scores[0] * 100))
        # BUG FIX: `type(Xtest) != type(None)` compared type objects;
        # the idiomatic (and ndarray-safe) check is `is not None`.
        if Xtest is not None:
            scores[1] = qda.score(Xtest, Ytest)
            print('QDA, test: {0:.02f}% '.format(scores[1] * 100))
    return qda
def QD(pth):
    """Train a QDA classifier on the bag-of-features histograms stored
    under `pth`, persist (model, classes, scaler) and run the shared
    test harness."""
    features = np.load(pth + '/training_features.npy')

    # Document-frequency based idf weights.
    # NOTE(review): computed but never applied below — confirm intent.
    occurrences = np.sum((features > 0) * 1, axis=0)
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * occurrences + 1)), 'float32')

    # Standardise the visual-word histograms before fitting.
    scaler = StandardScaler().fit(features)
    features = scaler.transform(features)

    qda_model = QuadraticDiscriminantAnalysis()
    qda_model.fit(features, np.array(train_labels))

    joblib.dump((qda_model, img_classes, scaler), pth + "/qd-bof.pkl", compress=3)
    test(pth, "qd-")
def crossValidate(attributes, outcomes, foldCount, ownFunction=True):
    """k-fold cross-validation of a Gaussian discriminant classifier.

    attributes  - flat list of samples (featLen features each)
    outcomes    - matching class labels
    foldCount   - number of folds
    ownFunction - True: hand-rolled GDA (getParams/gdaNDEstimate);
                  False: sklearn QuadraticDiscriminantAnalysis

    Returns per-fold lists: accuracy, precision, recall, F-measure, AUC.
    """
    presList = []
    recallList = []
    accrList = []
    fMeasList = []
    aucList = []
    otcmVal = list(set(outcomes))
    featLen = 4  # number of attributes per sample (fixed by the dataset)

    attrFolds = getFolds(attributes, foldCount)
    otcmFolds = getFolds(outcomes, foldCount)
    testDataList = copy.copy(attrFolds)
    testOtcmList = copy.copy(otcmFolds)

    for itr in range(foldCount):
        # All folds except `itr` form the training set.
        trainDataList = []
        trainOtcmList = []
        for intitr in range(foldCount):
            if intitr != itr:
                trainDataList.append(attrFolds[intitr])
                trainOtcmList.append(otcmFolds[intitr])
        trainDataArr = np.array(trainDataList).reshape(-1, featLen)
        trainOtcmArr = np.array(trainOtcmList).reshape(-1)
        testDataArr = np.array(testDataList[itr]).reshape(-1, featLen)
        testOtcmArr = np.array(testOtcmList[itr]).reshape(-1)

        if ownFunction:
            params = getParams(trainDataArr, trainOtcmArr, otcmVal, featLen)
            testingEstimate = gdaNDEstimate(testDataArr, params, otcmVal)
        else:
            # CLEANUP: dropped an unused predict() on the training data
            # that was computed and immediately discarded.
            clf = QuadraticDiscriminantAnalysis()
            clf.fit(trainDataArr, trainOtcmArr)
            testingEstimate = clf.predict(testDataArr)

        # Plot the ROC once, on the first fold of a binary problem.
        if itr == 0 and len(otcmVal) == 2:
            addTitle = "Own" if ownFunction else "Inbuilt"
            metric = getMetrics(testOtcmArr, testingEstimate, otcmVal, showPlot=True, title="GDA2D Versicolor,Virginica - %s" % addTitle)
        else:
            metric = getMetrics(testOtcmArr, testingEstimate, otcmVal)
        accrList.append(metric[0])
        presList.append(metric[1])
        recallList.append(metric[2])
        fMeasList.append(metric[3])
        aucList.append(metric[4])
    return accrList, presList, recallList, fMeasList, aucList
def test():
    """Plot LDA vs QDA decision boundaries and class covariances on the
    two toy datasets (shared vs differing covariance)."""
    for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
        # Linear Discriminant Analysis
        lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
        y_pred = lda.fit(X, y).predict(X)
        splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
        plot_lda_cov(lda, splot)
        plt.axis('tight')

        # Quadratic Discriminant Analysis
        # BUG FIX: the keyword is `store_covariance`; the plural
        # `store_covariances` was deprecated in scikit-learn 0.19 and
        # removed in 0.21, so this call raised TypeError on current
        # versions.
        qda = QuadraticDiscriminantAnalysis(store_covariance=True)
        y_pred = qda.fit(X, y).predict(X)
        splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
        plot_qda_cov(qda, splot)
        plt.axis('tight')
    plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
    plt.show()
class QuadraticDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Quadratic Discriminant Analysis
    '''

    def __init__(self):
        self.clf = QuadraticDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        """Fit the underlying QDA classifier."""
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        """Return class-probability predictions bundled as a DataFrame."""
        raw_probabilities = self.clf.predict_proba(X_test)
        return self.bundle_predictions(raw_probabilities)

    def get_k_best_k(self):
        """Number of features to keep for feature selection."""
        return 4
def test_qda_regularization():
    """reg_param=0 breaks down when a feature is constant; a small
    amount of regularization restores correct predictions."""
    # Default reg_param=0. mis-handles the constant-variable dataset.
    clf = QuadraticDiscriminantAnalysis()
    with ignore_warnings():
        pred = clf.fit(X2, y6).predict(X2)
    assert np.any(pred != y6)

    # A little shrinkage fixes the problem.
    clf = QuadraticDiscriminantAnalysis(reg_param=0.01)
    with ignore_warnings():
        clf.fit(X2, y6)
    assert_array_equal(clf.predict(X2), y6)

    # Case n_samples_in_a_class < n_features.
    clf = QuadraticDiscriminantAnalysis(reg_param=0.1)
    with ignore_warnings():
        clf.fit(X5, y5)
    assert_array_equal(clf.predict(X5), y5)
def create_symbol_forecast_model(self):
    """Fit and return a QDA direction-forecasting model trained on the
    two most recent lagged S&P500 returns, skipping the initial rows
    whose lags are NaN."""
    # Lagged S&P500 return series over the modelling window.
    lagged = create_lagged_series(
        self.symbol_list[0], self.model_start_date, self.model_end_date, lags=5
    )

    features = lagged[["Lag1", "Lag2"]]
    direction = lagged["Direction"]

    # The first few rows contain NaNs produced by the lagging; drop them.
    cutoff = self.model_start_date + relativedelta(days=3)
    features = features[features.index > cutoff]
    direction = direction[direction.index > cutoff]
    logging.debug(lagged[lagged.index > cutoff])

    model = QDA()
    model.fit(features, direction)
    return model
def set_up_classifier(self):
    """Fit a QDA model mapping the intraday move (Close - Open) to a
    bucketed 5-period log-return class; returns the fitted model."""
    historic_data = self.get_data()

    # Key is to identify a trend: 5-period log return of the close (in %).
    historic_data['return_5_timeframe'] = np.log(historic_data['Close'] / historic_data['Close'].shift(5)) * 100
    historic_data.fillna(0.0001, inplace=True)
    historic_data['vol_normalised'] = normalise_data(historic_data['Volume'])

    # Bucket Return
    def bucket_return(x, col):
        """Map a return to one of seven ordinal buckets in -3..3.

        BUG FIX: the original strict inequalities left boundary values
        (0, +/-0.02, +/-0.1) unmatched by every range test, so they
        silently fell through to bucket 0; the buckets are now
        contiguous half-open intervals.
        """
        r = x[col]
        if r >= 0.1:
            return 3
        if r >= 0.02:
            return 2
        if r > 0:
            return 1
        if r <= -0.1:
            return -3
        if r <= -0.02:
            return -2
        if r < 0:
            return -1
        return 0

    historic_data['Return'] = historic_data.apply(bucket_return, axis=1, args=['return_5_timeframe'])
    historic_data['Move'] = historic_data['Close'] - historic_data['Open']

    # X as predictor values, with Y as the response
    x = historic_data[["Move"]]
    y = historic_data["Return"]

    model = QuadraticDiscriminantAnalysis()
    model.fit(x, y)
    return model
def train_DA(self, X, y, lda_comp, qda_reg):
    '''
    Input:
        qda_reg - reg_param
        lda_comp - n_components
        X - data matrix (train_num, feat_num)
        y - target labels matrix (train_num, label_num)
    Output:
        best_clf - best classifier trained (QDA/LDA)
        best_score - CV score of best classifier

    Find best DA classifier via 10-fold cross-validation, refit the
    winner on the full data and return it with its CV score.
    '''
    n_samples, n_feat = X.shape
    cv_folds = 10
    kf = KFold(n_samples, cv_folds, shuffle=False)
    lda = LinearDiscriminantAnalysis(n_components=lda_comp)
    qda = QuadraticDiscriminantAnalysis(reg_param=qda_reg)

    def _cv_score(pred, truth):
        # The metric function is selected by name and resolved via eval
        # in this frame (kept from the original design).
        return eval(self.metric + '(truth[:,None], pred[:,None], "' + self.task + '")')

    score_total_lda = 0  # running total of metric score over all cv runs
    score_total_qda = 0  # running total of metric score over all cv runs
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        lda.fit(X_train, y_train)
        score_total_lda += _cv_score(lda.predict(X_test), y_test)

        qda.fit(X_train, y_train)
        # BUG FIX: the QDA score was previously computed from the LDA
        # predictions (cv_pred_lda), so both models always received the
        # same score and the model selection below was meaningless.
        score_total_qda += _cv_score(qda.predict(X_test), y_test)

    score_lda = score_total_lda / cv_folds
    score_qda = score_total_qda / cv_folds

    # We keep the best one, refit on the full dataset.
    if score_qda > score_lda:
        qda.fit(X, y)
        return qda, score_qda
    lda.fit(X, y)
    return lda, score_lda
def fit_model(self):
    """Fits a Quadratic Discriminant Analyser to the US stock
    market index (^GPSC in Yahoo)."""
    # Lagged S&P500 return series covering both train and test periods.
    lagged = create_lagged_series(self.symbol, self.start_train, self.end_period, lags=5)

    # Prior two days of returns predict direction.
    predictors = lagged[["Lag1", "Lag2"]]
    response = lagged["Direction"]

    # Train on everything strictly before the test-period start...
    X_train = predictors[predictors.index < self.start_test]
    y_train = response[response.index < self.start_test]

    # ...and keep the remainder as forecasting inputs.
    self.predictors = predictors[predictors.index >= self.start_test]

    # Fit the QDA model used later for direction forecasting.
    self.model = QuadraticDiscriminantAnalysis()
    self.model.fit(X_train, y_train)
def test_qda():
    """QDA classification smoke test: fit/predict on a toy dataset,
    probability estimates, inseparable labels, and the minimum class
    size check."""
    clf = QuadraticDiscriminantAnalysis()

    # fit + predict recover the labels on the separable 2D toy data.
    assert_array_equal(clf.fit(X6, y6).predict(X6), y6)

    # 1D data works as well.
    pred_1d = clf.fit(X7, y6).predict(X7)
    assert_array_equal(pred_1d, y6)

    # Probability estimates agree with the hard predictions...
    proba = clf.predict_proba(X7)
    assert_array_equal((proba[:, 1] > 0.5) + 1, y6)
    # ...and log-probabilities are consistent with them.
    log_proba = clf.predict_log_proba(X7)
    assert_array_almost_equal(np.exp(log_proba), proba, 8)

    # QDA shouldn't be able to separate shuffled labels.
    shuffled_pred = clf.fit(X6, y7).predict(X6)
    assert np.any(shuffled_pred != y7)

    # Classes should have at least 2 elements.
    assert_raises(ValueError, clf.fit, X6, y4)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# Candidate models as [estimator, display-name] pairs.
# NOTE(review): KNeighborsClassifier is used here but not imported in this
# chunk — presumably imported earlier in the file; confirm.
classifiers = [[KNeighborsClassifier(3), 'KNN'],
               [SVC(probability=True), 'SVC'],
               [DecisionTreeClassifier(), 'Decision Tree'],
               [RandomForestClassifier(), 'Random Forest'],
               [AdaBoostClassifier(), 'ADA booster'],
               [GradientBoostingClassifier(), 'Gradient Booster'],
               [GaussianNB(), 'Gaussian Nb'],
               [LinearDiscriminantAnalysis(), 'Linear Discriminant Analysis'],
               [QuadraticDiscriminantAnalysis(), 'Quadratic Discrimination'],
               [LogisticRegression(), 'Logistic Regression']]

# `train`/`test` come from earlier in the script; "Survived" is the label
# column — TODO confirm against the loading code.
X = train.drop("Survived", axis=1)
y = train["Survived"]
X_test = test

scores = []  # NOTE(review): never appended in this chunk; likely used below
for clf in classifiers:
    clf = clf[0]  # discard the display name, keep the estimator
    clf.fit(X, y)
    y_pred = clf.predict(X_test)
# Features: all columns but the first and last; label: destination country.
# `training`/`testing` DataFrames come from earlier in the file.
X = training.iloc[:,1:-1].values
y = training['country_destination'].values
"""
# Use Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
trans = LinearDiscriminantAnalysis(n_components=3)
trans.fit(X,y)
X = trans.transform(X)
"""

# Split Up Data (70/30 hold-out; unseeded, so results vary run to run)
x_train,x_valid,y_train,y_valid = train_test_split(X,y,test_size=0.3,random_state=None)

# Train classifier: lightly regularised QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis(reg_param=0.00001)
clf.fit(x_train,y_train)

# Run Predictions
from sklearn.metrics import confusion_matrix, accuracy_score
y_preds = clf.predict(x_valid)
print( confusion_matrix(y_valid,y_preds) );
print( "Accuracy: %f" % (accuracy_score(y_valid,y_preds)) );

# Persist the validation results for later comparison between runs.
f = open('qda_take1.txt', 'w')
f.write( str(confusion_matrix(y_valid,y_preds)) );
f.write( "\nAccuracy: %f" % (accuracy_score(y_valid,y_preds)) );
f.write( "\nclf = QuadraticDiscriminantAnalysis(0.00001)" );

# Now on to final submission
# NOTE(review): 62096 is a hard-coded test-set size; the reshape will fail
# if `testing` changes — TODO derive from the data instead.
x_final = testing.iloc[:,1:].values
y_final = clf.predict(x_final).reshape([62096,]);
import window_s_p_ft as win
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# modern versions import train_test_split from sklearn.model_selection.
from sklearn.cross_validation import train_test_split

# Average QDA accuracy over `stop` repeated random 80/20 splits, first on
# all specialisations at once.
total_score = 0
stop = 1000  # number of repeated train/test splits
for x in range(stop):
    clf = QuadraticDiscriminantAnalysis()
    data = win.getStudents()
    data_train, data_test = train_test_split(data, test_size=0.2)
    # Labels are the specialisation; features are the grade vectors.
    data_train_labels = [s.spec for s in data_train]
    data_test_labels = [s.spec for s in data_test]
    data_train = [s.grades for s in data_train]
    data_test = [s.grades for s in data_test]
    clf.fit(data_train, data_train_labels)
    total_score += clf.score(data_test, data_test_labels)
total_score = total_score / stop
print("all")
print(total_score)

# One-vs-rest: score each specialisation against "NOT <spec>".
specs = ["FK", "FM", "MN", "OE"]
for sp in specs:
    total_score = 0
    for x in range(stop):
        clf = QuadraticDiscriminantAnalysis()
        data = win.getStudents()
        data_train, data_test = train_test_split(data, test_size=0.2)
        data_train_labels = [s.spec if s.spec == sp else "NOT " + sp for s in data_train]
        data_test_labels = [s.spec if s.spec == sp else "NOT " + sp for s in data_test]
        data_train = [s.grades for s in data_train]
# ########################################################################### # get training, validation and test datasets for specified roi training_data, validation_data, test_data = ds.split_data() ########################################################################### # # CREATE MODEL # ########################################################################### # Define the estimator: quadratic discriminant analysis from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis qda = QuadraticDiscriminantAnalysis() qda.fit(training_data[0], training_data[1]) from sklearn.metrics import accuracy_score # record the best result accuracies[i] = accuracy_score(test_data[1], qda.predict(test_data[0])) mean_accuracy = accuracies.mean() print("\n\nmean accuracy: %f" % mean_accuracy) ############################################################################### # # VISUALIZE
pyplot.show() # Question 2B # Reprenons les paires de mesures, mais entrainons cette fois # differents modeles demandes avec chaque paire for (f1, f2) in pairs: # TODO Q2B # Creez ici un sous-dataset contenant seulement les # mesures designees par f1 et f2 subData = data.data[:, (f1, f2)] # TODO Q2B # Initialisez ici les differents classifieurs, dans # une liste nommee "classifieurs" classifieurs = [ QuadraticDiscriminantAnalysis(), LinearDiscriminantAnalysis(), GaussianNB(), NearestCentroid() ] # TODO Q2B # Creez ici une grille permettant d'afficher les regions de # decision pour chaque classifieur # Indice : numpy.meshgrid pourrait vous etre utile ici # N'utilisez pas un pas trop petit! x = numpy.arange(min(subData[:, 0]), max(subData[:, 0]), 0.05) y = numpy.arange(min(subData[:, 1]), max(subData[:, 1]), 0.05) xx, yy = numpy.meshgrid(x, y)
# Features are used unscaled; the "_std" names are kept for symmetry with
# a (removed or upstream) standardised variant.
train_x_std = train_x
cv_x_std = cv_x

# --- Linear discriminant analysis ---
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
lda.fit(train_x_std, train_y)
train_preds_lda = lda.predict(train_x_std)
cv_preds_lda = lda.predict(cv_x_std)
# NOTE(review): accuracy_score(y_true, y_pred) — the arguments are swapped
# here; accuracy is symmetric so the value is unaffected.
train_acc_lda = accuracy_score(train_preds_lda, train_y)
cv_acc_lda = accuracy_score(cv_preds_lda, cv_y)
print("Training accuracy for linear discriminant analysis is ", train_acc_lda)
print("CV accuracy for linear discriminant analysis is ", cv_acc_lda)

# --- Quadratic discriminant analysis ---
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
qda.fit(train_x_std, train_y)
train_preds_qda = qda.predict(train_x_std)
cv_preds_qda = qda.predict(cv_x_std)
train_acc_qda = accuracy_score(train_preds_qda, train_y)
cv_acc_qda = accuracy_score(cv_preds_qda, cv_y)
print("Training accuracy for Quadratic discriminant analysis is ", train_acc_qda)
print("CV accuracy for Quadratic discriminant analysis is ", cv_acc_qda)

# --- Logistic regression baseline ---
logReg = linear_model.LogisticRegression(C=1)
logReg.fit(train_x_std, train_y)
#Value of C=1 was obtained after some tries
def main():
    """Interactive harness: load train/test CSVs from the command line,
    let the user pick a model family through numeric menus, fit it and
    report accuracy / classification metrics.

    BUG FIX (whole function): input() returns a *string* under Python 3,
    so every `== <int>` menu comparison was always False and the script
    could only ever reach the "no ... chosen" branches. All menu answers
    are now cast with int(). The builtin-shadowing variable `type` was
    renamed to `mode` at the same time.
    """
    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print('usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset: drop the two leading id-like columns and the label
    # column from the features; the label is the last column.
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])
    print('train: {}'.format(sys.argv[1]))
    print('test: {}'.format(sys.argv[2]))
    x_train = data_train.drop([data_train.columns[0], data_train.columns[1], data_train.columns[-1]], axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop([data_test.columns[0], data_test.columns[1], data_test.columns[-1]], axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    mode = int(input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] '))
    if mode == 1:
        method = int(input('method: [1: classification, 2: regression] '))
        if method == 1:
            classifier = int(input(
                'classifier: [1: decision tree, 2: extra tree, 3: extra trees, 4: k nearest neighbor, 5: naive bayes, 6: radius neighbors, 7: random forest, 8: support vector machine, 9: gradient boosting, 10: gaussian process, 11: stochastic gradient descent, 12: passive aggressive, 13: nearest centroid, 14: perceptron, 15: multi-layer perceptron, 16: ada boost] '
            ))
            if classifier == 1:
                criterion = int(input('criterion: [1: gini, 2: entropy] '))
                if criterion == 1:
                    print(mode, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='gini')
                elif criterion == 2:
                    print(mode, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='entropy')
                else:
                    print('no criterion chosen')
                    exit()
            elif classifier == 2:
                print(mode, method, classifier)
                model = ExtraTreeClassifier()
            elif classifier == 3:
                print(mode, method, classifier)
                model = ExtraTreesClassifier()
            elif classifier == 4:
                n = int(input('n: [1: 1, 2: 3: 3: 5] '))
                if n == 1:
                    print(mode, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=1)
                elif n == 2:
                    print(mode, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=3)
                elif n == 3:
                    print(mode, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=5)
                else:
                    print('no n chosen')
                    exit()
            elif classifier == 5:
                version = int(input('version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '))
                if version == 1:
                    print(mode, method, classifier, version)
                    model = GaussianNB()
                elif version == 2:
                    print(mode, method, classifier, version)
                    model = BernoulliNB()
                elif version == 3:
                    print(mode, method, classifier, version)
                    model = MultinomialNB()
                elif version == 4:
                    print(mode, method, classifier, version)
                    model = ComplementNB()
                else:
                    print('no version chosen')
                    exit()
            elif classifier == 6:
                print(mode, method, classifier)
                model = RadiusNeighborsClassifier(radius=1.0)
            elif classifier == 7:
                print(mode, method, classifier)
                model = RandomForestClassifier(n_estimators=50, random_state=1)
            elif classifier == 8:
                print(mode, method, classifier)
                model = LinearSVC(multi_class='crammer_singer')  # multi_class='ovr'
            elif classifier == 9:
                print(mode, method, classifier)
                model = GradientBoostingClassifier()
            elif classifier == 10:
                print(mode, method, classifier)
                model = GaussianProcessClassifier(multi_class='one_vs_one')
                # model = GaussianProcessClassifier(multi_class='one_vs_rest')
            elif classifier == 11:
                print(mode, method, classifier)
                model = SGDClassifier()
            elif classifier == 12:
                print(mode, method, classifier)
                model = PassiveAggressiveClassifier()
            elif classifier == 13:
                print(mode, method, classifier)
                model = NearestCentroid()
            elif classifier == 14:
                print(mode, method, classifier)
                model = Perceptron(tol=1e-3, random_state=0)
            elif classifier == 15:
                print(mode, method, classifier)
                model = MLPClassifier()
            elif classifier == 16:
                print(mode, method, classifier)
                model = AdaBoostClassifier(n_estimators=100)
            else:
                print('no classifier chosen')
                exit()

            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)

            # predict output
            predictions = pd.Series(model.predict(x_test))
            filename = '{},{},{}.txt'.format(mode, method, classifier)
            with open(filename, 'w') as output:
                # BUG FIX: writes had no newlines (one unreadable line),
                # and rows printed y_train[i] as "actual" although the
                # predictions are for the *test* set.
                output.write('{:10}\t{:10}\t{:10}\t{:10}\n'.format('actual', 'predict', 'approximate', 'match?'))
                for i in range(len(predictions)):
                    match = y_test[i] == predictions[i]
                    output.write('{:10}\t{:10}\t{:10}\n'.format(y_test[i], predictions[i], match))
                output.write('accuracy: {:7.2f}%'.format(100 * accuracy_score(y_test, predictions)))
            print('accuracy: {:7.2f}%'.format(100 * accuracy_score(y_test, predictions)))
            print(classification_report(y_test, predictions, target_names=['RightTroll', 'LeftTroll', 'Other']))
            print(confusion_matrix(y_test, predictions, labels=["RightTroll", "LeftTroll", "Other"]))
        elif method == 2:
            # transform into binary classification problem
            # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
            # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)

            # transform string labels into integers
            # le = LabelEncoder()
            # le.fit(y_train)
            # print(le.transform(['LeftTroll', 'Other', 'Other', 'RightTroll'])), print(le.inverse_transform([0, 1, 2, 1]))
            # print(le.classes_)
            #
            # y_train = le.transform(y_train)
            # y_test = le.transform(y_test)
            regressor = int(input(
                'regressor: [1: linear discriminant analysis, 2: logistic regression, 3: ridge regression, 4: quadratic discriminant analysis, 5: linear regression, 6: decision tree regression, 7: pls regression, 8: pls canonical, 9: canonical correlation analysis, 10: lasso, 11: multi-task lasso, 12: elastic net, 13: multi-task elastic net, 14: least angle regression, 15: least angle regression lasso, 16: orthogonal matching pursuit, 17: bayesian ridge, 18: automatic relevence determination, 19: theil sen regression, 20: huber regressor, 21: random sample consensus] '
            ))
            if regressor == 1:
                print(mode, method, regressor)
                model = LinearDiscriminantAnalysis()
            elif regressor == 2:
                print(mode, method, regressor)
                model = LogisticRegression(solver='lbfgs', multi_class='multinomial')  # 'newton-cg'
            elif regressor == 3:
                print(mode, method, regressor)
                model = RidgeClassifier()
            elif regressor == 4:
                print(mode, method, regressor)
                model = QuadraticDiscriminantAnalysis()
            elif regressor == 5:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(mode, method, strategy, regressor)
                    model = OneVsRestClassifier(LinearRegression())
                elif strategy == 2:
                    print(mode, method, strategy, regressor)
                    model = OneVsOneClassifier(LinearRegression())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 6:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(mode, method, strategy, regressor)
                    model = OneVsRestClassifier(DecisionTreeRegressor())
                elif strategy == 2:
                    print(mode, method, strategy, regressor)
                    model = OneVsOneClassifier(DecisionTreeRegressor())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 7:
                print(mode, method, regressor)
                model = PLSRegression(n_components=2)
            elif regressor == 8:
                print(mode, method, regressor)
                model = PLSCanonical(n_components=2)
            elif regressor == 9:
                print(mode, method, regressor)
                model = CCA(n_components=1)
            elif regressor == 10:
                print(mode, method, regressor)
                model = Lasso(alpha=0.1)
            elif regressor == 11:
                print(mode, method, regressor)
                model = MultiTaskLasso(alpha=0.1)
            elif regressor == 12:
                print(mode, method, regressor)
                model = ElasticNet(random_state=0)
            elif regressor == 13:
                print(mode, method, regressor)
                model = MultiTaskElasticNet(random_state=0)
            elif regressor == 14:
                print(mode, method, regressor)
                model = Lars(n_nonzero_coefs=1)
            elif regressor == 15:
                print(mode, method, regressor)
                model = LassoLars(alpha=.1)
            elif regressor == 16:
                print(mode, method, regressor)
                model = OrthogonalMatchingPursuit()
            elif regressor == 17:
                print(mode, method, regressor)
                model = BayesianRidge()
            elif regressor == 18:
                print(mode, method, regressor)
                model = ARDRegression()
            elif regressor == 19:
                print(mode, method, regressor)
                model = TheilSenRegressor(random_state=0)
            elif regressor == 20:
                print(mode, method, regressor)
                model = HuberRegressor()
            elif regressor == 21:
                print(mode, method, regressor)
                model = RANSACRegressor(random_state=0)
            else:
                print('no regressor chosen')
                exit()

            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)
            # print('coefficient:', model.coef_)
            # print('intercept:', model.intercept_)

            # predict output
            predictions = pd.Series(model.predict(x_test))
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
            # calculate accuracy
            numerator = 0.0
            denominator = float(len(predictions))
            for i in range(len(predictions)):
                # BUG FIX: was y_train[i]; the actual labels for these
                # predictions come from the test set.
                match = y_test[i] == predictions[i]
                numerator += 1 if match else 0
                print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i], match))
            print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
        else:
            print('no method chosen')
            exit()
    elif mode == 2:
        classifier = int(input('classifier: [1: label propagation, 2: label spreading] '))
        if classifier == 1:
            print(mode, classifier)
            model = LabelPropagation()
        elif classifier == 2:
            print(mode, classifier)
            model = LabelSpreading()
        else:
            print('no classifier chosen')
            exit()

        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
        # calculate accuracy
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = y_test[i] == predictions[i]
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i], match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    elif mode == 3:
        method = int(input('method: [1: clustering, 2: random trees embedding, 3: nearest neighbors] '))
        if method == 1:
            clusterer = int(input('clustere: [1: k means]'))
            if clusterer == 1:
                clusters = int(input('clusters: [1: 1, 2: 2, 3: 3] '))
                if clusters == 1:
                    print(mode, method, clusters)
                    model = KMeans(n_clusters=1, random_state=0)
                elif clusters == 2:
                    print(mode, method, clusters)
                    model = KMeans(n_clusters=2, random_state=0)
                elif clusters == 3:
                    print(mode, method, clusters)
                    model = KMeans(n_clusters=3, random_state=0)
                else:
                    print('no clusters chosen')
                    exit()
            else:
                print('no clusterer chosen')
                exit()

            # train the model using the training sets and check score
            model.fit(x_train)

            # predict output
            predictions = model.predict(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
            # check details
            # BUG FIX: `'centroids: ' + model.cluster_centers_` concatenated
            # str + ndarray and raised TypeError; stringify first.
            print('centroids: ' + str(model.cluster_centers_))
            # print('labels: ' + model.labels_)
        elif method == 2:
            model = RandomTreesEmbedding()
            # train the model using the training sets and check score
            model.fit(x_train)
            # predict output (leaf indices, not class labels)
            predictions = model.apply(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
        elif method == 3:
            model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
            # train the model using the training sets and check score
            model.fit(x_train)
            # BUG FIX: queried the undefined names `nbrs`/`X` (NameError);
            # query the fitted model on the test features instead.
            distances, indices = model.kneighbors(x_test)
        else:
            print('no method chosen')
            exit()

        # calculate accuracy
        # NOTE(review): for methods 2/3 `predictions` are embeddings or may
        # be undefined, so this comparison is only meaningful for the
        # clustering branch — kept as in the original.
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = y_test[i] == predictions[i]
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i], match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    else:
        print('no type chosen')
        exit()
df = pd.DataFrame()
# Repeat the experiment over 20 stratified random splits, recording the
# best scaler choice and train/test accuracies for each seed.
for i in range(1, 21):
    #Train test split (seeded per iteration for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(public_data, public_labels, test_size=0.3, stratify=public_labels, random_state=i*500)

    # Encode the string labels as integers.
    # NOTE(review): the encoded labels are not used below — grid.fit is
    # given the raw y_train; confirm whether this is intentional.
    train_labels_encoded = encoder.fit_transform(y_train)
    test_labels_encoded = encoder.transform(y_test)

    # QuadraticDiscriminantAnalysis pipeline: the scaler choice is the
    # only hyper-parameter searched by the grid.
    steps = [('scaler', MinMaxScaler()), ('clf', QuadraticDiscriminantAnalysis())]
    pipeline = Pipeline(steps)
    parameteres = [{'scaler':scalers_to_test}]
    grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    score_train = grid.score(X_train, y_train)
    score_test = grid.score(X_test, y_test)
    best_p = grid.best_params_
    bp = pd.DataFrame(best_p, index=[i])
    bp['accuracy_train'] = score_train
import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Importing the Iris dataset
# Dataset 1 - Iris Setosa
#         2 - Iris Versicolour
#         3 - Iris Virginica
# NOTE(review): the stock iris.data encodes the species as strings; this
# loadtxt call requires a fully numeric file — confirm the local copy is
# preprocessed with numeric class labels.
samples = np.loadtxt('../../Dataset/iris/iris.data', delimiter=',')

# Extracting only Petal length and width
# (columns 2 and 3; column 4 is the class label)
X = samples[:, 2:4]
y = samples[:, 4]

# Performing LDA
lda = LDA()
lda.fit(X, y)

# Performing QDA
qda = QDA()
qda.fit(X, y)

# Refit QDA for a sweep of covariance regularization strengths.
# Each iteration rebinds `qda`, so only the last fit survives the loop.
reg_param = [0.001, 0.01, 0.1, 1, 10]
for r in reg_param:
    qda = QDA(reg_param=r)
    qda.fit(X, y)
def quadratic_discriminant(x_train, y_train, x_test, y_test):
    """Fit and evaluate a QDA classifier through the shared model-fitting helper."""
    return __fit_clf_model(
        'quadratic_discriminant',
        QuadraticDiscriminantAnalysis(),
        x_train,
        y_train,
        x_test,
        y_test,
    )
def train_classifiers(X, y, comment):
    """Train a battery of classifiers with hyperparameter search.

    Parameters
    ----------
    X : np.array
        Feature matrix.
    y : np.array
        Class labels.
    comment : str
        Comment used for the ROC plot.

    Returns
    -------
    pd.DataFrame
        Trained hyperparameter scoring (from ``ParameterEstimator.score()``).
    """
    # BUG FIX: the original docstring was quoted as `"""""..."""` followed by a
    # stray `""` expression statement; it is now a single clean docstring.

    # Imports are local so the heavy estimator modules load only on demand.
    from sklearn.svm import SVC
    from xgboost import XGBClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import AdaBoostClassifier

    # Candidate estimators, keyed by name.  Keys must match tuned_parameters.
    models = {
        'SVC': SVC(probability=True),
        'GaussianNB': GaussianNB(),
        'LogisticRegression': LogisticRegression(),
        'MLPClassifier': MLPClassifier(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'XGBClassifier': XGBClassifier(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'RandomForestClassifier': RandomForestClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier()
    }

    # Per-model hyperparameter grids (empty dict = use estimator defaults).
    tuned_parameters = {
        'SVC': {
            'kernel': ['rbf', 'linear'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100, 1000, 10000]
        },
        'GaussianNB': {},
        'LogisticRegression': {
            'C': [0.1, 0.5, 1, 10, 100]
        },
        'MLPClassifier': [{
            'solver': ['lbfgs', 'sgd', 'adam']
        }, {
            'alpha': [0.0001, 0.00001, 0.0010]
        }],
        'KNeighborsClassifier': {
            'n_neighbors': [5, 10, 15, 20]
        },
        'XGBClassifier': {},
        'QuadraticDiscriminantAnalysis': {},
        'ExtraTreesClassifier': {
            'n_estimators': [16, 32]
        },
        'RandomForestClassifier': [{
            'n_estimators': [16, 32]
        }, {
            'criterion': ['gini', 'entropy'],
            'n_estimators': [8, 16]
        }],
        'AdaBoostClassifier': {
            'n_estimators': [16, 32]
        },
        'GradientBoostingClassifier': {
            'n_estimators': [16, 32],
            'learning_rate': [0.8, 1.0]
        }
    }

    # Hold out 30% of the data for the final evaluation pass.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=0)
    trainer = ParameterEstimator(models, tuned_parameters)
    trainer.fit(X_train, y_train, scoring='f1', n_jobs=2)
    trainer.fit_test(X_test, y_test, X_train, y_train, trainer.score(), comment)
    return trainer.score()
classifier = AdaBoostClassifier() elif model_name == model_names[3]: print('Using Random Forest Classifier.') classifier = RandomForestClassifier(class_weight='balanced') elif model_name == model_names[4]: print('Using Naive Bayes Classifier.') classifier = GaussianNB() elif model_name == model_names[5]: print('Using Support Vector Machines Classifier.') classifier = SVC(probability=True, class_weight='balanced') elif model_name == model_names[6]: print('Using Linear Discriminant Analysis.') classifier = LinearDiscriminantAnalysis() elif model_name == model_names[7]: print('Using Quadratic Discriminant Analysis.') classifier = QuadraticDiscriminantAnalysis() else: print('Unknown option for classifier, using naive bayes as default one!') model_name = 'naive_bayes' classifier = GaussianNB() # Feature Scaling sc = StandardScaler() print('Using Repeated Stratified K-Fold Cross Validation.\n') # Using Repeated Stratified K-Fold Cross Validation rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=36851234) accuracy_list = [] pr_auc_list = [] roc_auc_list = []
def learn(self, fname):
    """Load a labelled CSV file and fit a battery of classifiers on it.

    The first CSV row is the header; each following row carries the class
    label in column 0 and the features in the remaining columns.  Empty
    cells become 0 and non-numeric cells are mapped to integer codes kept
    in ``self.naming``.  Fitted estimators land in ``self.algorithms``.

    Parameters
    ----------
    fname : str
        Path to the CSV file to learn from.
    """
    t = time.time()
    # --- Parse the CSV into numeric rows -------------------------------
    self.header = []
    rows = []
    # NOTE(review): restarting this counter at 0 can clobber entries in
    # self.naming['to'] left over from a previous learn() call — confirm
    # whether repeated calls on one instance are intended.
    naming_num = 0
    with open(fname, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for i, row in enumerate(reader):
            if i == 0:
                self.header = row
                continue
            for j, val in enumerate(row):
                if val == '':
                    row[j] = 0
                    continue
                try:
                    row[j] = float(val)
                except ValueError:
                    # Non-numeric value: assign (or reuse) an integer code.
                    if val not in self.naming['from']:
                        self.naming['from'][val] = naming_num
                        self.naming['to'][naming_num] = val
                        naming_num += 1
                    row[j] = self.naming['from'][val]
            rows.append(row)

    # First column of each row is the classification target y.
    y = numpy.zeros(len(rows))
    x = numpy.zeros((len(rows), len(rows[0]) - 1))

    # Shuffle the rows before training.
    record_range = list(range(len(rows)))
    shuffle(record_range)
    for i in record_range:
        y[i] = rows[i][0]
        x[i, :] = numpy.array(rows[i][1:])

    names = [
        "Nearest Neighbors",
        "Linear SVM",
        "RBF SVM",
        # "Gaussian Process",  # takes too long
        "Decision Tree",
        "Random Forest",
        "Neural Net",
        "AdaBoost",
        "Naive Bayes",
        "QDA"
    ]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025, probability=True),
        SVC(gamma=2, C=1, probability=True),
        # GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),  # too slow
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis()
    ]

    self.algorithms = {}
    for name, clf in zip(names, classifiers):
        t2 = time.time()
        self.algorithms[name] = clf
        try:
            self.algorithms[name].fit(x, y)
        except Exception:
            # Best-effort: a classifier that cannot fit this data is kept
            # in the table but simply left untrained.
            pass
        # BUG FIX: elapsed time was computed as (t2 - time.time()), which is
        # always negative; operands are now in the correct order.
        self.logger.debug("learned {}, {:d} ms".format(
            name, int(1000 * (time.time() - t2))))

    t2 = time.time()
    name = "Extended Naive Bayes"
    clf = ExtendedNaiveBayes(self.family, path_to_data=self.path_to_data)
    clf.fit(fname)
    self.logger.debug("learned {}, {:d} ms".format(
        name, int(1000 * (time.time() - t2))))
    # BUG FIX: same sign error for the total duration.
    self.logger.debug("{:d} ms".format(int(1000 * (time.time() - t))))
def main():
    """Run the experiment: fit several classifiers on every data fold and
    report timing/accuracy, writing a markdown log to classifier-comp.md.

    Historic alternatives (RBM pipelines, TensorFlow nets, SVM variants,
    k-NN, gradient boosting) existed only as commented-out dead code and
    were removed for readability.
    """
    # Classifiers to compare: (display name, estimator).
    classifiers = [
        ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
        ('Random Forest', RandomForestClassifier(n_estimators=50,
                                                 n_jobs=10,
                                                 max_features=50)),
        ('AdaBoost', AdaBoostClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('QDA', QuadraticDiscriminantAnalysis()),
    ]
    data = get_data('hasy')

    # Fit them all, logging per-fold results to the markdown report.
    classifier_data = {}
    with open('classifier-comp.md', 'w') as f:
        for clf_name, clf in classifiers:
            print(clf_name)
            classifier_data[clf_name] = []
            f.write("#" * 80)
            f.write("\n")
            f.write("Start fitting '%s' classifier.\n" % clf_name)
            for fold in range(len(data)):
                print(data[fold]['test']['X'].shape)
                print(data[fold]['test']['y'].shape)
                print("Got %i training samples and %i test samples." %
                      (len(data[fold]['train']['X']),
                       len(data[fold]['test']['X'])))
                t0 = time.time()
                # Cap on training examples; 10**9 effectively means "use all".
                examples = 10**9
                clf.fit(data[fold]['train']['X'][:examples],
                        data[fold]['train']['y'][:examples])
                t1 = time.time()
                an_data = analyze(clf,
                                  data[fold],
                                  t1 - t0,
                                  clf_name=clf_name,
                                  handle=f)
                classifier_data[clf_name].append({
                    'training_time': t1 - t0,
                    'testing_time': an_data['testing_time'],
                    'accuracy': an_data['accuracy']
                })
    pretty_print(classifier_data)
splot.set_xticks(()) splot.set_yticks(()) def plot_lda_cov(lda, splot): plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red') plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue') def plot_qda_cov(qda, splot): plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red') plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue') ############################################################################### for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]): # Linear Discriminant Analysis lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True) y_pred = lda.fit(X, y).predict(X) splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1) plot_lda_cov(lda, splot) plt.axis('tight') # Quadratic Discriminant Analysis qda = QuadraticDiscriminantAnalysis() y_pred = qda.fit(X, y, store_covariances=True).predict(X) splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2) plot_qda_cov(qda, splot) plt.axis('tight') plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis') plt.show()
import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import StratifiedShuffleSplit from sklearn.metrics import accuracy_score, log_loss from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier from sklearn.naive_bayes import GaussianNB from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis from sklearn.linear_model import LogisticRegression classifier = [ KNeighborsClassifier(3), SVC(probability= True), DecisionTreeClassifier(), RandomForestClassifier(), AdaBoostClassifier(), GradientBoostingClassifier(), GaussianNB(), LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis(), LogisticRegression()] log = pd.DataFrame(columns=["Classifier", "Accuracy"]) stshsp = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0) a = train[0::, 1::] b = train[0::, 0] acct_dict = {} for train_index, test_index in stshsp.split(a,b): a_train, a_test = a[train_index], a[test_index] b_train, b_test = b[train_index], b[test_index]
def __new__(cls, y, clf='lda', kern='rbf', n_knn=10, n_tree=100,
            priors=False, **kwargs):
    """Factory: build and return a configured scikit-learn classifier.

    Note this ``__new__`` deliberately returns the estimator itself, not an
    instance of this class; the class is used purely as a factory.  (The
    first parameter is named ``cls`` since ``__new__`` is implicitly a
    static method receiving the class.)

    Parameters
    ----------
    y : array-like
        Class labels; only used to derive uniform default priors.
    clf : str | int | estimator
        Selector: 'lda'/0, 'svm'/1, 'linearsvm'/2, 'nusvm'/3, 'nb'/4,
        'knn'/5, 'rf'/6, 'lr'/7, 'qda'/8 — or an already-built estimator,
        returned as-is and tagged 'custom'.
    kern : str
        SVM kernel: 'linear', 'poly', 'rbf', 'sigmoid' or 'precomputed'.
    n_knn : int
        Number of neighbors for KNN.
    n_tree : int
        Number of trees for the random forest.
    priors : any
        NOTE(review): overwritten with uniform priors whenever ``clf`` is a
        str/int selector, so a caller-supplied value is ignored — confirm.
    **kwargs
        Forwarded to the underlying estimator constructor.

    Returns
    -------
    estimator
        Tagged with ``shStr`` (short label) and ``lgStr`` (long label).

    Raises
    ------
    ValueError
        If ``clf`` is a str/int that matches no known classifier.
    """
    # Use a pre-defined classifier:
    if isinstance(clf, (str, int)):
        # Default priors: uniform over the observed classes.
        priors = np.array([1 / len(np.unique(y))] * len(np.unique(y)))
        if isinstance(clf, str):
            clf = clf.lower()
        # LDA :
        if clf == 'lda' or clf == 0:
            clfObj = LinearDiscriminantAnalysis(priors=priors, **kwargs)
            clfObj.lgStr = 'Linear Discriminant Analysis'
            clfObj.shStr = 'LDA'
        # SVM : 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable
        elif clf == 'svm' or clf == 1:
            clfObj = SVC(kernel=kern, probability=True, **kwargs)
            clfObj.lgStr = 'Support Vector Machine (kernel=' + kern + ')'
            clfObj.shStr = 'SVM-' + kern
        # Linear SVM:
        elif clf == 'linearsvm' or clf == 2:
            clfObj = LinearSVC(**kwargs)
            clfObj.lgStr = 'Linear Support Vector Machine'
            clfObj.shStr = 'LSVM'
        # Nu SVM :
        elif clf == 'nusvm' or clf == 3:
            clfObj = NuSVC(**kwargs)
            clfObj.lgStr = 'Nu Support Vector Machine'
            clfObj.shStr = 'NuSVM'
        # Naive Bayesian :
        elif clf == 'nb' or clf == 4:
            clfObj = GaussianNB(**kwargs)
            clfObj.lgStr = 'Naive Baysian'
            clfObj.shStr = 'NB'
        # KNN :
        elif clf == 'knn' or clf == 5:
            clfObj = KNeighborsClassifier(n_neighbors=n_knn, **kwargs)
            clfObj.lgStr = 'k-Nearest Neighbor (neighbor=' + str(
                n_knn) + ')'
            clfObj.shStr = 'KNN-' + str(n_knn)
        # Random forest :
        elif clf == 'rf' or clf == 6:
            clfObj = RandomForestClassifier(n_estimators=n_tree, **kwargs)
            clfObj.lgStr = 'Random Forest (tree=' + str(n_tree) + ')'
            clfObj.shStr = 'RF-' + str(n_tree)
        # Logistic regression :
        elif clf == 'lr' or clf == 7:
            clfObj = LogisticRegression(**kwargs)
            clfObj.lgStr = 'Logistic Regression'
            clfObj.shStr = 'LogReg'
        # QDA :
        elif clf == 'qda' or clf == 8:
            clfObj = QuadraticDiscriminantAnalysis(**kwargs)
            clfObj.lgStr = 'Quadratic Discriminant Analysis'
            clfObj.shStr = 'QDA'
        else:
            # BUG FIX: message previously ended in a doubled quote ('""').
            raise ValueError('No classifier "' + str(clf) + '" found')
    # Use a custom classifier :
    else:
        clfObj = clf
        clfObj.shStr = 'custom'
        clfObj.lgStr = 'Custom classifier'
    return clfObj
log_model = lr_model if log == 'nb': from sklearn.naive_bayes import GaussianNB lr_model = GaussianNB() log_model = lr_model if log == 'knn': from sklearn.neighbors import KNeighborsClassifier lr_model = KNeighborsClassifier(n_neighbors=35, weights='distance') log_model = lr_model if log == 'lda': from sklearn.discriminant_analysis import LinearDiscriminantAnalysis lr_model = LinearDiscriminantAnalysis() log_model = lr_model if log == 'qda': from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis lr_model = QuadraticDiscriminantAnalysis() log_model = lr_model # Run CV for i, (train_index, test_index) in enumerate(kf.split(X)): # Create data for this fold y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index] X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[ test_index, :].copy() X_test = test_df.copy() print("\nFold ", i) # Run model for this fold fit_model = log_model.fit(X_train, y_train)
def save_qda(X, y):
    """Fit a QDA model on (X, y), predict on the same data, and write the
    resulting classification report to ./results/Qda.txt.

    BUG FIX: the report file was opened but never closed; a context manager
    now guarantees it is flushed and closed even if the write fails.
    """
    qda = QDA().fit(X, y)
    y_pred = qda.predict(X)
    with open("./results/Qda.txt", "w") as f:
        f.write(get_model_report(y, y_pred))
"Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process", "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes", "QDA", "Logistic Regression", "Logistic Regression CV" ] classifiers = [ KNeighborsClassifier(3), SVC(kernel="linear", C=0.025, probability=True), SVC(gamma=2, C=1, probability=True), GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), MLPClassifier(alpha=1), AdaBoostClassifier(), GaussianNB(), QuadraticDiscriminantAnalysis(), LogisticRegression(), LogisticRegressionCV() ] def merge_row(nparr1, nparr2): return np.append(nparr1, nparr2, axis=1) def average(nparr1, nparr2, r=(1.0, 1.0)): return (r[0] * nparr1) + (r[1] * nparr2) merge_types = ["merge", "average"] merge_funcs = [merge_row, average]
def qda_classifier(Xtrain, ytrain, Xtest, ytest):
    """Train a QDA model on the training split and score its test predictions."""
    model = QuadraticDiscriminantAnalysis()
    model.fit(Xtrain, ytrain)
    predictions = model.predict(Xtest)
    return evaluator('QDA', predictions, ytest)
X = X[y > 0] y = y[y > 0] y -= 1 target_names = iris.target_names[1:] ################################################################################ # LDA lda = LDA() print(X,"JJJJ") print(y,"UUUUU") y_pred = lda.fit(X, y) print(lda.coef_) print(lda.intercept_) # QDA qda = QDA() y_pred = qda.fit(X, y).predict(X) ############################################################################### # Plot results def plot_ellipse(splot, mean, cov, color): v, w = linalg.eigh(cov) u = w[0] / linalg.norm(w[0]) angle = np.arctan(u[1]/u[0]) angle = 180 * angle / np.pi # convert to degrees # filled gaussian at 2 standard deviation ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5, 180 + angle, color=color) ell.set_clip_box(splot.bbox) ell.set_alpha(0.5)
def discriminatePlot(X, y, cVal, titleStr=''):
    # Frederic's Robust Wrapper for discriminant analysis function. Performs lda, qda and RF afer error checking,
    # Generates nice plots and returns cross-validated
    # performance, stderr and base line.
    # X np array n rows x p parameters
    # y group labels n rows
    # rgb color code for each data point - should be the same for each data beloging to the same group
    # titleStr title for plots
    # returns: ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
    # NOTE: written for Python 2 (print statements) and the legacy
    # sklearn.cross_validation API.

    # Global Parameters
    CVFOLDS = 10            # target number of cross-validation folds
    MINCOUNT = 10           # minimum samples a class needs to be analysed
    MINCOUNTTRAINING = 5    # minimum samples per class inside a training fold

    # Initialize Variables and clean up data: drop classes with < MINCOUNT samples.
    classes, classesCount = np.unique(y, return_counts = True)  # Classes to be discriminated should be same as ldaMod.classes_
    goodIndClasses = np.array([n >= MINCOUNT for n in classesCount])
    goodInd = np.array([b in classes[goodIndClasses] for b in y])
    yGood = y[goodInd]
    XGood = X[goodInd]
    cValGood = cVal[goodInd]
    classes, classesCount = np.unique(yGood, return_counts = True)
    nClasses = classes.size         # Number of classes or groups

    # Do we have enough data?  Sentinel -1s signal failure to the caller.
    if (nClasses < 2):
        print 'Error in ldaPLot: Insufficient classes with minimun data (%d) for discrimination analysis' % (MINCOUNT)
        return -1, -1, -1, -1, -1, -1, -1
    # Fold count is capped by the smallest class size.
    cvFolds = min(min(classesCount), CVFOLDS)
    if (cvFolds < CVFOLDS):
        print 'Warning in ldaPlot: Cross-validation performed with %d folds (instead of %d)' % (cvFolds, CVFOLDS)

    # Data size and color values
    nD = XGood.shape[1]     # number of features in X
    nX = XGood.shape[0]     # number of data points in X
    cClasses = []           # Color code for each class (RGB + alpha=1.0)
    for cl in classes:
        icl = (yGood == cl).nonzero()[0][0]
        cClasses.append(np.append(cValGood[icl], 1.0))
    cClasses = np.asarray(cClasses)
    myPrior = np.ones(nClasses)*(1.0/nClasses)   # uniform class priors

    # Perform a PCA for dimensionality reduction so that the covariance matrix can be fitted.
    # Heuristic: at most sqrt(nX/5) dimensions.
    nDmax = int(np.fix(np.sqrt(nX/5)))
    if nDmax < nD:
        print 'Warning: Insufficient data for', nD, 'parameters. PCA projection to', nDmax, 'dimensions.'
    nDmax = min(nD, nDmax)
    pca = PCA(n_components=nDmax)
    Xr = pca.fit_transform(XGood)
    print 'Variance explained is %.2f%%' % (sum(pca.explained_variance_ratio_)*100.0)

    # Initialise Classifiers
    ldaMod = LDA(n_components = min(nDmax, nClasses-1), priors = myPrior, shrinkage = None, solver = 'svd')
    qdaMod = QDA(priors = myPrior)
    rfMod = RF()  # by default assumes equal weights

    # Perform CVFOLDS fold cross-validation to get performance of classifiers.
    ldaScores = np.zeros(cvFolds)
    qdaScores = np.zeros(cvFolds)
    rfScores = np.zeros(cvFolds)
    skf = cross_validation.StratifiedKFold(yGood, cvFolds)
    iskf = 0   # counts folds actually scored (folds can be skipped below)
    for train, test in skf:
        # Enforce the MINCOUNT in each class for Training
        trainClasses, trainCount = np.unique(yGood[train], return_counts=True)
        goodIndClasses = np.array([n >= MINCOUNTTRAINING for n in trainCount])
        goodIndTrain = np.array([b in trainClasses[goodIndClasses] for b in yGood[train]])

        # Specity the training data set, the number of groups and priors
        yTrain = yGood[train[goodIndTrain]]
        XrTrain = Xr[train[goodIndTrain]]
        trainClasses, trainCount = np.unique(yTrain, return_counts=True)
        ntrainClasses = trainClasses.size

        # Skip this cross-validation fold because of insufficient data
        if ntrainClasses < 2:
            continue
        goodInd = np.array([b in trainClasses for b in yGood[test]])
        if (goodInd.size == 0):
            continue

        # Fit the data: priors are re-derived per fold, mutating the shared
        # estimator objects before fitting (restored after the loop).
        trainPriors = np.ones(ntrainClasses)*(1.0/ntrainClasses)
        ldaMod.priors = trainPriors
        qdaMod.priors = trainPriors
        ldaMod.fit(XrTrain, yTrain)
        qdaMod.fit(XrTrain, yTrain)
        rfMod.fit(XrTrain, yTrain)
        ldaScores[iskf] = ldaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        qdaScores[iskf] = qdaMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        rfScores[iskf] = rfMod.score(Xr[test[goodInd]], yGood[test[goodInd]])
        iskf += 1

    # If folds were skipped, shrink the bookkeeping to the folds scored.
    if (iskf != cvFolds):
        cvFolds = iskf
        # NOTE(review): ndarray.reshape returns a new array and is discarded
        # here, so the trailing zero entries still dilute .mean()/.std()
        # below — confirm whether slicing [:cvFolds] was intended.
        ldaScores.reshape(cvFolds)
        qdaScores.reshape(cvFolds)
        rfScores.reshape(cvFolds)

    # Refit with all the data for the plots
    ldaMod.priors = myPrior
    qdaMod.priors = myPrior
    Xrr = ldaMod.fit_transform(Xr, yGood)

    # Check labels
    for a, b in zip(classes, ldaMod.classes_):
        if a != b:
            print 'Error in ldaPlot: labels do not match'

    # Print the coefficients of first 3 DFA
    print 'LDA Weights:'
    print 'DFA1:', ldaMod.coef_[0,:]
    if nClasses > 2:
        print 'DFA2:', ldaMod.coef_[1,:]
    if nClasses > 3:
        print 'DFA3:', ldaMod.coef_[2,:]

    # Obtain fits in this rotated space for display purposes
    ldaMod.fit(Xrr, yGood)
    qdaMod.fit(Xrr, yGood)
    rfMod.fit(Xrr, yGood)
    XrrMean = Xrr.mean(0)

    # Make a mesh for plotting over [-6, 6] x [-6, 6] in the first two DFs;
    # remaining dimensions are pinned to their mean value.
    x1, x2 = np.meshgrid(np.arange(-6.0, 6.0, 0.1), np.arange(-6.0, 6.0, 0.1))
    xm1 = np.reshape(x1, -1)
    xm2 = np.reshape(x2, -1)
    nxm = np.size(xm1)
    Xm = np.zeros((nxm, Xrr.shape[1]))
    Xm[:,0] = xm1
    if Xrr.shape[1] > 1 :
        Xm[:,1] = xm2
    for ix in range(2, Xrr.shape[1]):
        Xm[:,ix] = np.squeeze(np.ones((nxm,1)))*XrrMean[ix]
    XmcLDA = np.zeros((nxm, 4))  # RGBA values for color for LDA
    XmcQDA = np.zeros((nxm, 4))  # RGBA values for color for QDA
    XmcRF = np.zeros((nxm, 4))   # RGBA values for color for RF

    # Predict values on mesh for plotting based on the first two DFs
    yPredLDA = ldaMod.predict_proba(Xm)
    yPredQDA = qdaMod.predict_proba(Xm)
    yPredRF = rfMod.predict_proba(Xm)

    # Transform the predictions in color codes
    maxLDA = yPredLDA.max()
    for ix in range(nxm) :
        cWeight = yPredLDA[ix,:]                                 # Prob for all classes
        cWinner = ((cWeight == cWeight.max()).astype('float'))   # Winner takes all
        # XmcLDA[ix,:] = np.dot(cWeight, cClasses)/nClasses
        XmcLDA[ix,:] = np.dot(cWinner, cClasses)
        XmcLDA[ix,3] = cWeight.max()/maxLDA                      # alpha encodes confidence

    # Plot the surface of probability
    plt.figure(facecolor='white', figsize=(10,3))
    plt.subplot(131)
    Zplot = XmcLDA.reshape(np.shape(x1)[0], np.shape(x1)[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        # 1-D projection: spread points with random jitter on the y axis.
        plt.scatter(Xrr, (np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1)
    plt.title('%s: LDA pC %.0f %%' % (titleStr, (ldaScores.mean()*100.0)))
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')

    # Transform the predictions in color codes (QDA panel)
    maxQDA = yPredQDA.max()
    for ix in range(nxm) :
        cWeight = yPredQDA[ix,:]                                 # Prob for all classes
        cWinner = ((cWeight == cWeight.max()).astype('float'))   # Winner takes all
        # XmcLDA[ix,:] = np.dot(cWeight, cClasses)/nClasses
        XmcQDA[ix,:] = np.dot(cWinner, cClasses)
        XmcQDA[ix,3] = cWeight.max()/maxQDA

    # Plot the surface of probability
    plt.subplot(132)
    Zplot = XmcQDA.reshape(np.shape(x1)[0], np.shape(x1)[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        plt.scatter(Xrr, (np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1)
    plt.title('%s: QDA pC %.0f %%' % (titleStr, (qdaScores.mean()*100.0)))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))

    # Transform the predictions in color codes (RF panel)
    maxRF = yPredRF.max()
    for ix in range(nxm) :
        cWeight = yPredRF[ix,:]                                  # Prob for all classes
        cWinner = ((cWeight == cWeight.max()).astype('float'))   # Winner takes all
        # XmcLDA[ix,:] = np.dot(cWeight, cClasses)/nClasses   # Weighted colors does not work
        XmcRF[ix,:] = np.dot(cWinner, cClasses)
        XmcRF[ix,3] = cWeight.max()/maxRF

    # Plot the surface of probability
    plt.subplot(133)
    Zplot = XmcRF.reshape(np.shape(x1)[0], np.shape(x1)[1], 4)
    plt.imshow(Zplot, zorder=0, extent=[-6, 6, -6, 6], origin='lower', interpolation='none', aspect='auto')
    if nClasses > 2:
        plt.scatter(Xrr[:,0], Xrr[:,1], c=cValGood, s=40, zorder=1)
    else:
        plt.scatter(Xrr, (np.random.rand(Xrr.size)-0.5)*12.0 , c=cValGood, s=40, zorder=1)
    plt.title('%s: RF pC %.0f %%' % (titleStr, (rfScores.mean()*100.0)))
    plt.xlabel('DFA 1')
    plt.ylabel('DFA 2')
    plt.axis('square')
    plt.xlim((-6, 6))
    plt.ylim((-6, 6))
    plt.show()

    # Results: mean/std of per-fold scores, as percentages.
    ldaScore = ldaScores.mean()*100.0
    qdaScore = qdaScores.mean()*100.0
    rfScore = rfScores.mean()*100.0
    ldaScoreSE = ldaScores.std() * 100.0
    qdaScoreSE = qdaScores.std() * 100.0
    rfScoreSE = rfScores.std() * 100.0

    print ("Number of classes %d. Chance level %.2f %%") % (nClasses, 100.0/nClasses)
    print ("%s LDA: %.2f (+/- %0.2f) %%") % (titleStr, ldaScore, ldaScoreSE)
    print ("%s QDA: %.2f (+/- %0.2f) %%") % (titleStr, qdaScore, qdaScoreSE)
    print ("%s RF: %.2f (+/- %0.2f) %%") % (titleStr, rfScore, rfScoreSE)
    return ldaScore, ldaScoreSE, qdaScore, qdaScoreSE, rfScore, rfScoreSE, nClasses
def analysis_results(options): """ Analyzes the results of the comparisons """ # Start marker for time measure start = time.time() print( "\n\t\t------------------------------------------------------------------------------------------------------------------------\n" ) print( "\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n" ) print( "\t\t------------------------------------------------------------------------------------------------------------------------\n" ) # Get the script path main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) toolbox_dir = os.path.join(main_path, 'diana/toolbox') # Check the directory of the profiles, comparisons and analysis data_dir = os.path.join(options.workspace, "profiles") check_directory(data_dir) results_dir = os.path.join(options.workspace, "comparisons") check_directory(results_dir) analysis_dir = os.path.join(options.workspace, "analysis") check_directory(analysis_dir) # Get the list of thresholds to create the profiles if options.threshold_list and fileExist(options.threshold_list): threshold_list = get_values_from_threshold_file(options.threshold_list) else: threshold_list = [1, 5, 10, 20, 50] # Do we consider Side Effects/ATC? 
if options.consider_se: consider_se = True else: consider_se = False # Get the names of the columns columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se) #-----------------------------------------------------# # PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME # #-----------------------------------------------------# pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl') pair2comb = cPickle.load(open(pair2comb_file)) ddi = sum(1 for x in pair2comb.values() if x == 1) non_ddi = sum(1 for x in pair2comb.values() if x == 0) print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi)) print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi)) output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv') if not fileExist(output_dataframe): # Create a data frame to store the results df = pd.DataFrame(columns=columns) # Obtain all the results subfolders of the results main folder results_dir_list = [ f for f in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, f)) ] for comparison in results_dir_list: drug_id1, drug_id2 = comparison.split('---') comparison_dir = os.path.join(results_dir, comparison) results_table = os.path.join(comparison_dir, 'results_table.tsv') # Add the Comb field (if it is drug combination or not) drug1 = drug_id1.split('_')[0].upper() drug2 = drug_id2.split('_')[0].upper() comparison_without_id = '{}---{}'.format(drug1, drug2) if comparison_without_id in pair2comb: combination_field = pair2comb[comparison_without_id] else: print( 'The comparison {} is not in the pair2comb dictionary!\n'. format(comparison_without_id)) print(pair2comb) sys.exit(10) if not fileExist(results_table): print('The comparison {} has not been executed properly!\n'. 
format(comparison)) sys.exit(10) results = get_results_from_table(results_table, columns, combination_field) df2 = pd.DataFrame([results], columns=columns, index=[comparison]) # Add the information to the main data frame df = df.append(df2) # Output the Pandas dataframe in a CSV file df.to_csv(output_dataframe) else: df = pd.read_csv(output_dataframe, index_col=0) #---------------------------# # REMOVE MISSING VALUES # #---------------------------# # Replace the None values in dcstructure by nan if 'None' in df['dcstructure']: df = df.replace(to_replace={'dcstructure': {'None': np.nan}}) # Remove the nan values in dcstructure df = df.dropna() # Count the number of drug combinations / non-drug combinations dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print('Number of drug combinations after removing missing values:\t{}\n'. format(num_dc)) print( 'Number of non-drug combinations after removing missing values:\t{}\n'. 
format(num_ndc)) #---------------------------# # IDENTIFY ME-TOO DRUGS # #---------------------------# me_too_dir = os.path.join(analysis_dir, 'me_too_drugs') create_directory(me_too_dir) me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv') me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv') me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl') me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl') if not fileExist(me_too_drug_pairs_file) or not fileExist( me_too_drug_comb_pairs_file): df_struc = df[['dcstructure']] df_struc = df_struc.astype(float) me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations( df_struc, columns, me_too_drugs_table, me_too_drug_combs_table) cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w')) cPickle.dump(me_too_drug_comb_pairs, open(me_too_drug_comb_pairs_file, 'w')) else: me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file)) me_too_drug_comb_pairs = cPickle.load( open(me_too_drug_comb_pairs_file)) # Process me-too drug combination pairs me_too_drug_combinations = set() drug_pair_to_me_too_times = {} for pair in me_too_drug_comb_pairs: drug_comb1, drug_comb2 = pair.split('___') me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2])) drug_pair_to_me_too_times.setdefault(drug_comb1, 0) drug_pair_to_me_too_times.setdefault(drug_comb2, 0) drug_pair_to_me_too_times[drug_comb1] += 1 drug_pair_to_me_too_times[drug_comb2] += 1 removed_drug_pairs = set() for pair in me_too_drug_comb_pairs: drug_comb1, drug_comb2 = pair.split('___') if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs: continue if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[ drug_comb2]: removed_drug_pairs.add(drug_comb1) else: removed_drug_pairs.add(drug_comb2) # Remove the drug pairs which appear in me-too pairs of drug pairs more times df = 
df.loc[~df.index.isin(list(removed_drug_pairs))] # Count the number of drug combinations / non-drug combinations dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print( 'Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n' .format(num_dc)) print( 'Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n' .format(num_ndc)) #-------------------------------------# # EVALUATE PERFORMANCE BY TARGETS # #-------------------------------------# img_dir = os.path.join(analysis_dir, 'figures') create_directory(img_dir) fig_format = 'png' tables_dir = os.path.join(analysis_dir, 'tables') create_directory(tables_dir) # Number of targets num_targets = [1, 3, 5, 7, 9] # Names of the methods if consider_se: if options.different_atc: types_analysis = [ 'dctargets', 'dcguild', 'dcstructure', 'dcse', 'random' ] types_analysis2 = ['dctargets', 'dcguild', 'dcstructure', 'dcse'] # Without random!! #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcSE', 'Random'] types_analysis_labels = [ 'Target', 'PPI', 'Structure', 'Side Effects', 'Random' ] else: types_analysis = [ 'dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse', 'random' ] types_analysis2 = [ 'dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse' ] # Without random!! #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcATC', 'dcSE', 'Random'] types_analysis_labels = [ 'Target', 'PPI', 'Structure', 'ATC', 'Side Effects', 'Random' ] else: types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'random'] types_analysis2 = ['dctargets', 'dcguild', 'dcstructure'] # Without random!! 
#types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'Random'] types_analysis_labels = ['Target', 'PPI', 'Structure', 'Random'] # Machine learning parameters repetitions = 25 # Number of repetititons n_fold = 2 # Number of folds min_num_dc_group = 10 greater_or_smaller = 'greater' classifier = 'SVC best 1' classifiers = { 'KNeighbors': KNeighborsClassifier(3), 'SVC': SVC(probability=True), 'SVC linear': SVC(kernel="linear", C=0.025), 'SVC rbf': SVC(gamma=2, C=1), 'DecisionTree': DecisionTreeClassifier(max_depth=5), 'RandomForest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 'MLP': MLPClassifier(alpha=1), 'AdaBoost': AdaBoostClassifier(), 'GaussianNB': GaussianNB(), 'QuadraticDiscr.': QuadraticDiscriminantAnalysis(), 'SVC best 1': SVC(kernel="rbf", gamma=0.01, C=100, probability=True), 'SVC best 2': SVC(kernel="rbf", gamma=0.1, C=1.0, probability=True) } if options.pca: pca_str = '_withPCA' else: pca_str = '_withoutPCA' if options.different_atc: atc_str = '_diff_ATC' else: atc_str = '' # Plot of distributions of AUC plot_auc_distribution = os.path.join( img_dir, 'numtargets_auc_distribution{}{}.{}'.format(atc_str, pca_str, fig_format)) # Plot of accuracy/sensitivity name acc_sens_dctargets = os.path.join( img_dir, 'numtargets_accsens_dctargets{}{}.{}'.format(atc_str, pca_str, fig_format)) acc_sens_dcguild = os.path.join( img_dir, 'numtargets_accsens_dcguild{}{}.{}'.format(atc_str, pca_str, fig_format)) acc_sens_dcstructure = os.path.join( img_dir, 'numtargets_accsens_dcstructure{}{}.{}'.format(atc_str, pca_str, fig_format)) acc_sens_dcatc = os.path.join( img_dir, 'numtargets_accsens_dcatc{}{}.{}'.format(atc_str, pca_str, fig_format)) acc_sens_dcse = os.path.join( img_dir, 'numtargets_accsens_dcse{}{}.{}'.format(atc_str, pca_str, fig_format)) # Results table results_table = os.path.join( tables_dir, 'numtargets_auc_table{}{}.txt'.format(atc_str, pca_str)) # Accuracy/Sensitivity results table prec_rec_table = os.path.join( 
tables_dir, 'numtargets_accsens_table{}{}.txt'.format(atc_str, pca_str)) # File with results of Mann Whitney tests mannwhitney_file = os.path.join( tables_dir, 'numtargets_mannwhitney{}{}.txt'.format(atc_str, pca_str)) # Get the targets file drugbank_to_targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl') drugbank_to_targets = cPickle.load(open(drugbank_to_targets_file)) # Get the ATC file drugbank_to_atcs_file = os.path.join(toolbox_dir, 'drugbank_to_atcs.pcl') drugbank_to_atcs = cPickle.load(open(drugbank_to_atcs_file)) # Get the DIANA IDs file diana_id_to_drugbank_file = os.path.join(toolbox_dir, 'diana_id_to_drugbank.pcl') diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file)) #-------------------------------------------------# # SELECT DRUG COMBINATIONS WITH DIFFERENT ATC # #-------------------------------------------------# if options.different_atc: selected_rows = [] for index, row in df.iterrows(): (drug_id1, drug_id2) = index.split('---') drug1 = diana_id_to_drugbank[drug_id1].upper() drug2 = diana_id_to_drugbank[drug_id2].upper() atcs_drug1 = set([atc[0] for atc in drugbank_to_atcs[drug1]]) atcs_drug2 = set([atc[0] for atc in drugbank_to_atcs[drug2]]) intersection = atcs_drug1 & atcs_drug2 if len(intersection) == 0: selected_rows.append(index) df = df.ix[selected_rows] dc_data = df[df['combination'] == 1] ndc_data = df[df['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print( 'Num drug combinations after removing the ones with same ATC in training: {}' .format(num_dc)) print( 'Num non-drug combinations after removing the ones with same ATC in training: {}' .format(num_ndc)) analysis_results = { } # Defining the dictionary that will store the results if consider_se: dct_columns, dcg_columns, dcs_columns, dcatc_columns, dcse_columns = diana_analysis.obtain_method_to_columns( threshold_list, ATC_SE=consider_se) else: dct_columns, dcg_columns, dcs_columns = diana_analysis.obtain_method_to_columns( 
threshold_list, ATC_SE=consider_se) for num in num_targets: selected_rows = [] for index, row in df.iterrows(): (drug_id1, drug_id2) = index.split('---') drug1 = diana_id_to_drugbank[drug_id1].upper() drug2 = diana_id_to_drugbank[drug_id2].upper() if greater_or_smaller == 'greater': if len(drugbank_to_targets[drug1]) >= num and len( drugbank_to_targets[drug2]) >= num: selected_rows.append(index) elif greater_or_smaller == 'smaller': if len(drugbank_to_targets[drug1]) <= num and len( drugbank_to_targets[drug2]) <= num: selected_rows.append(index) else: print( '\nERROR: Please, for the parameter greater_or_smaller, select \'greater\' or \'smaller\'\n' ) sys.exit(10) df_tar = df.ix[selected_rows] if consider_se: if options.different_atc: list_methods = [['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['dcse', dcse_columns], ['random', columns]] else: list_methods = [['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['dcatc', dcatc_columns], ['dcse', dcse_columns], ['random', columns]] else: list_methods = [['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['random', columns]] for method, columns_method in list_methods: print('Evaluating {} targets with method {}\n'.format(num, method)) #------------------------------------------------------------------# # SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA # #------------------------------------------------------------------# if options.pca: variance_cut_off = 0.01 num_components = 0 df_method = df_tar[columns_method] df_raw = df_method.drop('combination', axis=1) raw_columns = copy.copy(columns_method) raw_columns.remove('combination') pca = PCA(n_components=None) pca.fit(df_raw) values_trans = pca.transform(df_raw) explained_variance = pca.explained_variance_ratio_ for column, var in sorted(zip(raw_columns, explained_variance), key=lambda x: x[1], reverse=True): #print(column, var) if var > 
variance_cut_off: num_components += 1 if num_components < len(raw_columns): print('Number of features:\t{}\n'.format(len(raw_columns))) print( 'Reduction to {} components\n'.format(num_components)) pca = PCA(n_components=num_components) pca.fit(df_raw) values_trans = pca.transform(df_raw) indexes = df_method.index.values df_trans = pd.DataFrame.from_records(values_trans, index=indexes) df_comb = df_method[['combination']] df_new = pd.concat([df_trans, df_comb], axis=1) df_method = df_new else: # Manually introduced features guild_thresholds = [1, 5] rank_scoring = ['spearman', 'dot_product'] list_scoring = ['jaccard'] if method == 'Combination' or method == 'random': selected_columns = diana_analysis.obtain_columns_best_features( guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se) else: selected_columns = diana_analysis.obtain_columns_best_features_for_specific_method( method, guild_thresholds, rank_scoring, list_scoring) # Remove ATC columns if different ATC if options.different_atc and consider_se: selected_columns = [ col for col in selected_columns if col not in dcatc_columns or col == 'combination' ] print('Selected columns: {}\n'.format( ', '.join(selected_columns))) print('Number of selected features: {}\n'.format( len(selected_columns) - 1)) # We take away the combinations column # Define the new table with the selected columns df_method = df_tar[selected_columns] dc_data = df_method[df_method['combination'] == 1] ndc_data = df_method[df_method['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) #------------------------------------------------------------------# dc_data = df_method[df_method['combination'] == 1] ndc_data = df_method[df_method['combination'] == 0] num_dc = len(dc_data.index) num_ndc = len(ndc_data.index) print( 'Building {} repetition groups of {} (same) DC and {} (different) non-DC' .format(repetitions, num_dc, num_dc)) ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length( ndc_data, 
repetitions, num_dc ) # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times mean_aucs = [ ] # Here we will store the means of AUCs from the cross-validations std_aucs = [ ] # Here we will store the standard deviations of the AUCs from the cross-validations all_aucs = [] # Here we will store ALL the AUCs all_probs = [] # Here we store all the probabilities and labels num_repetitions = 0 for ndc_data_equal in ndc_repetitions: num_repetitions += 1 num_items_group = int( float(num_dc) / float(n_fold) ) # Calculate the number of items in each group of the cross-validation if num_repetitions == 1: print( 'Building {} fold groups of {} DC and {} non-DC x {} repetitions' .format(n_fold, num_items_group, num_items_group, repetitions)) dc_groups = diana_analysis.obtain_n_groups_of_k_length( dc_data, n_fold, num_items_group, me_too_drug_combinations ) # Defining the drug combination groups in each cross-validation step ndc_groups = diana_analysis.obtain_n_groups_of_k_length( ndc_data_equal, n_fold, num_items_group, me_too_drug_combinations ) # Defining the non-drug combination groups in each cross-validation step merged_groups = [ pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups) ] if method == 'random': #mean, var, std, list_auc = run_nfold_crossvalidation_random(n_fold, merged_groups, classifiers[classifier]) mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy( n_fold, merged_groups, classifiers[classifier]) else: mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob( n_fold, merged_groups, classifiers[classifier]) mean_aucs.append(mean) std_aucs.append(std) all_aucs = all_aucs + list_auc all_probs = all_probs + list_prob final_mean = np.mean(all_aucs) #final_mean = np.mean(mean_aucs) std = np.std(all_aucs) mean_std = np.mean(std_aucs) std_means = np.std(mean_aucs) print('FINAL MEAN: {}'.format(final_mean)) print('STD: {}\n'.format(std)) 
#print('MEAN of STD: {}'.format(mean_std)) # Store the distribution of AUCs in the dictionary analysis_results.setdefault(num, {}) analysis_results[num].setdefault(method, {}) analysis_results[num][method]['all_aucs'] = all_aucs analysis_results[num][method]['all_probs'] = all_probs analysis_results[num][method]['mean'] = final_mean analysis_results[num][method]['std'] = std analysis_results[num][method]['num_dc'] = num_dc #------------------------------------# # PLOT PRECISION VS. SENSITIVITY # #------------------------------------# analysis_results = plot_precision_sensitivity(analysis_results, 'dctargets', num_targets, acc_sens_dctargets) analysis_results = plot_precision_sensitivity(analysis_results, 'dcguild', num_targets, acc_sens_dcguild) analysis_results = plot_precision_sensitivity(analysis_results, 'dcstructure', num_targets, acc_sens_dcstructure) if consider_se: if options.different_atc: analysis_results = plot_precision_sensitivity( analysis_results, 'dcse', num_targets, acc_sens_dcse) else: analysis_results = plot_precision_sensitivity( analysis_results, 'dcse', num_targets, acc_sens_dcse) analysis_results = plot_precision_sensitivity( analysis_results, 'dcatc', num_targets, acc_sens_dcatc) #----------------------------------------------------# # PLOT DISTRIBUTION OF AUC PER NUMBER OF TARGETS # #----------------------------------------------------# plot_auc_distributions(analysis_results, num_targets, types_analysis, types_analysis_labels, plot_auc_distribution, fig_format=fig_format, consider_se=consider_se, different_atc=options.different_atc) #plot_violin(analysis_results, num_targets, types_analysis, types_analysis_labels, plot_auc_distribution, fig_format=fig_format, consider_se=consider_se) #--------------------------------------------------------# # TABLE OF DISTRIBUTION OF AUC PER NUMBER OF TARGETS # #--------------------------------------------------------# with open(results_table, 'w') as results_table_fd: # Header results_table_fd.write(' ') 
for method in types_analysis_labels: results_table_fd.write('\t{}\t \t '.format(method)) results_table_fd.write('\n') for num in num_targets: results_table_fd.write('{}'.format(num)) for method in types_analysis: mean = analysis_results[num][method]['mean'] std = analysis_results[num][method]['std'] num_dc = analysis_results[num][method]['num_dc'] results_table_fd.write('\t{}\t{}\t{}'.format( mean, std, num_dc)) results_table_fd.write('\n') #----------------------------------------# # TABLE OF PRECISION VS. SENSITIVITY # #----------------------------------------# with open(prec_rec_table, 'w') as prec_rec_table_fd: # Header prec_rec_table_fd.write(' ') for method in types_analysis2: prec_rec_table_fd.write('\t{}\t '.format(method)) prec_rec_table_fd.write('\n') for num in num_targets: prec_rec_table_fd.write('{}'.format(num)) for method in types_analysis2: cut_off = analysis_results[num][method]['cut_off'] value = analysis_results[num][method]['value'] prec_rec_table_fd.write('\t{}\t{}'.format(cut_off, value)) prec_rec_table_fd.write('\n') #-------------------------------------------------------------------# # TABLE OF COMPARISON OF AUC DISTRIBUTIONS USING MANN WHITNEY U # #-------------------------------------------------------------------# with open(mannwhitney_file, 'w') as mannwhitney_fd: mann_results = {} mannwhitney_fd.write(' \t ') for method in types_analysis_labels: mannwhitney_fd.write('\t{}'.format(method)) mannwhitney_fd.write('\n') # Perform the comparisons for num in num_targets: mann_results.setdefault(num, {}) for method1 in types_analysis: mann_results[num].setdefault(method1, {}) for method2 in types_analysis: if method1 == method2: mann_results[num][method1][method2] = '-' else: method1_dist = analysis_results[num][method1][ 'all_aucs'] method2_dist = analysis_results[num][method2][ 'all_aucs'] stat, pval = scipy.stats.mannwhitneyu( method1_dist, method2_dist) mann_results[num][method1][method2] = [stat, pval] # Write the table of crossings for 
num in num_targets: for method1 in types_analysis: mannwhitney_fd.write('{}\t{}'.format(num, method1)) for method2 in types_analysis: if method1 == method2: mannwhitney_fd.write('\t-') else: stat, pval = mann_results[num][method1][method2] mannwhitney_fd.write('\t{}, {:.2e}'.format(stat, pval)) mannwhitney_fd.write('\n') # End marker for time end = time.time() print( '\n DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n' .format(end - start, (end - start) / 60)) return
# Fit LDA and QDA on two iris features and evaluate both classifiers on a
# dense mesh grid so their decision surfaces can be plotted.
le = LabelEncoder()

# Using only petal length and width as features
X = dataset[:, 2:4]
Y = dataset[:, -1]

# Encode class labels as numbers
Y = le.fit_transform(Y)

# Convert features to float as they are read as string.
# FIX: use the builtin `float` — the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24, so the old spelling raises AttributeError.
X = X.astype(float)

# Initialising the classifiers
lda = LDA()
qda = QDA()

# Fit the data to the classifiers.
lda.fit(X, Y)
qda.fit(X, Y)

# Creating variables that takes value that cover range of the training data
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.03),
                     np.arange(y_min, y_max, 0.01))

# Predict a value for the points obtained above
Z = lda.predict(np.c_[xx.ravel(), yy.ravel()])
Zq = qda.predict(np.c_[xx.ravel(), yy.ravel()])
Zq = Zq.reshape(xx.shape)
clf.score(X_test, y_test))) #%% Test with QDA from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA # Splits the data into a training set and randomized test set with accompanying labels X_train, X_test, y_train, y_test = train_test_split(data, classes, test_size=0.2) # Scales the data sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) 2 clf = QDA() clf.fit(X_train, y_train) print('Accuracy of QDA classifier on training set: {:.2f}'.format( clf.score(X_train, y_train))) print('Accuracy of QDA classifier on test set: {:.2f}'.format( clf.score(X_test, y_test))) #%% Test with Naive Bayes from sklearn.naive_bayes import GaussianNB as GNB # Splits the data into a training set and randomized test set with accompanying labels X_train, X_test, y_train, y_test = train_test_split(data, classes, test_size=0.2) # Scales the data sc = StandardScaler()
# Accumulators for classifiers 4 and 5 — presumably running accuracy sums
# over the random-state sweep below; TODO confirm against the (unseen)
# code that updates them.
as_4 = 0.0
as_5 = 0.0
max_value = 300
# Repeat the train/validate split for many random seeds to average out
# split-dependent variance in the comparison.
for random_state in np.arange(1, max_value):
    # Confusion Matrix
    # First, split data in training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, class_label, random_state=random_state)
    # Initialize Classificators
    clf_1 = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
    clf_2 = AdaBoostClassifier(n_estimators=100)
    # NOTE(review): clf_3 is a regressor (LinearRegression) mixed in with
    # classifiers — its .predict() returns continuous values; confirm this
    # is intentional.
    clf_3 = linear_model.LinearRegression()
    clf_4 = LinearDiscriminantAnalysis()
    clf_5 = QuadraticDiscriminantAnalysis()
    # Test classificator with training and validation data
    y_pred_1 = clf_1.fit(X_train, y_train).predict(X_test)
    y_pred_2 = clf_2.fit(X_train, y_train).predict(X_test)
    y_pred_3 = clf_3.fit(X_train, y_train).predict(X_test)
    y_pred_4 = clf_4.fit(X_train, y_train).predict(X_test)
    y_pred_5 = clf_5.fit(X_train, y_train).predict(X_test)
    # Commented-out decision-function grid evaluation (kept as-is; the
    # closing quotes of this block lie beyond this fragment).
    '''
    Z_1 = clf_1.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z_2 = clf_2.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z_3 = clf_3.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z_3 = Z_3.reshape(xx.shape)
    Z_4 = clf_4.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z_5 = clf_5.decision_function(np.c_[xx.ravel(), yy.ravel()])
class road_estimation: def __init__(self, model_selection): self._train_data, self._train_targets, self._valid_data, self._valid_targets, self._test_data, self._test_targets = ( data_load() ) self._model_selection = model_selection self._classifier = [] def train(self): if self._model_selection == "svm": # selected the svc in svm self._classifier = svm.SVC() elif self._model_selection == "nb": self._classifier = GaussianNB() elif self._model_selection == "knn": # parameter n_jobs can be set to -1 to enable parallel calculating self._classifier = KNeighborsClassifier(n_neighbors=7) elif self._model_selection == "ada": # Bunch of parameters, n_estimators, learning_rate self._classifier = AdaBoostClassifier() elif self._model_selection == "rf": # many parameters including n_jobs self._classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) elif self._model_selection == "qda": # complicated array like parameters, perhaps leave it default self._classifier = QuadraticDiscriminantAnalysis() else: print "Please refer to one classifier" self._classifier.fit(self._train_data, self._train_targets) # predict on valid data prediction_valid = self._classifier.predict(self._valid_data) # print validation result for selected model. print ( "Classification report for classifier %s on valid_data:\n%s\n" % (self._model_selection, metrics.classification_report(self._valid_targets, prediction_valid)) ) def test(self): # predict on test data prediction_test = self._classifier.predict(self.test_data) # print test result for selected model. 
print ( "Classification report for classifier %s on test_data:\n%s\n" % (self._model_selection, metrics.classification_report(self._test_targets, prediction_test)) ) def showPredictionImage(self): f = Feature() f.loadImage("um_000000.png") f.extractFeatures() fea_matrix = f.getFeaturesVectors() predict = self._classifier.predict(fea_matrix) image = np.copy(f.image) num_superpixels = np.max(f.superpixel) + 1 for i in xrange(0, num_superpixels): indices = np.where(f.superpixel == i) if predict[i] == 1: image[indices[0], indices[1], 0] = 1 image[indices[0], indices[1], 1] = 1 image[indices[0], indices[1], 2] = 0 plt.imshow(image) plt.show() # show prediction image with superpixels plt.imshow(mark_boundaries(image, superpixels)) plt.show()
def fit(self, _X_train, _y_train, _X_test, _y_test):
    """Train every candidate classifier on extracted 'Statement' features
    and keep the one with the best test accuracy in self.clf.

    The six LIAR-style truthfulness labels are mapped to integers 0-5.
    Prints per-classifier accuracy and training time, then the winner.
    """
    # Both frames must carry a 'Statement' column; report which is missing.
    if 'Statement' in _X_train.columns and 'Statement' in _X_test.columns:
        X_train = _X_train.copy()
        X_test = _X_test.copy()
    elif 'Statement' in _X_train.columns:
        print('Statement column not found in the testing data.')
        return
    elif 'Statement' in _X_test.columns:
        print('Statement column not found in the training data.')
        return
    else:
        # BUG FIX: previously there was no final branch, so when neither
        # frame had the column, control fell through with X_train/X_test
        # unbound and raised NameError below.
        print('Statement column not found in the training and testing data.')
        return
    # Feature extraction (project helper).
    X_train = self.extract(X_train)
    X_test = self.extract(X_test)
    # Encode the ordinal truthfulness labels as 0 (pants-fire) .. 5 (true).
    y_train = _y_train.replace('pants-fire', 0).replace(
        'false', 1).replace('barely-true',
                            2).replace('half-true',
                                       3).replace('mostly-true',
                                                  4).replace('true', 5)
    y_test = _y_test.replace('pants-fire', 0).replace('false', 1).replace(
        'barely-true', 2).replace('half-true',
                                  3).replace('mostly-true',
                                             4).replace('true', 5)
    names = [
        "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
        "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes", "QDA"
    ]
    classifiers = [
        KNeighborsClassifier(2),
        SVC(kernel="linear", C=0.025, probability=True, random_state=0),
        SVC(gamma=2, C=1, probability=True, random_state=0),
        DecisionTreeClassifier(max_depth=5, random_state=0),
        RandomForestClassifier(max_depth=5,
                               n_estimators=10,
                               max_features=1,
                               random_state=0),
        MLPClassifier(alpha=1, max_iter=1000, random_state=0),
        AdaBoostClassifier(random_state=0),
        GaussianNB(),
        QuadraticDiscriminantAnalysis()
    ]
    # Fit each classifier, time it, and track the best test accuracy.
    max_score = 0.0
    max_class = ''
    for name, clf in zip(names, classifiers):
        start_time = time()
        clf.fit(X_train, y_train)
        score = 100.0 * clf.score(X_test, y_test)
        print(
            'Classifier = %s, Score (test, accuracy) = %.2f,' %
            (name, score),
            'Training time = %.2f seconds' % (time() - start_time))
        if score > max_score:
            self.clf = clf
            max_score = score
            max_class = name
    print(80 * '-')
    print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %
          (max_class, max_score))
# Tail of plot_ellipse (definition begins outside this fragment): attach
# the ellipse artist and hide the axis ticks.
splot.add_artist(ell)
splot.set_xticks(())
splot.set_yticks(())


def plot_lda_cov(lda, splot):
    # LDA shares one covariance matrix across classes, so both class
    # ellipses are drawn from lda.covariance_.
    plot_ellipse(splot, lda.means_[0], lda.covariance_, 'red')
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    # QDA fits one covariance per class: covariances_[0] / covariances_[1].
    plot_ellipse(splot, qda.means_[0], qda.covariances_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariances_[1], 'blue')


###############################################################################
# Fit and plot both discriminant analyses on the fixed-covariance and
# varying-covariance synthetic datasets.
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    # NOTE(review): `store_covariances` was renamed to `store_covariance`
    # in scikit-learn 0.19 and the old spelling removed in 0.21 — this
    # call only works on older sklearn versions; confirm the pinned version.
    qda = QuadraticDiscriminantAnalysis(store_covariances=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
plt.show()
def main():
    """End-to-end iris-style pipeline: impute or drop missing values based
    on a chi-square MCAR test, drop collinear columns, select features by
    chi^2, compare a battery of classifiers by 10-fold CV, then evaluate
    and visualise an ensemble VotingClassifier.

    Reads the CSV path from sys.argv[1]; mutates the module-level
    colNames/colNums lists when dropping columns.
    """
    df = pandas.read_csv(sys.argv[1])
    generateSummary(df)
    df.columns = colNames

    # Means with NaNs — per-class means plus an overall 'mean' row.
    means_with_nan = df.groupby(
        colNames[colNums[4]]).apply(lambda x: x.mean())
    means_with_nan = means_with_nan.append(pandas.DataFrame(
        means_with_nan.mean(numeric_only=True)).T)
    means_with_nan.rename(index={0: 'mean'}, inplace=True)
    print("\nMeans with NaN:")
    print(means_with_nan)
    means_with_nan = means_with_nan.values.flatten()

    # Means without NaNs — same summary computed on complete rows only.
    means_without_nan = df.dropna().groupby(
        colNames[colNums[4]]).apply(lambda x: x.mean())
    means_without_nan = means_without_nan.append(pandas.DataFrame(
        means_without_nan.mean(numeric_only=True)).T)
    means_without_nan.rename(index={0: 'mean'}, inplace=True)
    print("\nMeans without NaN:")
    print(means_without_nan)
    means_without_nan = means_without_nan.values.flatten()

    # MCAR decision: compare the two mean vectors with a chi-square test.
    cs = chisquare(means_with_nan, means_without_nan)
    print("\nChi-square of (means_with_nan,means_without_nan) p-value:",
          cs.pvalue)
    if(cs.pvalue > 0.05):
        # CMAR - replace NaN by class mean
        print("P-value was not significant, data is MCAR. Imputing missing values")
        df[colNames[colNums[0]]] = df.groupby(colNames[colNums[4]])[
            colNames[colNums[0]]].apply(lambda x: x.fillna(x.mean()))
        df[colNames[colNums[1]]] = df.groupby(colNames[colNums[4]])[
            colNames[colNums[1]]].apply(lambda x: x.fillna(x.mean()))
        df[colNames[colNums[2]]] = df.groupby(colNames[colNums[4]])[
            colNames[colNums[2]]].apply(lambda x: x.fillna(x.mean()))
        df[colNames[colNums[3]]] = df.groupby(colNames[colNums[4]])[
            colNames[colNums[3]]].apply(lambda x: x.fillna(x.mean()))
        print("\nMeans after imputation:")
        means_after_impute = df.groupby(
            colNames[colNums[4]]).apply(lambda x: x.mean())
        means_after_impute = means_after_impute.append(
            pandas.DataFrame(means_after_impute.mean(numeric_only=True)).T)
        means_after_impute.rename(index={0: 'mean'}, inplace=True)
        print(means_after_impute)
    else:
        # NMAR - drop rows with NaN
        print("P-value was significant, data is NMAR. Dropping rows with missing values")
        df.dropna(inplace=True)

    # Drop highly correlated control variables (|corr| > 0.95 in the upper
    # triangle of the absolute correlation matrix).
    # NOTE(review): numpy.bool is a deprecated alias (removed in NumPy
    # 1.24); this line requires an older NumPy — confirm the pinned version.
    corr_matrix = df.corr().abs()
    upper_triangle = corr_matrix.where(numpy.triu(
        numpy.ones(corr_matrix.shape), k=1).astype(numpy.bool))
    print("\nCalculating correlation between control variables:")
    print(upper_triangle)
    to_drop = [column for column in upper_triangle.columns if any(
        upper_triangle[column] > 0.95)]
    print("\nDropping variable with high multicollinearity: ", to_drop)
    df = df.drop(to_drop, axis=1)
    # NOTE(review): this mutates colNames while iterating it, and always
    # removes the last element of colNums regardless of which column was
    # dropped — looks fragile; verify intent.
    for i, x in enumerate(colNames):
        if x in to_drop:
            colNames.remove(x)
            colNums.remove(len(colNums)-1)

    # Create A/B split of data
    array = df.values
    X = array[:, 0:3]
    Y = array[:, 3]
    validation_size = 0.20
    seed = 1
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)

    # Select variables to predict response (top-2 features by chi^2 score).
    chi2_selector = SelectKBest(score_func=chi2, k=2)
    chi2_selector.fit(X_train, Y_train)
    chi2_selector.fit_transform(X_train, Y_train)
    print("\nFeature Importance (Chi^2): " + str(chi2_selector.scores_))
    feature_names = df.iloc[:, chi2_selector.get_support()].columns.values
    print("Selecting features: " + str(feature_names))
    X = chi2_selector.transform(X)
    X_train = chi2_selector.transform(X_train)
    X_validation = chi2_selector.transform(X_validation)

    # Instiantiate model list
    scoring = 'accuracy'
    models = []
    models.append(('LR', LogisticRegression(
        solver='liblinear', multi_class='ovr')))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('MLP', MLPClassifier(solver='lbfgs')))
    models.append(('RFC', RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1)))
    models.append(('GPC', GaussianProcessClassifier(1.0 * RBF(1.0))))
    models.append(('ABC', AdaBoostClassifier()))
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
    models.append(('SDG', SGDClassifier(max_iter=1000, tol=0.05)))
    models.append(('GBC', GradientBoostingClassifier()))
    models.append(('NSVC', NuSVC(probability=True, gamma='auto')))

    # evaluate each model in turn via 10-fold cross-validation.
    results = []
    names = []
    print("\nEvaluating classifiers:")
    print("name,accuracy,std_dev")
    classifier_performance_dict = {}
    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, random_state=seed)
        cv_results = model_selection.cross_val_score(
            model, X_train, Y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s,%f,%f" % (name, cv_results.mean(), cv_results.std())
        classifier_performance_dict[name] = cv_results.mean()
        print(msg)

    # Combine classifiers using a vote
    # NOTE(review): `kfold` here is the leftover instance from the last
    # loop iteration — works, but intent should be confirmed.
    classifier = VotingClassifier(models)
    classifier.fit(X_train, Y_train)
    results = model_selection.cross_val_score(
        classifier, X_train, Y_train, cv=kfold)

    # Execute some tests: predict on the validation split and tabulate
    # (feature1, feature2, real-class, predicted-class) rows.
    predictions = classifier.predict(X_validation)
    joined_testdata = numpy.concatenate(
        (X_validation, numpy.reshape(Y_validation, (-1, 1))), axis=1)
    joined_testdata_w_predictions = numpy.concatenate(
        (joined_testdata, numpy.reshape(predictions, (-1, 1))), axis=1)
    print("\nEnselble (vote) classifier validation test results:")
    print(feature_names[0]+","+feature_names[1]+",real-class,predicted-class")
    for row in joined_testdata_w_predictions:
        if row[2] != row[3]:
            print(row, " <== Misclassified")
        else:
            print(row)
    print("Accuracy: " + str(accuracy_score(Y_validation, predictions)))
    print(classification_report(Y_validation, predictions))

    # Visualise results: scatter the two selected features coloured by
    # species, with misclassified validation points marked 'x'.
    colors = {'virginica': 'red', 'setosa': 'blue', 'versicolor': 'green'}
    plt.scatter(df[feature_names[0]], df[feature_names[1]],
                c=df[colNames[colNums[3]]].apply(lambda x: colors[x]))
    for row in joined_testdata_w_predictions:
        if row[2] != row[3]:
            plt.scatter(row[0], row[1], c='black', marker='x')
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[1])
    plt.title("Iris dataset")
    plt.show()
# Fit several classifiers on the same train/test split and record each
# one's test accuracy (observed values noted in the trailing comments).
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train)
log_acc = accuracy_score(y_pred, y_test)  #0.64 highest

clf = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf_acc = accuracy_score(y_pred, y_test)  #0.61

neigh = KNeighborsClassifier(n_neighbors=13).fit(X_train, y_train)
y_pred = neigh.predict(X_test)
nn_acc = accuracy_score(y_pred, y_test)  #0.61

quad = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred = quad.predict(X_test)
quad_acc = accuracy_score(y_pred, y_test)  # 0.19 very low

ldaC = LDA(solver='lsqr', shrinkage='auto').fit(X_train, y_train)  #LDA with shrinkage
y_pred = ldaC.predict(X_test)
lda_acc = accuracy_score(y_pred, y_test)  #0.58

#########################################
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20
# (replaced by sklearn.model_selection) — these imports pin an old version.
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
import matplotlib.pyplot as plt


def calc_params(X, y, clf, param_values, param_name, K, metric = 'accuracy'):
    # Docstring continues beyond this fragment (truncated here).
    '''This function takes the classfier, the training data and labels, the name of the parameter to vary, a list of values to vary by, and a number of folds needed for
    # Tail of a feature-suffix list opened before this fragment: each entry
    # is a .npy filename suffix for a precomputed feature array.
    '_delta_sum.npy', '_abspwr.npy', '_abspwr_sum.npy', '_cog.npy',
    '_spec.npy'
]
""" -----------0 ------------------1 ------------2--------------3-----------4---------5---------------6--------------7-----------8--------------9-----------------10------------11-------------12---------------13"""
# p indexes which feature suffix to load; knn is a candidate neighbour count.
p = 1
knn = 5
#clf = KNeighborsClassifier(n_neighbors=knn)
#clf = KNeighborsClassifier(n_neighbors=3)  #
#clf = KNeighborsClassifier(n_neighbors=7)  #
#clf = svm.SVC(kernel = 'rbf', C =3.0, decision_function_shape='ovr')  #
#clf = naive_bayes.GaussianNB()  #
# Active classifier choice (alternatives kept commented out above/below).
clf = QuadraticDiscriminantAnalysis()  #
#clf = svm.SVC(kernel = 'poly', C =1.0, degree = 5, decision_function_shape='ovr')
# Load the selected feature array for each happy/calm/angry recording into
# the h/c/a dicts (x, y, z counts and the s1_* path lists are defined
# outside this fragment).
for i in range(0, x):
    h["happy{0}".format(i)] = np.load(s1_happy[i] + feature[p])
# Get Sad
for j in range(0, y):
    c["calm{0}".format(j)] = np.load(s1_calm[j] + feature[p])
for k in range(0, z):
    a["angry{0}".format(k)] = np.load(s1_angry[k] + feature[p])
"""-----------------Stack the Data ---------------------------------------------"""
train = np.vstack((a['angry0'], a['angry1']))
def train_QDA(X_train, Y_train, X_test, reg_param):
    """Fit a regularized QDA classifier and predict labels for the test set.

    Parameters
    ----------
    X_train, Y_train : training features and labels.
    X_test : features to predict.
    reg_param : covariance regularization passed to
        QuadraticDiscriminantAnalysis.

    Returns
    -------
    Predicted labels for X_test.
    """
    model = QuadraticDiscriminantAnalysis(reg_param=reg_param)
    model.fit(X_train, Y_train)
    return model.predict(X_test)
# Train LDA on the k-best-selected features and report test accuracy.
lda = LinearDiscriminantAnalysis()
lda.fit(kbest_train_numpy, y_train)
#lda.fit(x_train, y_train)
#myprediction1 = lda.predict(x_test)
predict_1 = lda.predict(kbest_test_numpy)
#Use score to get the accuracy of the model
print("Linear Discriminant Analysis Accuracy...")
# score = lda.score(x_test, y_test)
score = accuracy_score(y_test, predict_1)
print(score * 100, "%")
##################################################################
#Quadratic Discriminant Analysis — same pipeline with QDA.
print("Quadratic Discriminant Analysis")
qda = QuadraticDiscriminantAnalysis()
qda.fit(kbest_train_numpy, y_train)
# qda.fit(x_train, y_train)
#predict_2 = qda.predict(x_test)
predict_2 = qda.predict(kbest_test_numpy)
# print myprediction2
#Use score to get the accuracy of the model
print("Quadratic Discriminant Analysis Accuracy...")
# score = qda.score(x_test, y_test)
score = accuracy_score(y_test, predict_2)
print(score * 100, "%")
##################################################################
#Logistic Regression (continues beyond this fragment)
print("Logistic Regression")
plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') #define X y X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) #smoteen sme = SMOTEENN(random_state=42) os_X,os_y = sme.fit_sample(X_train,y_train) #QDA clf_QDA = QuadraticDiscriminantAnalysis(store_covariances=True) clf_QDA.fit(os_X, os_y) y_true, y_pred = y_test, clf_QDA.predict(X_test) #F1_score, precision, recall, specifity, G score print "F1_score : %.4g" % metrics.f1_score(y_true, y_pred) print "Recall : %.4g" % metrics.recall_score(y_true, y_pred) recall = metrics.recall_score(y_true, y_pred) print "Precision : %.4g" % metrics.precision_score(y_true, y_pred) #Compute confusion matrix cnf_matrix = confusion_matrix(y_test,y_pred) np.set_printoptions(precision=2) print "Specifity: " , float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1]) specifity = float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1]) print "G score: " , math.sqrt(recall/ specifity)
# Finish one-hot preparation: drop the raw categorical columns that were
# replaced by encoded columns joined earlier (outside this fragment).
newdf.drop('protocol_type', axis=1, inplace=True)
newdf.drop('service', axis=1, inplace=True)
newdf_test = df_test.join(testdf_cat_data)
newdf_test.drop('flag', axis=1, inplace=True)
newdf_test.drop('protocol_type', axis=1, inplace=True)
newdf_test.drop('service', axis=1, inplace=True)
print(newdf_test['label'].value_counts())

# Split into float feature matrices and label vectors for train/test.
features = newdf[final_columns].astype(float)
features1 = newdf_test[final_columns].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

# Train QDA and time both fitting and prediction.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis()
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print ("Classifier trained in {} seconds.".format(round(tt, 3)))
t0 = time()
pred = clf.predict(features1)
tt = time() - t0
print ("Predicted in {} seconds".format(round(tt, 3)))

# Report accuracy and an actual-vs-predicted cross-tabulation.
from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, lab1)
print ("Accuracy is {}.".format(round(acc, 4)))
print(pd.crosstab(lab1, pred,
                  rownames=['Actual attacks'],
                  colnames=['Predicted attacks']))
#Classifier trained in 4.315 seconds.
#Predicted in 0.349 seconds
# Build ground-truth labels: 27 samples in three consecutive groups of 9,
# labelled 1, 2 and 3.
labels = []
for i in range(0, 9):
    labels.append(1)
for i in range(9, 18):
    labels.append(2)
for i in range(18, 27):
    labels.append(3)
'''
# Creation of random labels
for i in range(0,27):
    labels.append(int(random.random() * 3) + 1)
print (labels)
'''
# QDA model — fit on the (externally computed) components `comps`.
qda = QuadraticDiscriminantAnalysis()
qda.fit(comps, labels)
# MCC Calculation on the training data itself (resubstitution).
y_pred = qda.predict(comps)
#print(labels)
#print(y_pred)
mcc = multimcc(labels, y_pred)
print("MCC=" + str(mcc))
# Commented-out contour-plotting code; the closing quotes lie beyond this
# fragment.
'''
# Plotting QDA contour
nx, ny = 200, 100
x_min, x_max = np.amin(comps[:,0]), np.amax(comps[:,0])
y_min, y_max = np.amin(comps[:,1]), np.amax(comps[:,1])
xx, yy = np.meshgrid(np.linspace(x_min, x_max, nx),np.linspace(y_min, y_max, ny))
def QuadDA(X_train, y_train, X_test, y_test):
    """Fit a QDA classifier on the training split and return its mean
    accuracy on the held-out test split."""
    model = QDA()
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)
def vis(short_patient, thresh_file=None):
    """
    Visualize patient classification results for different classifiers.

    Computes (or loads cached) precision/recall values per threshold,
    then, per emotion, plots the threshold-sweep curve alongside the
    precision-recall curves of several scikit-learn classifiers.

    :param short_patient: Patient to visualize
    :type short_patient: str
    :param thresh_file: Optional, txt file containing thresh data
    :type thresh_file: str
    :returns None, saves output in the format
        short_patient + '_{0}_pr_with_ML_and_pose'.format(emotion)
    """
    # Default cache location for the per-threshold statistics.
    if not thresh_file:
        thresh_file = short_patient + '_threshes.txt'
    # Load cached thresholds if present; otherwise start empty.
    thresh_dict = json.load(
        open(thresh_file)) if os.path.exists(thresh_file) else {}
    if not thresh_dict:
        # Cache miss: compute stats for 100 thresholds in [0, 1.5] on a
        # process pool, collecting partial results through a managed queue.
        out_q = multiprocessing.Manager().Queue()
        threshes = np.linspace(0, 1.5, 100)
        bar = ProgressBar(max_value=len(threshes))
        f = functools.partial(thresh_calc, out_q, short_patient)
        for i, _ in enumerate(Pool().imap(f, threshes, chunksize=10)):
            # Drain whatever the workers have produced so far.
            while not out_q.empty():
                thresh_dict.update(out_q.get())
            bar.update(i)
        # Persist for future runs.
        json.dump(thresh_dict, open(thresh_file, 'w'))
    for emotion in ['Happy', 'Angry', 'Sad', 'Disgust']:
        # precision-recall
        out_vals = {}
        # NOTE(review): after a json.load the keys are strings, so
        # sorted() orders them lexicographically, not numerically —
        # confirm whether the curve relies on numeric threshold order.
        for thresh in sorted(thresh_dict.keys()):
            if emotion in thresh_dict[thresh]:
                curr_emote_dict = thresh_dict[thresh][emotion]
                false_pos = curr_emote_dict['false_pos']
                true_pos = curr_emote_dict['true_pos']
                false_neg = curr_emote_dict['false_neg']
                total_pos = true_pos + false_neg
                # Only keep thresholds where precision and recall are
                # both well-defined (non-zero denominators).
                if total_pos and (false_pos + true_pos):
                    precision = true_pos / (false_pos + true_pos)
                    recall = true_pos / total_pos
                    out_vals[thresh] = [precision, recall]
        # x = precision, y = recall, z = the numeric threshold values.
        x_vals = [out_vals[thresh][0] for thresh in sorted(out_vals.keys())]
        y_vals = [out_vals[thresh][1] for thresh in sorted(out_vals.keys())]
        z_vals = [float(x) for x in sorted(out_vals.keys())]
        if x_vals and y_vals and len(x_vals) == len(y_vals):
            fig = plt.figure()
            ax = fig.gca()
            # Curve from the threshold sweep above.
            ax.plot(x_vals, y_vals, label='Substring')
            # Change into the data directory given on the command line
            # via '-d' (raises ValueError if '-d' is absent).
            OpenDir = sys.argv[sys.argv.index('-d') + 1]
            os.chdir(OpenDir)
            au_train, au_test, target_train, target_test = make_emotion_data(
                emotion, short_patient)
            # Estimator instances are used as dict KEYS here; the values
            # are their display names for the plot legend.
            classifier_dict = {
                KNeighborsClassifier(): 'KNeighbors',
                SVC(kernel='linear', probability=True): 'SVCLinear',
                SVC(probability=True): 'SVC',
                # GaussianProcessClassifier(),
                # DecisionTreeClassifier(),
                RandomForestClassifier(): 'RandomForest',
                ExtraTreesClassifier(): 'ExtraTrees',
                MLPClassifier(): 'MLP',
                AdaBoostClassifier(): 'AdaBoost',
                GaussianNB(): 'GaussianNB',
                QuadraticDiscriminantAnalysis(): 'QuadraticDiscriminantAnalysis',
                BernoulliNB(): 'BernoulliNB'
            }
            # Fit each classifier and overlay its precision-recall curve.
            for classifier in classifier_dict.keys():
                expected, decision_function = use_classifier(
                    classifier, au_train, au_test, target_train, target_test)
                precision, recall, thresholds = precision_recall_curve(
                    expected, decision_function)
                ax.plot(precision, recall, label=classifier_dict[classifier])
            ax.set_title('Performance of Different Methods for' + "\' " +
                         emotion + " \'" + 'Recognition from Continuous AUs')
            ax.set_xlabel('Precision')
            ax.set_ylabel('Recall')
            ax.legend()
            fig.tight_layout()
            plt.savefig(short_patient + '_{0}_pr_with_ML_and_pose'.format(emotion))
            plt.close()
def MyQDA():
    """Factory: return a fresh, default-configured QDA classifier."""
    classifier = QuadraticDiscriminantAnalysis()
    return classifier