def QDA10():
    myList = []
    data = pd.read_csv("wdbc.data.txt", header=None, sep=r"\s+")
    X = np.array(data)
    Y = X[:, 1]
    Y = np.where(Y == 'M', 1, 0)
    X_new = X[:, [2, 4, 5, 8, 9, 15, 22, 24, 25, 29]]
    seq = [.9, .8, .5, .25]
    for i in seq:
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
            X_new, Y, train_size=i, test_size=1 - i, random_state=0)
        cl = discriminant_analysis.QuadraticDiscriminantAnalysis()
        cl.fit(X_train, Y_train)
        Z = cl.predict(X_test)
        scores = metrics.accuracy_score(Y_test, Z) * 100
        print("Quadratic Discriminant Analysis. Training:", i * 100, "%")
        print(metrics.classification_report(Y_test, Z))
        print(metrics.confusion_matrix(Y_test, Z))
        print("Accuracy: %0.2f" % scores)
        myList.append(scores)
    return myList
def _train(self):
    x = self._train_features
    y = self._train_outputs
    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(
            columns=(6, 7, 8, 11, 12, 13, 14))),
        ('scale', preprocessing.StandardScaler(
            with_mean=True,
            with_std=False  # this is not a typo!
        )),
        # ('scale', preprocessing.RobustScaler(
        #     with_centering=True, with_scaling=False,
        #     quantile_range=(1.0, 99.0))),
        ('expand', preprocessing.PolynomialFeatures(
            degree=2, interaction_only=False, include_bias=False)),
        ('select', feature_selection.SelectPercentile(
            percentile=98, score_func=feature_selection.f_classif)),
        ('estim', discriminant_analysis.QuadraticDiscriminantAnalysis(
            reg_param=0.0043))
    ])
    pipe.fit(x, y)
    self._model = pipe.predict
def get_algorithms():
    MLA_dict = {
        # Ensemble methods
        "ada": ensemble.AdaBoostClassifier(),
        "bc": ensemble.BaggingClassifier(),
        "etc": ensemble.ExtraTreesClassifier(),
        "gbc": ensemble.GradientBoostingClassifier(),
        "rfc": ensemble.RandomForestClassifier(),
        # Gaussian processes
        "gpc": gaussian_process.GaussianProcessClassifier(),
        # Linear models
        "lr": linear_model.LogisticRegressionCV(),
        "pac": linear_model.PassiveAggressiveClassifier(),
        "rcc": linear_model.RidgeClassifierCV(),
        "sgd": linear_model.SGDClassifier(),
        "per": linear_model.Perceptron(),
        # Naive Bayes
        "bnb": naive_bayes.BernoulliNB(),
        "gnb": naive_bayes.GaussianNB(),
        # Nearest neighbours
        "knn": neighbors.KNeighborsClassifier(),
        # SVM
        "svc": svm.SVC(probability=True),
        "nvc": svm.NuSVC(probability=True),
        "lvc": svm.LinearSVC(),
        # Trees
        "dtc": tree.DecisionTreeClassifier(),
        "ets": tree.ExtraTreeClassifier(),
        # Discriminant analysis
        "lda": discriminant_analysis.LinearDiscriminantAnalysis(),
        "qda": discriminant_analysis.QuadraticDiscriminantAnalysis(),
    }
    return MLA_dict
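# A minimal usage sketch (an addition, not part of the original source):
# iterate over the dictionary from get_algorithms() and cross-validate every
# model. `X` and `y` are assumed to be a pre-loaded feature matrix and label
# vector.
from sklearn import model_selection

def score_all_algorithms(X, y, cv=5):
    for name, algo in get_algorithms().items():
        scores = model_selection.cross_val_score(algo, X, y, cv=cv)
        print("%-4s %.3f +/- %.3f" % (name, scores.mean(), scores.std()))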
def cross_validate_model(X_train, Y_train):
    """Cross-validate several models and return the index of the best one."""
    # Divide the training and testing data with two different random seeds
    train, test, y_actual, y_predict = train_test_split(
        X_train, Y_train, test_size=0.5, random_state=41)
    train_n, test_n, y_actual_n, y_predict_n = train_test_split(
        X_train, Y_train, test_size=0.5, random_state=0)

    # Random forest whose leaf indices are one-hot encoded and fed into a
    # logistic regression
    rf = ensemble.RandomForestClassifier(n_estimators=50, max_depth=5)
    rf_enc = OneHotEncoder()
    rf_lm = sklinear.LogisticRegression()
    rf.fit(train, y_actual)
    rf_enc.fit(rf.apply(train))
    rf_lm.fit(rf_enc.transform(rf.apply(test)), y_predict)
    y_predict_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(test_n)))
    mse_rf_lm = metrics.mean_squared_error(y_predict_n, y_predict_rf_lm[:, 1])
    print('MSE RandomForestClassifier followed by LogisticRegression is %f'
          % mse_rf_lm)

    # The classification methods to compare; l1-penalised logistic
    # regression needs the liblinear solver
    clf_quaddis = discriminant_analysis.QuadraticDiscriminantAnalysis()
    clf_logreg = sklinear.LogisticRegression(penalty='l1', solver='liblinear')
    clf_random_forest = ensemble.RandomForestClassifier(n_estimators=50,
                                                        max_depth=10)
    clf_adaboost = ensemble.AdaBoostClassifier(n_estimators=50)
    clf_mlpc = neural_network.MLPClassifier()
    clf_extra_tree = ensemble.ExtraTreesClassifier(n_estimators=50,
                                                   bootstrap=True)

    # Collect the methods in lists; more amenable to looping
    methods = [clf_quaddis, clf_logreg, clf_random_forest,
               clf_adaboost, clf_mlpc, clf_extra_tree]
    methods_label = ['clf_quaddis', 'clf_logreg', 'clf_random_forest',
                     'clf_adaboost', 'clf_mlpc', 'clf_extra_tree']
    method_mse = np.zeros((len(methods), 1))

    # Fit and predict with each method
    for i in range(len(methods)):
        methods[i].fit(train, y_actual)
        method_predict = methods[i].predict_proba(test)
        method_mse[i] = metrics.mean_squared_error(y_predict,
                                                   method_predict[:, 1])
        print('MSE for %s during cross-validation: %f'
              % (methods_label[i], method_mse[i]))

    # Return the method with the minimum MSE
    return np.argmin(method_mse)
def main():
    data = pd.read_csv('data_3_6.csv', names=['x', 'y', 'class'])

    # Min-max scale both coordinates to [0, 1]
    max_x = data['x'].max()
    min_x = data['x'].min()
    max_y = data['y'].max()
    min_y = data['y'].min()
    trans_x = data['x'].transform(lambda v: (v - min_x) / (max_x - min_x))
    trans_y = data['y'].transform(lambda v: (v - min_y) / (max_y - min_y))
    reshape_x = trans_x.values.reshape(-1, 1)
    reshape_y = trans_y.values.reshape(-1, 1)
    reshape_class = data['class'].values.ravel()
    reshape_data = np.append(reshape_y, reshape_x, axis=1)

    nb_classifier = nb.MultinomialNB()
    nb_fit = nb_classifier.fit(reshape_data, reshape_class)
    nb_scores = ms.cross_val_score(nb_fit, reshape_data, reshape_class, cv=10)
    nb_est = ms.cross_val_predict(nb_fit, reshape_data, reshape_class, cv=10)
    nb_conf = met.confusion_matrix(reshape_class, nb_est)
    print("Naive Bayes - Score %f +/-%f" % (np.mean(nb_scores), np.std(nb_scores)))
    print(nb_conf, "\n")

    qda_classifier = da.QuadraticDiscriminantAnalysis()
    qda_fit = qda_classifier.fit(reshape_data, reshape_class)
    qda_scores = ms.cross_val_score(qda_fit, reshape_data, reshape_class, cv=10)
    qda_est = ms.cross_val_predict(qda_fit, reshape_data, reshape_class, cv=10)
    qda_conf = met.confusion_matrix(reshape_class, qda_est)
    print("QDA - Score %f +/-%f" % (np.mean(qda_scores), np.std(qda_scores)))
    print(qda_conf, "\n")

    lda_classifier = da.LinearDiscriminantAnalysis()
    lda_fit = lda_classifier.fit(reshape_data, reshape_class)
    lda_scores = ms.cross_val_score(lda_fit, reshape_data, reshape_class, cv=10)
    lda_est = ms.cross_val_predict(lda_fit, reshape_data, reshape_class, cv=10)
    lda_conf = met.confusion_matrix(reshape_class, lda_est)
    print("LDA - Score %f +/-%f" % (np.mean(lda_scores), np.std(lda_scores)))
    print(lda_conf, "\n")

    # Decision-region plots for the three fitted classifiers
    plt.figure()
    mlxplt.plot_decision_regions(reshape_data, reshape_class, clf=nb_fit)
    plt.figure()
    mlxplt.plot_decision_regions(reshape_data, reshape_class, clf=qda_fit)
    plt.figure()
    mlxplt.plot_decision_regions(reshape_data, reshape_class, clf=lda_fit)
    plt.show()
def qda(X_tra, y_tra, X_val, y_val, index_no, classifier_num):
    y_tra, X_tra, y_val, X_val, weights = dataRegulationSKL(
        y_tra, X_tra, y_val, X_val, index_no)
    clf = skdisa.QuadraticDiscriminantAnalysis()
    clf.fit(X_tra, y_tra)
    return processLearning(clf, X_tra, y_tra, X_val, y_val)
def calc_fitness(self, data, target):
    if self.changed:
        nfolds = 4
        scores = np.zeros(nfolds)
        precision = np.zeros(nfolds)
        recall = np.zeros(nfolds)
        X = np.copy(data)
        # Walk the genome from the end so column indices stay valid while
        # deleting the features that are switched off
        for i in range(0, len(self.genome)):
            if self.genome[len(self.genome) - 1 - i] == 0:
                X = np.delete(X, len(self.genome) - 1 - i, 1)
        i = 0
        # StratifiedKFold with n_splits/split() lives in model_selection,
        # not in the removed cross_validation module
        skf = model_selection.StratifiedKFold(n_splits=nfolds)
        for train, test in skf.split(X, target):
            if self.type == 'dt':
                self.clf = tree.DecisionTreeClassifier(
                    criterion='entropy',
                    splitter='random').fit(X[train], target[train])
            elif self.type == 'svm':
                self.clf = svm.SVC(kernel='linear').fit(X[train], target[train])
            elif self.type == 'knn':
                self.clf = knn.KNeighborsClassifier().fit(X[train], target[train])
            elif self.type == 'lr':
                self.clf = lm.LogisticRegression().fit(X[train], target[train])
            elif self.type == 'nb':
                self.clf = nb.GaussianNB().fit(X[train], target[train])
            elif self.type == 'rf':
                self.clf = ens.RandomForestClassifier().fit(X[train], target[train])
            elif self.type == 'et':
                self.clf = ens.ExtraTreesClassifier().fit(X[train], target[train])
            elif self.type == 'mlp':
                self.clf = nn.MLPClassifier(
                    hidden_layer_sizes=(40, 5)).fit(X[train], target[train])
            elif self.type == 'lda':
                self.clf = da.LinearDiscriminantAnalysis().fit(X[train], target[train])
            elif self.type == 'qda':
                self.clf = da.QuadraticDiscriminantAnalysis().fit(X[train], target[train])
            else:
                self.clf = None
            p = self.clf.predict(X[test])
            scores[i] = metrics.accuracy_score(target[test], p)
            precision[i] = metrics.precision_score(target[test], p)
            recall[i] = metrics.recall_score(target[test], p)
            i += 1
        self.accuracy = scores.mean()
        self.std = scores.std()
        self.precision = precision.mean()
        self.recall = recall.mean()
        self.changed = False
def QDA(self, source, labels):
    # Original comment: "somehow this method isn't in the HTML at all?"
    # Note: QuadraticDiscriminantAnalysis is a classifier, not a transformer.
    # It accepts no n_components argument (that belongs to LDA), has no
    # fit_transform, and covariance_ only exists after fitting with
    # store_covariance=True. The `labels` parameter is added here because a
    # supervised fit needs class labels.
    min_max_scaler = preprocessing.MinMaxScaler()
    data_source = min_max_scaler.fit_transform(source)
    qda = discriminant_analysis.QuadraticDiscriminantAnalysis(
        store_covariance=True)
    qda.fit(data_source, labels)
    print(qda.covariance_)
    result = {}
    result['data'] = qda.predict(data_source)
    result['params'] = 0
    return result
def deserialize_qda(model_dict):
    model = discriminant_analysis.QuadraticDiscriminantAnalysis(
        **model_dict['params'])
    model.means_ = np.array(model_dict['means_']).astype(np.float64)
    model.priors_ = np.array(model_dict['priors_']).astype(np.float64)
    model.scalings_ = np.array(model_dict['scalings_']).astype(np.float64)
    model.rotations_ = np.array(model_dict['rotations_']).astype(np.float64)
    model.classes_ = np.array(model_dict['classes_']).astype(np.int64)
    return model
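# The matching serializer is not shown in the source; this sketch (an
# assumption that mirrors the dict layout deserialize_qda reads) converts the
# fitted attributes to plain lists so the model survives e.g. a JSON
# round-trip.
def serialize_qda(model):
    return {
        'params': model.get_params(),
        'means_': model.means_.tolist(),
        'priors_': model.priors_.tolist(),
        'scalings_': [s.tolist() for s in model.scalings_],
        'rotations_': [r.tolist() for r in model.rotations_],
        'classes_': model.classes_.tolist(),
    }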
def getBestFeaturesForQDA(trainingData):
    x = trainingData.iloc[:, 0:11]
    y = trainingData.iloc[:, 11]
    bestFeatures = sfs(
        da.QuadraticDiscriminantAnalysis(),
        k_features="best",
        forward=False,
        floating=False,
        verbose=False,
        scoring='r2',
    ).fit(x, y)
    return bestFeatures.k_feature_names_, bestFeatures.k_feature_idx_
def getBestFeaturesForHigherOrderTerms(trainingData, num_features):
    x = trainingData.loc[:, trainingData.columns != 'label']
    y = trainingData.loc[:, 'label']
    bestFeatures = sfs(
        da.QuadraticDiscriminantAnalysis(),
        k_features=num_features,
        forward=True,
        floating=False,
        verbose=2,
        scoring='r2',
    ).fit(x, y)
    return bestFeatures.k_feature_names_
def __init__(self, df, run_prefix, algs_name=None, seed=42):
    # Prepare the data
    y = df.PHENO
    X = df.drop(columns=['PHENO'])

    # Split the data
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=seed)  # 70:30
    IDs_train = X_train.ID
    IDs_test = X_test.ID
    X_train = X_train.drop(columns=['ID'])
    X_test = X_test.drop(columns=['ID'])

    # Saving the prepped data the other classes will need
    self.df = df
    self.run_prefix = run_prefix
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test
    self.IDs_train = IDs_train
    self.IDs_test = IDs_test

    # Where the results will be stored
    self.log_table = None
    self.best_algo = None
    self.algo = None
    self.rfe_df = None

    # The methods we will use
    if algs_name is None:
        self.algorithms = [
            linear_model.LogisticRegression(solver='lbfgs'),
            ensemble.RandomForestClassifier(n_estimators=100),
            ensemble.AdaBoostClassifier(),
            ensemble.GradientBoostingClassifier(),
            linear_model.SGDClassifier(loss='modified_huber'),
            svm.SVC(probability=True, gamma='scale'),
            neural_network.MLPClassifier(),
            neighbors.KNeighborsClassifier(),
            discriminant_analysis.LinearDiscriminantAnalysis(),
            discriminant_analysis.QuadraticDiscriminantAnalysis(),
            ensemble.BaggingClassifier(),
            xgboost.XGBClassifier()
        ]
    else:
        algorithms = []
        for algo_name in algs_name:
            algorithms.append(self.getAlgorithmFromName(algo_name))
        self.algorithms = algorithms
def classification_models():
    """ Classification Models """
    return {
        'kneighbors': neighbors.KNeighborsClassifier(),
        'svc_lin': svm.SVC(kernel='linear', probability=True),
        'svc_rbf': svm.SVC(probability=True),
        'svc_poly': svm.SVC(kernel='poly', degree=2, probability=True),
        'decision_tree': tree.DecisionTreeClassifier(),
        'random_forest': ensemble.RandomForestClassifier(),
        'adaboost': ensemble.AdaBoostClassifier(),
        'gaussian_nb': naive_bayes.GaussianNB(),
        'lin_da': discriminant_analysis.LinearDiscriminantAnalysis(),
        'quad_da': discriminant_analysis.QuadraticDiscriminantAnalysis()
    }
def test_models(X, y, repeat_x):
    scores = pd.DataFrame(columns=['LogReg', 'LDA', 'QDA'])
    for i in range(repeat_x):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
        model_lr = linear_model.LogisticRegression()
        lr_score = model_fit_score(model_lr, X_train, X_test, y_train, y_test)
        model_lda = discriminant_analysis.LinearDiscriminantAnalysis()
        lda_score = model_fit_score(model_lda, X_train, X_test, y_train, y_test)
        model_qda = discriminant_analysis.QuadraticDiscriminantAnalysis()
        qda_score = model_fit_score(model_qda, X_train, X_test, y_train, y_test)
        i_test_run = pd.DataFrame([[lr_score, lda_score, qda_score]],
                                  columns=['LogReg', 'LDA', 'QDA'])
        # DataFrame.append was removed in pandas 2.0; concat instead
        scores = pd.concat([scores, i_test_run], ignore_index=True)
    return scores
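# Usage sketch (an assumption, not in the source; `X` and `y` stand in for a
# pre-loaded dataset): repeat the random split 100 times and summarise the
# per-model accuracy distribution.
scores = test_models(X, y, repeat_x=100)
print(scores.describe().loc[['mean', 'std']])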
def getAlgorithmFromName(self, alg_name):
    algo = None
    if alg_name == 'LogisticRegression':
        algo = linear_model.LogisticRegression(solver='lbfgs')
    elif alg_name == 'RandomForestClassifier':
        algo = ensemble.RandomForestClassifier(n_estimators=100)
    elif alg_name == 'AdaBoostClassifier':
        algo = ensemble.AdaBoostClassifier()
    elif alg_name == 'GradientBoostingClassifier':
        algo = ensemble.GradientBoostingClassifier()
    elif alg_name == 'SGDClassifier':
        algo = linear_model.SGDClassifier(loss='modified_huber')
    elif alg_name == 'SVC':
        algo = svm.SVC(probability=True, gamma='scale')
    elif alg_name == 'MLPClassifier':
        algo = neural_network.MLPClassifier()
    elif alg_name == 'KNeighborsClassifier':
        algo = neighbors.KNeighborsClassifier()
    elif alg_name == 'LinearDiscriminantAnalysis':
        algo = discriminant_analysis.LinearDiscriminantAnalysis()
    elif alg_name == 'QuadraticDiscriminantAnalysis':
        algo = discriminant_analysis.QuadraticDiscriminantAnalysis()
    elif alg_name == 'BaggingClassifier':
        algo = ensemble.BaggingClassifier()
    elif alg_name == 'ComplementNB':
        algo = naive_bayes.ComplementNB()
    elif alg_name == 'XGBClassifier':
        algo = xgboost.XGBClassifier()
    else:
        sys.exit('Algorithm name ' + alg_name + ' is incorrect, please check it')
    return algo
def ModelSelection(train_data, features, label):
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    x_train, x_test, y_train, y_test = train_test_split(
        train_data[features], train_data[label], test_size=0.2)

    row_index = 0
    # Collect each algorithm's test-set predictions, aligned with x_test
    MLA_predict = pd.DataFrame(index=x_test.index)
    MLA_predict[label] = y_test
    for alg in MLA:
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)
        row_index += 1

    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
def _train(self):
    x = self._train_features
    y = self._train_outputs

    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(
            columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
        )),
        ('scale', preprocessing.StandardScaler()),
        ('select', feature_selection.SelectPercentile()),
        ('estim', discriminant_analysis.QuadraticDiscriminantAnalysis()),
    ])

    param_grid = [{
        'scale__with_mean': [True],
        'scale__with_std': [True],
        # 'select__percentile': [i for i in range(40, 81, 3)],
        'select__percentile': [i for i in range(40, 51)],
        'select__score_func': [
            feature_selection.f_classif,
            feature_selection.mutual_info_classif
        ],
        'estim__reg_param': [0.1 + 0.025 * i for i in range(-1, 2)]
    }]

    grid = model_selection.GridSearchCV(
        pipe,
        cv=9,
        n_jobs=16,
        param_grid=param_grid,
        verbose=1,
        scoring=metrics.make_scorer(metrics.accuracy_score),
    )
    grid.fit(x, y)

    print('Optimal hyperparameters:')
    print('========================')
    for step in grid.best_estimator_.steps:
        print(step)
    print("CV Score:", grid.best_score_)

    # Read the fitted estimator from the refitted best pipeline;
    # GridSearchCV clones `pipe`, so `pipe` itself is never fitted.
    estimator = grid.best_estimator_.named_steps['estim']
    if hasattr(estimator, 'transduction_'):
        self._transduction = estimator.transduction_
    self._model = grid.predict
def _train(self):
    x = self._train_features
    y = self._train_outputs

    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(
            columns=(6, 7, 8, 11, 12, 13, 14))),
        # ('select', feature_selection.SelectKBest()),
        ('scale', preprocessing.StandardScaler()),
        ('expand', preprocessing.PolynomialFeatures()),
        ('estim', discriminant_analysis.QuadraticDiscriminantAnalysis()),
    ])

    param_grid = [{
        # 'select__k': [i for i in range(15, 21)],
        # 'select__score_func': [feature_selection.f_classif],
        'scale__with_mean': [True, False],
        'scale__with_std': [True],
        'expand__include_bias': [False, True],
        'expand__interaction_only': [False, True],
        'expand__degree': [1, 2]
        # 'estim__reg_param': [0.5]
        # 'estim__alpha': list(0.001 + 1 * i for i in range(0, 5))
    }]

    grid = model_selection.GridSearchCV(
        pipe,
        cv=10,
        n_jobs=1,
        param_grid=param_grid,
        verbose=1,
        scoring=metrics.make_scorer(metrics.accuracy_score),
    )
    grid.fit(x, y)

    print('Optimal hyperparameters:')
    print('========================')
    for step in grid.best_estimator_.steps:
        print(step)
    print("CV Score:", grid.best_score_)

    self._model = grid.predict
def all_classifiers():
    # Model Data
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]
    return MLA
def __init__(self, type="linear_regression", regularization=False,
             n_estimators=100, subsample=1.0, max_depth=3, c=80, e=0.001):
    if type == "linear_regression":
        # the normalize flag was removed in scikit-learn 1.2; standardize
        # the inputs beforehand if that behaviour is needed
        self.model = linear_model.LinearRegression()
    elif type == "ridge":
        self.model = linear_model.Ridge()
    elif type == "SVM":
        self.model = svm.SVR(kernel='rbf', gamma='auto', C=c, epsilon=e)
    elif type == 'XGBoost':
        # despite the label, this is scikit-learn's gradient boosting,
        # not the xgboost library
        self.model = ensemble.GradientBoostingRegressor(
            n_estimators=n_estimators, subsample=subsample,
            max_depth=max_depth)
    elif type == 'BaggingRegressor':
        self.model = ensemble.BaggingRegressor()
    elif type == 'RandomForest':
        self.model = ensemble.RandomForestRegressor(
            n_estimators=n_estimators, max_depth=max_depth)
    elif type == "AdaBoostRegressor":
        self.model = ensemble.AdaBoostRegressor(n_estimators=n_estimators)
    elif type == 'ExtraTreesRegressor':
        self.model = ensemble.ExtraTreesRegressor(
            n_estimators=n_estimators, max_depth=max_depth)
    elif type == 'Lasso':
        self.model = linear_model.Lasso()
    elif type == "qda":
        self.model = discriminant_analysis.QuadraticDiscriminantAnalysis()
    elif type == "lda":
        self.model = discriminant_analysis.LinearDiscriminantAnalysis()
    elif type == 'XGBoost with Bagging':
        self.model = ensemble.BaggingRegressor(
            base_estimator=ensemble.GradientBoostingRegressor(
                n_estimators=100, subsample=1.0, max_depth=3),
            n_estimators=n_estimators)
    elif type == "Gaussian Process":
        self.model = gaussian_process.GaussianProcessRegressor()
def __init__(self, df, run_prefix, max_iter, cv_count):
    self.run_prefix = run_prefix
    self.max_iter = max_iter
    self.cv_count = cv_count

    self.y_tune = df.PHENO
    self.IDs_tune = df.ID
    self.X_tune = df.drop(columns=['PHENO', 'ID'])

    best_algo_name_in = run_prefix + '.best_algorithm.txt'
    best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
    self.best_algo = str(best_algo_df.iloc[0, 0])

    self.algorithms = [
        linear_model.LogisticRegression(),
        ensemble.RandomForestClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
        linear_model.SGDClassifier(loss='modified_huber'),
        svm.SVC(probability=True),
        neural_network.MLPClassifier(),
        neighbors.KNeighborsClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
        ensemble.BaggingClassifier(),
        xgboost.XGBClassifier()
    ]

    self.log_table = None
    self.best_algo_name_in = None
    self.best_algo_df = None
    self.hyperparameters = None
    self.scoring_metric = None
    self.cv_tuned = None
    self.cv_baseline = None
    self.algo = None
    self.searchCVResults = None
    self.rand_search = None
    self.algo_tuned = None
    self.tune_out = None
def _train(self):
    x = self._train_features
    y = self._train_outputs

    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(
            columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
        )),
        ('scale', preprocessing.StandardScaler(
            with_mean=True,
            with_std=True
        )),
        ('select', feature_selection.SelectPercentile(
            percentile=46,
            score_func=feature_selection.mutual_info_classif
        )),
        ('estim', discriminant_analysis.QuadraticDiscriminantAnalysis(
            reg_param=0.1
        ))
    ])

    pipe.fit(x, y)
    self._model = pipe.predict
def testPCAOnDifferentClassifiers():
    qda = da.QuadraticDiscriminantAnalysis()
    trainingX, trainingY, testingX, testingY = getPCATraingAndTesting(
        featurePercentageThreshold)
    qda.fit(trainingX, trainingY)
    score = qda.score(testingX, testingY)
    print(f'QDA score: {score}')

    rfc = RandomForestClassifier(n_estimators=500)
    rfc.fit(trainingX, trainingY)
    score = rfc.score(testingX, testingY)
    print(f'RandomForests: {score}')

    supportClf = svm.LinearSVC()
    supportClf.fit(trainingX, trainingY)
    score = supportClf.score(testingX, testingY)
    print(f'SVC Score: {score}')

    kNeighbor = KNeighborsClassifier()
    kNeighbor.fit(trainingX, trainingY)
    score = kNeighbor.score(testingX, testingY)
    print(f'KNearestNeighbors Score: {score}')
########################################################################################

from sklearn import decomposition, discriminant_analysis

# Set the correct path; the file lives in repo/umucv/data
mnist = np.load("../../../data/mnist.npz")
xl, yl, xt, yt = [mnist[d] for d in ['xl', 'yl', 'xt', 'yt']]
cl = np.argmax(yl, axis=1)
ct = np.argmax(yt, axis=1)

transformer = decomposition.PCA(n_components=40).fit(xl)
xrl = transformer.transform(xl)
xrt = transformer.transform(xt)

maq = discriminant_analysis.QuadraticDiscriminantAnalysis(
    store_covariance=True).fit(xrl, cl)
print((maq.predict(xrt) == ct).mean())

def classifyG(xs):
    t = np.array(xs).reshape(-1, 28 * 28)
    p = maq.predict_proba(transformer.transform(t))
    r = np.argmax(p, axis=1)
    pm = np.max(p, axis=1)
    return r, pm

########################################################################################

# We pick the convolutional network
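# Usage sketch for the classifyG helper defined above (an addition, not in
# the source): classify a few test digits and print each prediction with its
# posterior probability.
labels, confidences = classifyG(xt[:5])
for lab, conf in zip(labels, confidences):
    print(f"predicted digit {lab} (p = {conf:.2f})")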
from sklearn import discriminant_analysis
from sklearn import tree
from sklearn import neighbors
from sklearn.metrics import accuracy_score

# Data and labels: [height, weight, shoe size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37],
     [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40],
     [159, 55, 37], [171, 75, 42], [181, 85, 43]]
Y = ['male', 'male', 'female', 'female', 'male', 'male', 'female',
     'female', 'female', 'male', 'male']

# Classifiers
clf1 = discriminant_analysis.QuadraticDiscriminantAnalysis()
clf2 = tree.DecisionTreeClassifier()
clf3 = neighbors.KNeighborsClassifier()

# Train the models
clf1 = clf1.fit(X, Y)
clf2 = clf2.fit(X, Y)
clf3 = clf3.fit(X, Y)

# Held-out test samples
_X = [[184, 84, 44], [198, 92, 48], [183, 83, 44], [166, 47, 36],
      [170, 60, 38], [172, 64, 39], [182, 80, 42], [180, 80, 43]]
_Y = ['male', 'male', 'male', 'female', 'female', 'female', 'male', 'male']

# Prediction
prediction1 = clf1.predict(_X)
prediction2 = clf2.predict(_X)
prediction3 = clf3.predict(_X)

# Result
r1 = accuracy_score(_Y, prediction1)
r2 = accuracy_score(_Y, prediction2)
r3 = accuracy_score(_Y, prediction3)
print(f"QDA: {r1:.2f}, DecisionTree: {r2:.2f}, KNeighbors: {r3:.2f}")
def train_qda(allData):
    Y = np.array(allData['label'])
    X = np.array(allData.loc[:, allData.columns != 'label'])
    clf = da.QuadraticDiscriminantAnalysis()
    clf.fit(X, Y)
    return clf
# bestFeaturesQda = train_qda(bestFeaturesTrainingData)
# testQda(bestFeaturesQda, bestFeaturesTestingData, "With forward subset selection")

# multDf = pd.read_csv(os.path.dirname(os.path.abspath(__file__))
#                      + '/data/TrainData_Multiplicative.csv')
# multTraining, multTesting = partionData(multDf, .8)
# bestFeatures = getBestFeaturesForHigherOrderTerms(multTraining, 11)
# #bestFeatures = list(['volatile acidity*pH*', 'density*alcohol*',
# #    'volatile acidity*citric acid*pH*', 'volatile acidity*density*sulphates*',
# #    'free sulfur dioxide*pH*alcohol*',
# #    'volatile acidity*total sulfur dioxide*density*sulphates*',
# #    'citric acid*residual sugar*density*sulphates*alcohol*'])
# bestDfX = multTraining.loc[:, bestFeatures]
# trainingY = multTraining['label']
# bestDfX.insert(loc=len(bestDfX.columns), column='label', value=trainingY)
# bestFeaturesQda = train_qda(bestDfX)
# testingY = multTesting.loc[:, 'label']
# bestDfTesting = multTesting.loc[:, bestFeatures]
# bestDfTesting.insert(loc=len(bestDfTesting.columns), column='label', value=testingY)
# testQda(bestFeaturesQda, bestDfTesting, f'Testing with labels {bestFeatures}')
# print(f'Test\n {bestDfTesting}\nTestY\n{trainingY}')

# Run QDA on PCA data
qda = da.QuadraticDiscriminantAnalysis()
trainingX, trainingY, testingX, testingY = PCA.getPCATraingAndTesting(.95)
qda.fit(trainingX, trainingY)
score = qda.score(testingX, testingY)
print(score)
def run_param_search(model_name, X, y, scale_data=True):
    if scale_data:
        X = scale(X)

    if model_name == 'LogisticRegression':
        # liblinear supports both the l1 and l2 penalties searched below
        model = linear_model.LogisticRegression(solver='liblinear')
        params = {
            'penalty': ['l1', 'l2'],
            'C': stats.lognorm(s=3),
        }
    if model_name == 'LDA':
        model = discriminant_analysis.LinearDiscriminantAnalysis(solver='lsqr')
        params = {
            'shrinkage': stats.uniform(loc=0, scale=1),
        }
    if model_name == 'QDA':
        model = discriminant_analysis.QuadraticDiscriminantAnalysis()
        params = {
            'reg_param': stats.uniform(loc=0, scale=1),
        }
    if model_name == 'SVM':
        # the polynomial kernel appears to be numerically unstable, and I
        # could not consistently get it to work
        model = svm.SVC()
        params = {
            'C': stats.lognorm(s=2),
            'kernel': ['rbf', 'sigmoid'],
        }
    if model_name == 'AdaBoost':
        model = ensemble.AdaBoostClassifier()
        params = {
            'n_estimators': stats.randint(low=100, high=1500),
            'learning_rate': stats.uniform(loc=0.5, scale=0.5),
        }
    if model_name == 'GradientBoosting':
        model = ensemble.GradientBoostingClassifier()
        params = {
            'n_estimators': stats.randint(low=100, high=1500),
            'learning_rate': stats.uniform(loc=0.05, scale=0.95),
            'max_depth': stats.randint(low=3, high=8),
            'subsample': stats.uniform(loc=0.5, scale=0.5),
        }
    if model_name == 'RandomForest':
        model = ensemble.RandomForestClassifier()
        params = {
            'n_estimators': stats.randint(low=100, high=1000),
            'max_features': stats.randint(low=1, high=12),
            'min_samples_leaf': stats.randint(low=1, high=10),
        }
    if model_name == 'ExtraTrees':
        model = ensemble.ExtraTreesClassifier()
        params = {
            'n_estimators': stats.randint(low=100, high=1000),
            'max_features': stats.randint(low=1, high=12),
            'min_samples_leaf': stats.randint(low=1, high=10),
        }

    param_search = model_selection.RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=200,
        cv=10,
        n_jobs=4,
        return_train_score=True,
        verbose=1)
    param_search.fit(X, y)
    best_param_indices = np.argsort(
        -param_search.cv_results_['mean_test_score'])[0:10]
    return (param_search, best_param_indices)
def get_skl_estimator(self, **default_parameters):
    return discriminant_analysis.QuadraticDiscriminantAnalysis(
        **default_parameters)
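# Usage sketch (an assumption; `wrapper`, `X_train` and `y_train` are
# hypothetical stand-ins for the surrounding class and data, which are not
# shown): any QuadraticDiscriminantAnalysis keyword is forwarded through
# default_parameters.
est = wrapper.get_skl_estimator(reg_param=0.01, store_covariance=True)
est.fit(X_train, y_train)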
MLA = [
    # Generalized Linear Models
    LogisticRegressionCV(),

    # SVM
    svm.SVC(probability=True),
    svm.LinearSVC(),

    # KNN
    neighbors.KNeighborsClassifier(weights='distance'),

    # Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Trees
    tree.DecisionTreeClassifier(),

    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier()
]