Example #1
import numpy as np
import pandas as pd
from sklearn import discriminant_analysis, metrics, model_selection


def QDA10():
    myList = []
    data = pd.read_csv("wdbc.data.txt", header=None, sep=r"\s+")
    X = np.array(data)
    Y = X[:, 1]                   # column 1 of the WDBC file is the M/B diagnosis
    Y = np.where(Y == 'M', 1, 0)  # encode malignant as 1, benign as 0
    
    X_new =  X[:,[2,4,5,8,9,15,22,24,25,29]] 
    
    seq = [.9, .8, .5, .25]
    for i in seq:
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X_new, Y, train_size=i, test_size=1-i, random_state=0)
    
        cl=discriminant_analysis.QuadraticDiscriminantAnalysis()                                                                    
        cl.fit(X_train,Y_train)
        Z=cl.predict(X_test)
        scores = metrics.accuracy_score(Y_test, Z)*100
        
        print("Quadratic Discriminant Analysis. Training:" , i*100 ,"%")
        print(metrics.classification_report(Y_test,Z))
        print(metrics.confusion_matrix(Y_test,Z))
        print("Accuracy: %0.2f" % (scores))
        
        myList.append(scores)
        
    return myList
Example #2
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop',
             transformers.ColumnDropper(columns=(6, 7, 8, 11, 12, 13, 14))),
            (
                'scale',
                preprocessing.StandardScaler(
                    with_mean=True,
                    with_std=False  # this is not a typo!
                )),
            #('scale', preprocessing.RobustScaler(
            #    with_centering=True, with_scaling=False, quantile_range=(1.0, 99.0)
            #)),
            ('expand',
             preprocessing.PolynomialFeatures(degree=2,
                                              interaction_only=False,
                                              include_bias=False)),
            ('select',
             feature_selection.SelectPercentile(
                 percentile=98, score_func=feature_selection.f_classif)),
            ('estim',
             discriminant_analysis.QuadraticDiscriminantAnalysis(
                 reg_param=0.0043))
        ])

        pipe.fit(x, y)
        self._model = pipe.predict
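transformers.ColumnDropper is a project-specific transformer rather than part of scikit-learn. A minimal sketch of what it presumably does (drop the listed column indices), written against the standard fit/transform interface:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class ColumnDropper(BaseEstimator, TransformerMixin):
    """Drop the columns with the given indices from a 2-D array."""

    def __init__(self, columns=()):
        self.columns = columns

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        return np.delete(np.asarray(X), self.columns, axis=1)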
Example #3
def get_algorithms():
    MLA_dict = {
        # Ensemble methods
        "ada": ensemble.AdaBoostClassifier(),
        "bc": ensemble.BaggingClassifier(),
        "etc": ensemble.ExtraTreesClassifier(),
        "gbc": ensemble.GradientBoostingClassifier(),
        "rfc": ensemble.RandomForestClassifier(),
        # Gaussian processes
        "gpc": gaussian_process.GaussianProcessClassifier(),
        # Linear models
        "lr": linear_model.LogisticRegressionCV(),
        "pac": linear_model.PassiveAggressiveClassifier(),
        "rcc": linear_model.RidgeClassifierCV(),
        "sgd": linear_model.SGDClassifier(),
        "per": linear_model.Perceptron(),
        # Naive Bayes
        "bnb": naive_bayes.BernoulliNB(),
        "gnb": naive_bayes.GaussianNB(),
        # Nearest neighbour
        "knn": neighbors.KNeighborsClassifier(),
        # SVM
        "svc": svm.SVC(probability=True),
        "nvc": svm.NuSVC(probability=True),
        "lvc": svm.LinearSVC(),
        # Trees
        "dtc": tree.DecisionTreeClassifier(),
        "ets": tree.ExtraTreeClassifier(),
        # Discriminant analysis
        "lda": discriminant_analysis.LinearDiscriminantAnalysis(),
        "qda": discriminant_analysis.QuadraticDiscriminantAnalysis(),
    }
    return MLA_dict
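A dictionary like this is usually consumed in a loop. A small usage sketch (X and y are assumed to be an already-loaded feature matrix and label vector):

from sklearn import model_selection

for name, clf in get_algorithms().items():
    scores = model_selection.cross_val_score(clf, X, y, cv=5)
    print("%s: %.3f +/- %.3f" % (name, scores.mean(), scores.std()))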
Example #4
def cross_validate_model(X_train, Y_train):
    """
	Here we perform cross validation of models to choose the best one.
	"""
    # Split the data in half twice (note: despite the names, y_predict
    # holds held-out labels, not model predictions)
    train, test, y_actual, y_predict = train_test_split(X_train,
                                                        Y_train,
                                                        test_size=0.5,
                                                        random_state=41)
    train_n, test_n, y_actual_n, y_predict_n = train_test_split(X_train,
                                                                Y_train,
                                                                test_size=0.5,
                                                                random_state=0)

    # Random-forest embedding: one-hot encode the RF leaf indices, then
    # fit a logistic regression on the encoded features
    rf = ensemble.RandomForestClassifier(n_estimators=50, max_depth=5)
    rf_enc = OneHotEncoder()
    rf_lm = sklinear.LogisticRegression()
    rf.fit(train, y_actual)
    rf_enc.fit(rf.apply(train))
    rf_lm.fit(rf_enc.transform(rf.apply(test)), y_predict)
    y_predict_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(test_n)))
    mse_rf_lm = metrics.mean_squared_error(y_predict_n, y_predict_rf_lm[:, 1])
    print('MSE RandomForestClassifier followed by LogisticRegression is %f' %
          (mse_rf_lm))

    # List the classification methods to use.
    clf_quaddis = discriminant_analysis.QuadraticDiscriminantAnalysis()
    clf_logreg = sklinear.LogisticRegression(penalty='l1',
                                             solver='liblinear')  # l1 needs liblinear/saga
    clf_random_forest = ensemble.RandomForestClassifier(n_estimators=50,
                                                        max_depth=10)
    clf_adaboost = ensemble.AdaBoostClassifier(n_estimators=50)
    clf_mlpc = neural_network.MLPClassifier()
    clf_extra_tree = ensemble.ExtraTreesClassifier(n_estimators=50,
                                                   bootstrap=True)

    # Put the above methods in a list, which is more amenable to looping
    methods = [
        clf_quaddis, clf_logreg, clf_random_forest, clf_adaboost, clf_mlpc,
        clf_extra_tree
    ]
    methods_label = [
        'clf_quaddis', 'clf_logreg', 'clf_random_forest', 'clf_adaboost',
        'clf_mlpc', 'clf_extra_tree'
    ]

    method_mse = np.zeros((len(methods), 1))
    # Fit and predict for each method
    for i in range(len(methods)):
        methods[i].fit(train, y_actual)
        method_predict = methods[i].predict_proba(test)
        method_mse[i] = metrics.mean_squared_error(y_predict,
                                                   method_predict[:, 1])
        print('MSE for %s while cross validation : %f' %
              (methods_label[i], method_mse[i]))

    # We return the method which has the minimum mse
    return np.argmin(method_mse)
Example #5
def main():
    data = pd.read_csv('data_3_6.csv', names=['x', 'y', 'class'])

    max_x = data['x'].max()
    min_x = data['x'].min()
    max_y = data['y'].max()
    min_y = data['y'].min()

    trans_x = data['x'].transform(lambda x: (x - min_x) / (max_x - min_x))
    trans_y = data['y'].transform(lambda x: (x - min_y) / (max_y - min_y))

    reshape_x = trans_x.values.reshape(-1, 1)
    reshape_y = trans_y.values.reshape(-1, 1)
    reshape_class = data['class'].values  # the reshape(-1, 1).ravel() round-trip is a no-op

    reshape_data = np.append(reshape_y, reshape_x, axis=1)

    nb_classifier = nb.MultinomialNB()
    nb_fit = nb_classifier.fit(reshape_data, reshape_class)
    nb_scores = ms.cross_val_score(nb_fit, reshape_data, reshape_class, cv=10)
    nb_est = ms.cross_val_predict(nb_fit, reshape_data, reshape_class, cv=10)
    nb_conf = met.confusion_matrix(reshape_class, nb_est)
    print("Naive Bayes - Score %f +/-%f" %
          (np.mean(nb_scores), np.std(nb_scores)))
    print(nb_conf, "\n")

    qda_classifier = da.QuadraticDiscriminantAnalysis()
    qda_fit = qda_classifier.fit(reshape_data, reshape_class)
    qda_scores = ms.cross_val_score(qda_fit,
                                    reshape_data,
                                    reshape_class,
                                    cv=10)
    qda_est = ms.cross_val_predict(qda_fit, reshape_data, reshape_class, cv=10)
    qda_conf = met.confusion_matrix(reshape_class, qda_est)
    print("QDA - Score %f +/-%f" % (np.mean(qda_scores), np.std(qda_scores)))
    print(qda_conf, "\n")

    lda_classifier = da.LinearDiscriminantAnalysis()
    lda_fit = lda_classifier.fit(reshape_data, reshape_class)
    lda_scores = ms.cross_val_score(lda_fit,
                                    reshape_data,
                                    reshape_class,
                                    cv=10)
    lda_est = ms.cross_val_predict(lda_fit, reshape_data, reshape_class, cv=10)
    lda_conf = met.confusion_matrix(reshape_class, lda_est)
    print("LDA - Score %f +/-%f" % (np.mean(lda_scores), np.std(lda_scores)))
    print(lda_conf, "\n")

    plt.figure()
    mlxplt.plot_decision_regions(reshape_data, reshape_class, clf=nb_fit)

    plt.figure()
    mlxplt.plot_decision_regions(reshape_data, reshape_class, clf=qda_fit)

    plt.figure()
    mlxplt.plot_decision_regions(reshape_data, reshape_class, clf=lda_fit)

    plt.show()
Example #6
def qda(X_tra, y_tra, X_val, y_val, index_no, classifier_num):

    y_tra, X_tra, y_val, X_val, weights = dataRegulationSKL(
        y_tra, X_tra, y_val, X_val, index_no)

    clf = skdisa.QuadraticDiscriminantAnalysis()

    clf.fit(X_tra, y_tra)
    return processLearning(clf, X_tra, y_tra, X_val, y_val)
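dataRegulationSKL and processLearning are project helpers that are not shown. A hypothetical processLearning consistent with how it is called here would simply report train and validation accuracy:

def processLearning(clf, X_tra, y_tra, X_val, y_val):
    # Hypothetical helper: return train/validation accuracy of a fitted clf.
    return clf.score(X_tra, y_tra), clf.score(X_val, y_val)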
Example #7
 def calc_fitness(self, data, target):
     if self.changed:
         nfolds = 4
         scores = np.zeros(nfolds)
         precision = np.zeros(nfolds)
         recall = np.zeros(nfolds)
         X = np.copy(data)
         # Drop every feature whose genome bit is 0 (iterate from the end
         # so remaining column indices stay valid)
         for i in range(0, len(self.genome)):
             if self.genome[len(self.genome) - 1 - i] == 0:
                 X = np.delete(X, len(self.genome) - 1 - i, 1)
         i = 0
         # sklearn's old cross_validation module was removed in 0.20;
         # StratifiedKFold(n_splits=...) now lives in model_selection
         skf = model_selection.StratifiedKFold(n_splits=nfolds)
         for train, test in skf.split(X, target):
             if self.type == 'dt':
                 self.clf = tree.DecisionTreeClassifier(
                     criterion='entropy',
                     splitter='random').fit(X[train], target[train])
             elif self.type == 'svm':
                 self.clf = svm.SVC(kernel='linear').fit(
                     X[train], target[train])
             elif self.type == 'knn':
                 self.clf = knn.KNeighborsClassifier().fit(
                     X[train], target[train])
             elif self.type == 'lr':
                 self.clf = lm.LogisticRegression().fit(
                     X[train], target[train])
             elif self.type == 'nb':
                 self.clf = nb.GaussianNB().fit(X[train], target[train])
             elif self.type == 'rf':
                 self.clf = ens.RandomForestClassifier().fit(
                     X[train], target[train])
             elif self.type == 'et':
                 self.clf = ens.ExtraTreesClassifier().fit(
                     X[train], target[train])
             elif self.type == 'mlp':
                 self.clf = nn.MLPClassifier(
                     hidden_layer_sizes=(40,
                                         5)).fit(X[train], target[train])
             elif self.type == 'lda':
                 self.clf = da.LinearDiscriminantAnalysis().fit(
                     X[train], target[train])
             elif self.type == 'qda':
                 self.clf = da.QuadraticDiscriminantAnalysis().fit(
                     X[train], target[train])
             else:
                 self.clf = None
             p = self.clf.predict(X[test])
             scores[i] = metrics.accuracy_score(target[test], p)
             precision[i] = metrics.precision_score(target[test], p)
             recall[i] = metrics.recall_score(target[test], p)
             i += 1
         self.accuracy = scores.mean()
         self.std = scores.std()
         self.precision = precision.mean()
         self.recall = recall.mean()
         self.changed = False
Example #8
 def QDA(self, source):  # Oddly, this method never shows up in the HTML?
     min_max_scaler = preprocessing.MinMaxScaler()
     data_source = min_max_scaler.fit_transform(source)
     # As written this cannot run: QuadraticDiscriminantAnalysis accepts no
     # n_components argument, has no fit_transform, and covariance_ only
     # exists after fitting. The variable name and the unsupervised
     # fit_transform call suggest PCA was intended (assumes
     # sklearn.decomposition is imported):
     pca = decomposition.PCA(n_components=2)
     result = {}
     result['data'] = pca.fit_transform(data_source)
     print(pca.get_covariance())
     result['params'] = 0
     return result
Example #9
def deserialize_qda(model_dict):
    model = discriminant_analysis.QuadraticDiscriminantAnalysis(
        **model_dict['params'])

    model.means_ = np.array(model_dict['means_']).astype(np.float64)
    model.priors_ = np.array(model_dict['priors_']).astype(np.float64)
    model.scalings_ = np.array(model_dict['scalings_']).astype(np.float64)
    model.rotations_ = np.array(model_dict['rotations_']).astype(np.float64)
    model.classes_ = np.array(model_dict['classes_']).astype(np.int64)

    return model
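For symmetry, a serializer producing the dict layout that deserialize_qda expects might look like this (a sketch, assuming a JSON-friendly plain-list format is the goal):

def serialize_qda(model):
    # Dump a fitted QDA into the dict layout deserialize_qda reads back.
    return {
        'params': model.get_params(),
        'means_': model.means_.tolist(),
        'priors_': model.priors_.tolist(),
        'scalings_': [s.tolist() for s in model.scalings_],
        'rotations_': [r.tolist() for r in model.rotations_],
        'classes_': model.classes_.tolist(),
    }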
Example #10
def getBestFeaturesForQDA(trainingData):
    x = trainingData.iloc[:, 0:11]
    y = trainingData.iloc[:, 11]
    bestFeatures = sfs(  # sfs: presumably mlxtend's SequentialFeatureSelector
        da.QuadraticDiscriminantAnalysis(),
        k_features="best",
        forward=False,
        floating=False,
        verbose=False,
        scoring='r2',
    ).fit(x, y)
    return bestFeatures.k_feature_names_, bestFeatures.k_feature_idx_
Example #11
def getBestFeaturesForHigherOrderTerms(trainingData, num_features):
    x = trainingData.loc[:, trainingData.columns != 'label']
    y = trainingData.loc[:, 'label']
    bestFeatures = sfs(
        da.QuadraticDiscriminantAnalysis(),
        k_features=num_features,
        forward=True,
        floating=False,
        verbose=2,
        scoring='r2',
    ).fit(x, y)
    return bestFeatures.k_feature_names_
Example #12
    def __init__(self, df, run_prefix, algs_name=None, seed=42):
        # code that will prepare the data
        y = df.PHENO
        X = df.drop(columns=['PHENO'])

        # Split the data
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.3, random_state=seed)  # 70:30
        IDs_train = X_train.ID
        IDs_test = X_test.ID
        X_train = X_train.drop(columns=['ID'])
        X_test = X_test.drop(columns=['ID'])

        # Saving the prepped data the other classes will need
        self.df = df
        self.run_prefix = run_prefix
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.IDs_train = IDs_train
        self.IDs_test = IDs_test

        # Where the results will be stored
        self.log_table = None
        self.best_algo = None
        self.algo = None
        self.rfe_df = None

        # The methods we will use
        if algs_name is None:
            self.algorithms = [
                linear_model.LogisticRegression(solver='lbfgs'),
                ensemble.RandomForestClassifier(n_estimators=100),
                ensemble.AdaBoostClassifier(),
                ensemble.GradientBoostingClassifier(),
                linear_model.SGDClassifier(loss='modified_huber'),
                svm.SVC(probability=True, gamma='scale'),
                neural_network.MLPClassifier(),
                neighbors.KNeighborsClassifier(),
                discriminant_analysis.LinearDiscriminantAnalysis(),
                discriminant_analysis.QuadraticDiscriminantAnalysis(),
                ensemble.BaggingClassifier(),
                xgboost.XGBClassifier()
            ]
        else:
            algorithms = []

            for algo_name in algs_name:
                algorithms.append(self.getAlgorithmFromName(algo_name))

            self.algorithms = algorithms
Example #13
def classification_models():
    """
    Classification Models
    """
    return {
        'kneighbors': neighbors.KNeighborsClassifier(),
        'svc_lin': svm.SVC(kernel='linear', probability=True),
        'svc_rbf': svm.SVC(probability=True),
        'svc_poly': svm.SVC(kernel='poly', degree=2, probability=True),
        'decision_tree': tree.DecisionTreeClassifier(),
        'random_forest': ensemble.RandomForestClassifier(),
        'adaboost': ensemble.AdaBoostClassifier(),
        'gaussian_nb': naive_bayes.GaussianNB(),
        'lin_da': discriminant_analysis.LinearDiscriminantAnalysis(),
        'quad_da': discriminant_analysis.QuadraticDiscriminantAnalysis()
    }
Example #14
def test_models(X, y, repeat_x):
    scores = pd.DataFrame(columns=['LogReg', 'LDA', 'QDA'])
    for i in range(0, repeat_x):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
        model_lr = linear_model.LogisticRegression()
        lr_score = model_fit_score(model_lr, X_train, X_test, y_train, y_test)

        model_lda = discriminant_analysis.LinearDiscriminantAnalysis()
        lda_score = model_fit_score(model_lda, X_train, X_test, y_train, y_test)

        model_qda = discriminant_analysis.QuadraticDiscriminantAnalysis()
        qda_score = model_fit_score(model_qda, X_train, X_test, y_train, y_test)

        i_test_run = pd.DataFrame([[lr_score, lda_score, qda_score]], columns=['LogReg', 'LDA', 'QDA'])
        # DataFrame.append was removed in pandas 2.0; use concat instead
        scores = pd.concat([scores, i_test_run], ignore_index=True)
    return scores
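model_fit_score is not defined in this snippet; a minimal sketch consistent with how it is called (fit on the train split, return the test-split accuracy):

def model_fit_score(model, X_train, X_test, y_train, y_test):
    # Hypothetical helper: fit the model and return its test accuracy.
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)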
Example #15
    def getAlgorithmFromName(self, alg_name):
        algo = None
        if alg_name == 'LogisticRegression':
            algo = linear_model.LogisticRegression(solver='lbfgs')

        elif alg_name == 'RandomForestClassifier':
            algo = ensemble.RandomForestClassifier(n_estimators=100)

        elif alg_name == 'AdaBoostClassifier':
            algo = ensemble.AdaBoostClassifier()

        elif alg_name == 'GradientBoostingClassifier':
            algo = ensemble.GradientBoostingClassifier()

        elif alg_name == 'SGDClassifier':
            algo = linear_model.SGDClassifier(loss='modified_huber')

        elif alg_name == 'SVC':
            algo = svm.SVC(probability=True, gamma='scale')

        elif alg_name == 'MLPClassifier':
            algo = neural_network.MLPClassifier()

        elif alg_name == 'KNeighborsClassifier':
            algo = neighbors.KNeighborsClassifier()

        elif alg_name == 'LinearDiscriminantAnalysis':
            algo = discriminant_analysis.LinearDiscriminantAnalysis()

        elif alg_name == 'QuadraticDiscriminantAnalysis':
            algo = discriminant_analysis.QuadraticDiscriminantAnalysis()

        elif alg_name == 'BaggingClassifier':
            algo = ensemble.BaggingClassifier()

        elif alg_name == 'ComplementNB':
            # assumes sklearn.naive_bayes is imported alongside ensemble
            algo = naive_bayes.ComplementNB()

        elif alg_name == 'XGBClassifier':
            algo = xgboost.XGBClassifier()

        else:
            sys.exit('Algorithm name ' + alg_name +
                     ' incorrect, please check it')

        return algo
Example #16
def ModelSelection(train_data, features, label):
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)
    x_train, x_test, y_train, y_test = train_test_split(train_data[features],
                                                        train_data[label],
                                                        test_size=0.2)
    row_index = 0
    # Collect each algorithm's test-set predictions, keyed by name
    MLA_predict = pd.DataFrame(index=y_test.index)
    for alg in MLA:

        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)
        row_index += 1

    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
Example #17
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop', transformers.ColumnDropper(
                columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
            )),
            ('scale', preprocessing.StandardScaler()),
            ('select', feature_selection.SelectPercentile()),
            ('estim', discriminant_analysis.QuadraticDiscriminantAnalysis()),
        ])

        param_grid = [{
            'scale__with_mean': [True],
            'scale__with_std': [True],

            #'select__percentile': [i for i in range(40, 81, 3)],
            'select__percentile': [i for i in range(40, 51)],
            'select__score_func': [
                feature_selection.f_classif,
                feature_selection.mutual_info_classif
            ],

            'estim__reg_param': [0.1 + 0.025 * i for i in range(-1, 2)]
        }]

        grid = model_selection.GridSearchCV(
            pipe, cv=9, n_jobs=16, param_grid=param_grid, verbose=1,
            scoring=metrics.make_scorer(metrics.accuracy_score),
        )
        grid.fit(x, y)

        print('Optimal Hyperparameters:')
        print('=======================')
        for step in grid.best_estimator_.steps:
            print(step)
        print("CV Score:", grid.best_score_)

        # note: this inspects the template pipe, not grid.best_estimator_;
        # QDA never exposes transduction_, so this branch is a no-op here
        estimator = pipe.named_steps['estim']
        if hasattr(estimator, 'transduction_'):
            self._transduction = estimator.transduction_
        self._model = grid.predict
Example #18
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop',
             transformers.ColumnDropper(columns=(6, 7, 8, 11, 12, 13, 14))),
            #('select', feature_selection.SelectKBest()),
            ('scale', preprocessing.StandardScaler()),
            ('expand', preprocessing.PolynomialFeatures()),
            ('estim', discriminant_analysis.QuadraticDiscriminantAnalysis()),
        ])

        param_grid = [{
            #'select__k': [i for i in range(15, 21)],
            #'select__score_func': [feature_selection.f_classif],
            'scale__with_mean': [True, False],
            'scale__with_std': [True],
            'expand__include_bias': [False, True],
            'expand__interaction_only': [False, True],
            'expand__degree': [1, 2]

            #'estim__reg_param': [0.5]
            #'estim__alpha': list(0.001 + 1 * i for i in range(0, 5))
        }]

        grid = model_selection.GridSearchCV(
            pipe,
            cv=10,
            n_jobs=1,
            param_grid=param_grid,
            verbose=1,
            scoring=metrics.make_scorer(metrics.accuracy_score),
        )
        grid.fit(x, y)

        print('Optimal Hyperparameters:')
        print('=======================')
        for step in grid.best_estimator_.steps:
            print(step)
        print("CV Score:", grid.best_score_)

        self._model = grid.predict
Example #19
def all_classifiers():
    # Model Data
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]
    return MLA
Example #20
 def __init__(self,
              type="linear_regression",
              regularization=False,
              n_estimators=100,
              subsample=1.0,
              max_depth=3,
              c=80,
              e=0.001):
     if type == "linear_regression":
         self.model = linear_model.LinearRegression(normalize=True)
     elif type == "ridge":
         self.model = linear_model.Ridge()
     elif type == "SVM":
         self.model = svm.SVR(kernel='rbf', gamma='auto', C=c, epsilon=e)
     elif type == 'XGBoost':
         # note: despite the label, this is sklearn's GradientBoostingRegressor,
         # not the xgboost library
         self.model = ensemble.GradientBoostingRegressor(
             n_estimators=n_estimators,
             subsample=subsample,
             max_depth=max_depth)
     elif type == 'BaggingRegressor':
         self.model = ensemble.BaggingRegressor()
     elif type == 'RandomForest':
         self.model = ensemble.RandomForestRegressor(
             n_estimators=n_estimators, max_depth=max_depth)
     elif type == "AdaBoostRegressor":
         self.model = ensemble.AdaBoostRegressor(n_estimators=n_estimators)
     elif type == 'ExtraTreesRegressor':
         self.model = ensemble.ExtraTreesRegressor(
             n_estimators=n_estimators, max_depth=max_depth)
     elif type == 'Lasso':
         self.model = linear_model.Lasso()
     elif type == "qda":
         self.model = discriminant_analysis.QuadraticDiscriminantAnalysis()
     elif type == "lda":
         self.model = discriminant_analysis.LinearDiscriminantAnalysis()
     elif type == 'XGBoost with Bagging':
         # base_estimator was renamed to estimator in scikit-learn 1.2
         self.model = ensemble.BaggingRegressor(
             estimator=ensemble.GradientBoostingRegressor(
                 n_estimators=100, subsample=1.0, max_depth=3),
             n_estimators=n_estimators)
     elif type == "Gaussian Process":
         self.model = gaussian_process.GaussianProcessRegressor()
Example #21
    def __init__(self, df, run_prefix, max_iter, cv_count):
        self.run_prefix = run_prefix
        self.max_iter = max_iter
        self.cv_count = cv_count

        self.y_tune = df.PHENO
        self.IDs_tune = df.ID
        self.X_tune = df.drop(columns=['PHENO', 'ID'])

        best_algo_name_in = run_prefix + '.best_algorithm.txt'
        best_algo_df = pd.read_csv(best_algo_name_in,
                                   header=None,
                                   index_col=False)
        self.best_algo = str(best_algo_df.iloc[0, 0])

        self.algorithms = [
            linear_model.LogisticRegression(),
            ensemble.RandomForestClassifier(),
            ensemble.AdaBoostClassifier(),
            ensemble.GradientBoostingClassifier(),
            linear_model.SGDClassifier(loss='modified_huber'),
            svm.SVC(probability=True),
            neural_network.MLPClassifier(),
            neighbors.KNeighborsClassifier(),
            discriminant_analysis.LinearDiscriminantAnalysis(),
            discriminant_analysis.QuadraticDiscriminantAnalysis(),
            ensemble.BaggingClassifier(),
            xgboost.XGBClassifier()
        ]
        self.log_table = None
        self.best_algo_name_in = None
        self.best_algo_df = None
        self.hyperparameters = None
        self.scoring_metric = None
        self.cv_tuned = None
        self.cv_baseline = None
        self.algo = None
        self.searchCVResults = None
        self.rand_search = None
        self.algo_tuned = None
        self.tune_out = None
Example #22
    def _train(self):
        x = self._train_features
        y = self._train_outputs

        pipe = pipeline.Pipeline([
            ('drop', transformers.ColumnDropper(
                columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
            )),
            ('scale', preprocessing.StandardScaler(
                with_mean=True,
                with_std=True
            )),
            ('select', feature_selection.SelectPercentile(
                percentile=46,
                score_func=feature_selection.mutual_info_classif
            )),
            ('estim', discriminant_analysis.QuadraticDiscriminantAnalysis(
                reg_param=0.1
            ))
        ])

        pipe.fit(x, y)
        self._model = pipe.predict
Example #23
def testPCAOnDifferentClassifiers():
    qda = da.QuadraticDiscriminantAnalysis()
    trainingX, trainingY, testingX, testingY = getPCATraingAndTesting(featurePercentageThreshold)
    qda.fit(trainingX, trainingY)

    score = qda.score(testingX, testingY)
    print(f'QDA score: {score}')

    rfc = RandomForestClassifier(n_estimators=500)
    rfc.fit(trainingX, trainingY)

    score = rfc.score(testingX, testingY)
    print(f'RandomForests: {score}')

    supportClf = svm.LinearSVC()
    supportClf.fit(trainingX, trainingY)
    score = supportClf.score(testingX, testingY)
    print(f'SVC Score: {score}')

    kNeighbor = KNeighborsClassifier()
    kNeighbor.fit(trainingX, trainingY)
    score = kNeighbor.score(testingX, testingY)
    print(f'KNearestNeighbors Score: {score}')
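getPCATraingAndTesting (sic) comes from the surrounding project and is not shown. A hypothetical sketch consistent with how it is called, keeping enough principal components to explain the given fraction of variance (X and y are assumed to be loaded at module level):

def getPCATraingAndTesting(variance_threshold):
    # A float n_components makes PCA keep just enough components to
    # explain that fraction of the variance.
    from sklearn import decomposition, model_selection
    trainX, testX, trainY, testY = model_selection.train_test_split(
        X, y, test_size=0.3, random_state=0)
    pca = decomposition.PCA(n_components=variance_threshold).fit(trainX)
    return pca.transform(trainX), trainY, pca.transform(testX), testY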
Example #24
########################################################################################
from sklearn import decomposition, discriminant_analysis

# set the correct path; the file is in repo/umucv/data
mnist = np.load("../../../data/mnist.npz")
xl, yl, xt, yt = [mnist[d] for d in ['xl', 'yl', 'xt', 'yt']]
cl = np.argmax(yl, axis=1)
ct = np.argmax(yt, axis=1)

transformer = decomposition.PCA(n_components=40).fit(xl)

xrl = transformer.transform(xl)
xrt = transformer.transform(xt)

maq = discriminant_analysis.QuadraticDiscriminantAnalysis(
    store_covariance=True).fit(xrl, cl)

print((maq.predict(xrt) == ct).mean())


def classifyG(xs):
    t = np.array(xs).reshape(-1, 28 * 28)
    p = maq.predict_proba(transformer.transform(t))
    r = np.argmax(p, axis=1)
    pm = np.max(p, axis=1)
    return r, pm


########################################################################################

# we pick the convolutional network
Example #25
from sklearn import discriminant_analysis
from sklearn import tree
from sklearn import neighbors
from sklearn.metrics import accuracy_score

#Data and labels [height, weight, shoe size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40], [190, 90, 47], [175, 64, 39],
     [177, 70, 40], [159, 55, 37], [171, 75, 42], [181, 85, 43]]
Y = ['male', 'male', 'female', 'female', 'male', 'male', 'female', 'female', 'female', 'male', 'male']

#Classifiers
clf1 = discriminant_analysis.QuadraticDiscriminantAnalysis()
clf2 = tree.DecisionTreeClassifier()
clf3 = neighbors.KNeighborsClassifier()

#Train Model
clf1 = clf1.fit(X,Y)
clf2 = clf2.fit(X,Y)
clf3 = clf3.fit(X,Y)

_X=[[184,84,44],[198,92,48],[183,83,44],[166,47,36],[170,60,38],[172,64,39],[182,80,42],[180,80,43]]
_Y=['male','male','male','female','female','female','male','male']

#Prediction
prediction1 = clf1.predict(_X)
prediction2 = clf2.predict(_X)
prediction3 = clf3.predict(_X)

#Result
r1 = accuracy_score(_Y,prediction1)
r2 = accuracy_score(_Y,prediction2)
r3 = accuracy_score(_Y,prediction3)
Example #26
def train_qda(allData):
    Y = np.array(allData['label'])
    X = np.array(allData.loc[:, allData.columns != 'label'])
    clf = da.QuadraticDiscriminantAnalysis()
    clf.fit(X, Y)
    return clf
Example #27
    # bestFeaturesQda = train_qda(bestFeaturesTrainingData)

    # testQda(bestFeaturesQda, bestFeaturesTestingData, "With forward subset selection")

    # multDf = pd.read_csv(os.path.dirname(os.path.abspath(__file__))+'/data/TrainData_Multiplicative.csv')
    # multTraining, multTesting = partionData(multDf, .8)

    # bestFeatures = getBestFeaturesForHigherOrderTerms(multTraining, 11)
    # #bestFeatures = list(['volatile acidity*pH*', 'density*alcohol*', 'volatile acidity*citric acid*pH*', 'volatile acidity*density*sulphates*', 'free sulfur dioxide*pH*alcohol*', 'volatile acidity*total sulfur dioxide*density*sulphates*', 'citric acid*residual sugar*density*sulphates*alcohol*'])
    # bestDfX = multTraining.loc[:,bestFeatures]
    # trainingY = multTraining['label']
    # bestDfX.insert(loc = len(bestDfX.columns),column='label', value=trainingY)
    # bestFeaturesQda = train_qda(bestDfX)

    # testingY = multTesting.loc[:,'label']
    # bestDfTesting = multTesting.loc[:, bestFeatures]
    # bestDfTesting.insert(loc = len(bestDfTesting.columns),column='label', value=testingY)

    # testQda(bestFeaturesQda,bestDfTesting,f'Testing with labels {bestFeatures}')

    # print(f'Test\n {bestDfTesting}\nTestY\n{trainingY}')

    #Run QDA on PCA data
    qda = da.QuadraticDiscriminantAnalysis()
    trainingX, trainingY, testingX, testingY = PCA.getPCATraingAndTesting(.95)
    qda.fit(trainingX, trainingY)

    score = qda.score(testingX, testingY)
    print(score)
Example #28
def run_param_search(model_name, X, y, scale_data=True):
    if scale_data:
        X = scale(X)

    if model_name == 'LogisticRegression':
        # liblinear supports both the l1 and l2 penalties searched below
        model = linear_model.LogisticRegression(solver='liblinear')
        params = {
            'penalty': ['l1', 'l2'],
            'C': stats.lognorm(s=3),
        }

    if model_name == 'LDA':
        model = discriminant_analysis.LinearDiscriminantAnalysis(solver='lsqr')
        params = {
            'shrinkage': stats.uniform(loc=0, scale=1)
        }

    if model_name == 'QDA':
        model = discriminant_analysis.QuadraticDiscriminantAnalysis()
        params = {
            'reg_param': stats.uniform(loc=0, scale=1)
        }

    if model_name == 'SVM':
        # the polynomial kernel appears to be numerically unstable, and I
        # could not consistently get it to work
        model = svm.SVC()
        params = {
            'C': stats.lognorm(s=2),
            'kernel': ['rbf', 'sigmoid'],
        }

    if model_name == 'AdaBoost':
        model = ensemble.AdaBoostClassifier()
        params = {
            'n_estimators': stats.randint(low=100, high=1500),
            'learning_rate': stats.uniform(loc=0.5, scale=0.5),
        }

    if model_name == 'GradientBoosting':
        model = ensemble.GradientBoostingClassifier()
        params = {
            'n_estimators': stats.randint(low=100, high=1500),
            'learning_rate': stats.uniform(loc=0.05, scale=0.95),
            'max_depth': stats.randint(low=3, high=8),
            'subsample': stats.uniform(loc=0.5, scale=0.5),
        }

    if model_name == 'RandomForest':
        model = ensemble.RandomForestClassifier()
        params = {
            'n_estimators': stats.randint(low=100, high=1000),
            'max_features': stats.randint(low=1, high=12),
            'min_samples_leaf': stats.randint(low=1, high=10),
        }

    if model_name == 'ExtraTrees':
        model = ensemble.ExtraTreesClassifier()
        params = {
            'n_estimators': stats.randint(low=100, high=1000),
            'max_features': stats.randint(low=1, high=12),
            'min_samples_leaf': stats.randint(low=1, high=10),
        }

    param_search = (model_selection
                    .RandomizedSearchCV(estimator=model,
                                        param_distributions=params,
                                        n_iter=200,
                                        cv=10,
                                        n_jobs=4,
                                        return_train_score=True,
                                        verbose=1))

    param_search.fit(X, y)
    best_param_indices = np.argsort(-param_search
                                    .cv_results_['mean_test_score'])[0:10]

    return (param_search, best_param_indices)
Example #29
 def get_skl_estimator(self, **default_parameters):
     return discriminant_analysis.QuadraticDiscriminantAnalysis(
         **default_parameters)
Example #30

MLA = [    
        # Generalized Linear Models
        LogisticRegressionCV(),
    
        # SVM
        svm.SVC(probability = True),
        svm.LinearSVC(),
    
        # KNN
        neighbors.KNeighborsClassifier(weights='distance'),
    
        #Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
     
        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
    
        #Trees    
        tree.DecisionTreeClassifier(),
    
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier()
    ]