Example #1
def instanciate_estimators(clf_type, y=None, **kw):
    if clf_type in ['binary-clf']:
        print('Fraction by class: True: %0.2f; False: %0.2f'
              % (list(y).count(True) / len(y),
                 list(y).count(False) / len(y)))
        cw = 'balanced'
        clfs = [
                # linear_model.LogisticRegressionCV(
                #     class_weight=cw, max_iter=100,
                #     penalty='l2', n_jobs=1),
                linear_model.RidgeClassifierCV(
                    class_weight=cw, cv=3),
                ensemble.GradientBoostingClassifier(
                   n_estimators=100),
                # ensemble.RandomForestClassifier(
                #     n_estimators=100, class_weight=cw)
                # neural_network.MLPClassifier(
                #     hidden_layer_sizes=(100,)),
                # NNetBinaryClassifier(**kw)
                # waiting for data preprocessing to get configs
                ]

    elif clf_type in ['multiclass-clf']:
        print('fraction of the most frequent class:',
              max([list(y).count(x)
                   for x in set(list(y))]) / len(list(y)))
        clfs = [
                # linear_model.LogisticRegressionCV(
                #     max_iter=100, penalty='l2', n_jobs=1),
                linear_model.RidgeClassifierCV(cv=3),
                ensemble.GradientBoostingClassifier(
                    n_estimators=100),
                # ensemble.RandomForestClassifier(
                #     n_estimators=100),
                # neural_network.MLPClassifier(hidden_layer_sizes=(100,)),
                # NNetMultiClassifier(**kw)
                ]
    elif clf_type in ['regression']:
        clfs = [
                linear_model.RidgeCV(cv=3),
                ensemble.GradientBoostingRegressor(
                    n_estimators=100),
                ensemble.RandomForestRegressor(
                    n_estimators=100)
                # neural_network.MLPRegressor(hidden_layer_sizes=(100,))
                # NNetRegressor(**kw)
                # waiting for data preprocessing to get configs
                ]
    else:
        raise ValueError("{} not recognized".format(clf_type))
    return clfs
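
A minimal usage sketch for the helper above; the sklearn imports and the toy label vector are assumptions, not part of the original snippet.

from sklearn import linear_model, ensemble  # required by instanciate_estimators

# Hypothetical usage: build the binary-classification estimator list for a toy label vector.
y = [True, False, True, True, False, False, True, False]
clfs = instanciate_estimators('binary-clf', y=y)
for clf in clfs:
    print(type(clf).__name__)  # RidgeClassifierCV, GradientBoostingClassifier
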
Example #2
def get_algorithms():
    MLA_dict = {
        # Ensemble methods
        "ada": ensemble.AdaBoostClassifier(),
        "bc": ensemble.BaggingClassifier(),
        "etc": ensemble.ExtraTreesClassifier(),
        "gbc": ensemble.GradientBoostingClassifier(),
        "rfc": ensemble.RandomForestClassifier(),
        # Gaussian processes
        "gpc": gaussian_process.GaussianProcessClassifier(),
        # Linear models
        "lr": linear_model.LogisticRegressionCV(),
        "pac": linear_model.PassiveAggressiveClassifier(),
        "rcc": linear_model.RidgeClassifierCV(),
        "sgd": linear_model.SGDClassifier(),
        "per": linear_model.Perceptron(),
        # Naive Bayes
        "bnb": naive_bayes.BernoulliNB(),
        "gnb": naive_bayes.GaussianNB(),
        # Nearest neighbour
        "knn": neighbors.KNeighborsClassifier(),
        # SVM
        "svc": svm.SVC(probability=True),
        "nvc": svm.NuSVC(probability=True),
        "lvc": svm.LinearSVC(),
        # Trees
        "dtc": tree.DecisionTreeClassifier(),
        "ets": tree.ExtraTreeClassifier(),
        # Discriminant analysis
        "lda": discriminant_analysis.LinearDiscriminantAnalysis(),
        "qda": discriminant_analysis.QuadraticDiscriminantAnalysis(),
    }
    return MLA_dict
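
A hedged usage sketch: scoring every entry of the dictionary with cross-validation. The iris dataset and the 3-fold split are illustrative choices only.

from sklearn import (ensemble, gaussian_process, linear_model, naive_bayes,
                     neighbors, svm, tree, discriminant_analysis)  # needed by get_algorithms
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
for name, clf in get_algorithms().items():
    # mean 3-fold cross-validated accuracy for each candidate classifier
    print(name, round(cross_val_score(clf, X, y, cv=3).mean(), 3))
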
Example #3
def test(sdata, classifier=None, verbose=True, verboseverbose=False):
    digits = sdata

    X_digits = digits.data
    y_digits = digits.target

    n_samples = len(X_digits)

    # training split: first 85% of the samples (slice indices must be integers)
    split = int(0.85 * n_samples)
    X_train = X_digits[:split]
    y_train = y_digits[:split]

    # held-out test split: remaining 15% (features and truths/targets)
    X_test = X_digits[split:]
    y_test = y_digits[split:]

    if not classifier:
        classifier = linear_model.RidgeClassifierCV()

    classifier_fit = classifier.fit(X_train, y_train)

    pred = classifier_fit.predict(X_test)
    score = classifier_fit.score(X_test, y_test)

    if verboseverbose:
        # print the matrix of feature scores
        big_matrix = np.array([ np.hstack((X_test[i], y_test[i])) for i in range(len(X_test)) ])
        print(['Tr0Rhyt','Tr0TopL','Tr1Rhyt','Tr1TopL','Truth'])
        print(big_matrix)
    if verbose:
        print('TRUTH:', y_test)
        print('PREDN:', pred)
        print('Classifier score: %f' % score)

    return score, pred, y_test
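
A hypothetical driver for test(), assuming numpy (np) and sklearn.linear_model are imported as the snippet requires; the digits dataset stands in for the original four-feature data.

from sklearn.datasets import load_digits

score, pred, truth = test(load_digits(), verbose=False)
print('held-out accuracy: %.3f' % score)
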
Example #4
def test_sk_RidgeClassifierCV():
    print("Testing sklearn, RidgeClassifierCV...")
    mod = linear_model.RidgeClassifierCV()
    X, y = iris_data
    mod.fit(X, y)
    docs = {'name': "RidgeClassifierCV test"}
    fv = X[0, :]
    upload(mod, fv, docs)
def ridge_learn(scale_param,
                dim,
                depth,
                data,
                labels,
                CV=None,
                reg=[np.array((0.1, 1, 10))],
                cpu_number=1):
    """
    scale_param is a float, dim and depth are positive integers, data is a list
    of numpy arrays with each array having the shape of an esig stream2sig
    output for a stream of dimension dim truncated to level depth, labels is a
    list, same length as data list, of integers, CV determines the
    cross-validation splitting strategy of a sklearn GridSearchCV and can be any
    of the allowed options for this (default is None), reg is a list containing
    a numpy array of candidate regularisation strengths (its default is
    [numpy.array((0.1, 1.0, 10.0))]), and cpu_number is an integer (its default
    value is 1).
    
    The entries in the data list are scaled via the sig_scale_depth_ratio
    function, i.e. via sig_scale_depth_ratio(data, dim, depth,
    scalefactor=scale_param), and cpu_number number of cpus are used for
    parallelisation.
    
    Once scaled, a sklearn GridSearchCV is run with the model set to be
    RidgeClassifierCV(), the param_grid to be {'alphas':reg} and the
    cross-validation strategy to be determined by CV. 
    
    The selected best model is used to predict the labels for the appropriately
    scaled data, and the accuracy_score of the predicted labels compared to the
    actual labels is computed.
    
    The returned output is a tuple of the scale_param used, the model selected
    during the GridSearch, and the accuracy_score achieved by the selected
    model.
    """

    if depth == 0:
        return print(
            "Error: Depth 0 term of signature is always 1 and will not change under scaling"
        )
    if dim == 1:
        return print("Error: One-dimensionl signatures are trivial")
    else:
        ridge = linear_model.RidgeClassifierCV()
        tuned_params = {'alphas': reg}
        Q = Parallel(n_jobs=cpu_number)([
            delayed(sig_scale_depth_ratio)(data[k], dim, depth, scale_param)
            for k in range(len(data))
        ])
        model = GridSearchCV(estimator=ridge,
                             param_grid=tuned_params,
                             cv=CV,
                             n_jobs=cpu_number)
        model.fit(Q, labels)
        best_model = model.best_estimator_
        preds = best_model.predict(Q)
        acc = accuracy_score(preds, labels)
        return scale_param, best_model, acc
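
Setting the esig-specific scaling aside, the model-selection core of ridge_learn is a GridSearchCV over RidgeClassifierCV alphas. A minimal standalone sketch, with random features standing in for scaled signatures (synthetic data, not from the original):

import numpy as np
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(0)
Q = rng.normal(size=(60, 20))         # stand-in for scaled signature features
labels = rng.integers(0, 2, size=60)  # binary labels

model = GridSearchCV(estimator=linear_model.RidgeClassifierCV(),
                     param_grid={'alphas': [np.array((0.1, 1.0, 10.0))]},
                     cv=3)
model.fit(Q, labels)
best_model = model.best_estimator_
print(accuracy_score(labels, best_model.predict(Q)))  # in-sample accuracy, as in ridge_learn
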
Example #6
def train(volt_values, target_output, split_ratio=0.2):
    """
    function to train a simple linear regression to fit the snapshot of membrane potential (state matrix) to binary classification
    using a ridge regression with cross-validation for regularization parameter.
    :param volt_values: np.arr, shape: len.of stim. presentation x N_E.
    snapshots of membrane potential at each stimuli offset.
    :param target_output: np.arr, shape: num. of stimuli x len. of stim. presentation. @sym_seq in the main.py
    :param split_ratio: float, percentage of the data to be used for the test
    :return: list, saves the score for each module
    """
    scores = np.zeros(
        module_depth)  # array to save accuracy score for each module
    MSE = np.zeros(module_depth)
    for mod_i in range(module_depth):
        # split the data into training and test sets
        # x_train dim: #train_sample(#screenshots) x #features(#neurons)
        # y_train dim: #train_sample * #classes(stimuli)
        print(np.transpose(np.int_(target_output)).shape)
        x_train, x_test, y_train, y_test = train_test_split(
            np.transpose(volt_values[mod_i, :]),  # for each module
            np.transpose(np.int_(target_output)),
            test_size=split_ratio,
        )

        # linear ridge regression with cross-validation for regularization parameter
        # deltas = [0.01, 0.1, 1, 10, 100]  # regularization parameter
        deltas = [1e0, 1e3, 1e4, 2e4, 5e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10]
        fit_model = lm.RidgeClassifierCV(alphas=deltas,
                                         fit_intercept=True,
                                         store_cv_values=True).fit(X=x_train,
                                                                   y=y_train)

        # use the trained weight to predict the class of @y_test. Use WTA operation, without giving confidence level
        # predicted dim: 1 x #test sample. Each element consists indices of predicted class.
        predicted = fit_model.predict(
            x_test
        )  # dim: sample num x 1. Each entry indicates that n-th class is predicted.
        sum = 0  # count how many samples of y_test gets classified correctly
        for sample_index, class_predicted in enumerate(predicted):
            sum += y_test[sample_index,
                          class_predicted]  # entry of y_test are 0 and 1
        scores[mod_i] = (
            sum / y_test.shape[0]
        )  # normalize to 1 and save the accuracy for each module
        # print("weights: ", fit_model.coef_[:4, :10])
        # print("intercepts: ", fit_model.intercept_[:10])
        print("reg.params.: ", fit_model.alpha_)  # shit doesn't work

        # MSE
        deltaindex = np.where(np.asarray(deltas) == fit_model.alpha_)[
            0]  # pick the delta that was actually chosen
        MSE[mod_i] = np.mean(fit_model.cv_values_[:, :, deltaindex],
                             axis=(0, 1,
                                   2))  # average over all samples & feats.
    return scores, MSE
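
A self-contained sketch of the regularisation-selection step alone, on synthetic data. Note that recent scikit-learn releases rename store_cv_values to store_cv_results, so it is omitted here.

from sklearn import linear_model as lm
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

deltas = [1e0, 1e3, 1e4, 2e4, 5e4, 1e5, 1e6]
fit_model = lm.RidgeClassifierCV(alphas=deltas, fit_intercept=True).fit(X=x_train, y=y_train)
print("chosen regularisation parameter:", fit_model.alpha_)
print("test accuracy:", fit_model.score(x_test, y_test))
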
 def test_model_ridge_classifier_cv_bool(self):
     model, X = fit_classification_model(
         linear_model.RidgeClassifierCV(), 2, is_bool=True)
     model_onnx = convert_sklearn(
         model, "binary ridge classifier cv",
         [("input", BooleanTensorType([None, X.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         X, model, model_onnx,
         basename="SklearnRidgeClassifierCVBool")
Example #8
    def PrepareModel(self, savedmodel=None):
        if savedmodel is not None:
            self.clf = savedmodel
        else:
            if self.mlmethod == Constants.MACHINE_LEARNING_METHOD_REGRESSION:
                self.clf = linear_model.RidgeCV(alphas=self.alphas)
            elif self.mlmethod == Constants.MACHINE_LEARNING_METHOD_CLASSIFICATION:
                self.clf = linear_model.RidgeClassifierCV(alphas=self.alphas)

            self.clf.fit(self.traindata, self.trainlabel)
 def test_model_ridge_classifier_cv_multi_class(self):
     model, X = fit_classification_model(
         linear_model.RidgeClassifierCV(), 5)
     model_onnx = convert_sklearn(
         model, "multi-class ridge classifier cv",
         [("input", FloatTensorType([None, X.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         X, model, model_onnx,
         basename="SklearnRidgeClassifierCVMulti")
 def test_model_ridge_classifier_cv_multilabel(self):
     model, X_test = fit_multilabel_classification_model(
         linear_model.RidgeClassifierCV(random_state=42))
     model_onnx = convert_sklearn(
         model,
         "scikit-learn RidgeClassifierCV",
         [("input", FloatTensorType([None, X_test.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X_test, model, model_onnx,
         basename="SklearnRidgeClassifierCVMultiLabel")
Example #11
def cross_validated_estimators_tests():
    models = [
        linear_model.ElasticNetCV(),
        linear_model.LarsCV(),
        linear_model.LassoCV(),
        linear_model.LassoLarsCV(),
        linear_model.LogisticRegressionCV(),
        linear_model.OrthogonalMatchingPursuitCV(),
        linear_model.RidgeClassifierCV(),
        linear_model.RidgeCV()
    ]
    for model in models:
        cross_validated_estimators(model)
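
The helper cross_validated_estimators is defined elsewhere and not shown; purely as an assumption about its role, a stand-in that smoke-tests each CV estimator on toy data could look like this:

from sklearn.datasets import make_classification

def cross_validated_estimators(model):
    # Smoke test (assumed semantics): the CV estimator should fit and predict on a toy problem.
    X, y = make_classification(n_samples=50, n_features=5, random_state=0)
    model.fit(X, y)
    assert model.predict(X).shape == (50,)
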
Example #12
  def test_classification_bootstrap(self):
    ridge_class = linear_model.RidgeClassifier()
    ridge_class_cv = linear_model.RidgeClassifierCV()

    result = bootstrap.regression_bootstrap(
        data=self.data,
        target=self.class_target,
        regressor=ridge_class,
        regressor_cv=ridge_class_cv,
        verbose=False,
        bootstraps=5)

    self.assertIsInstance(result, pd.DataFrame)
    self.assertEqual(result.shape[1], self.data.shape[1]+1)
Example #13
 def test_model_ridge_classifier_cv_bool(self):
     model, X = fit_classification_model(
         linear_model.RidgeClassifierCV(), 2, is_bool=True)
     model_onnx = convert_sklearn(
         model,
         "binary ridge classifier cv",
         [("input", BooleanTensorType([None, X.shape[1]]))],
     )
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnRidgeClassifierCVBool",
         allow_failure="StrictVersion(onnxruntime.__version__)"
                       " <= StrictVersion('0.2.1')",
     )
 def test_model_ridge_classifier_cv_multi_class(self):
     model, X = fit_classification_model(linear_model.RidgeClassifierCV(),
                                         5)
     model_onnx = convert_sklearn(
         model,
         "multi-class ridge classifier cv",
         [("input", FloatTensorType([None, X.shape[1]]))],
     )
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnRidgeClassifierCVMulti",
         allow_failure="StrictVersion(onnxruntime.__version__)"
         " <= StrictVersion('0.2.1')",
     )
 def test_model_ridge_classifier_cv_multilabel(self):
     model, X_test = fit_multilabel_classification_model(
         linear_model.RidgeClassifierCV(random_state=42))
     model_onnx = convert_sklearn(
         model,
         "scikit-learn RidgeClassifierCV",
         [("input", FloatTensorType([None, X_test.shape[1]]))],
     )
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X_test,
         model,
         model_onnx,
         basename="SklearnRidgeClassifierCVMultiLabel",
         allow_failure="StrictVersion("
         "onnxruntime.__version__)<= StrictVersion('0.2.1')",
     )
Example #16
def sklearn_liner_model_regressions(xTrain, xTest, yTrain, yTest):
    modelForConsideration: DataFrame = pd.DataFrame()
    LinerModels = \
        [
            linear_model.ARDRegression(), linear_model.BayesianRidge(), linear_model.ElasticNet(),
            linear_model.ElasticNetCV(),
            linear_model.HuberRegressor(), linear_model.Lars(), linear_model.LarsCV(), linear_model.Lasso(),
            linear_model.LassoCV(), linear_model.LassoLars(), linear_model.LassoLarsCV(), linear_model.LassoLarsIC(),
            linear_model.LinearRegression(), linear_model.MultiTaskLasso(),
            linear_model.MultiTaskElasticNet(), linear_model.MultiTaskLassoCV(), linear_model.MultiTaskElasticNetCV(),
            linear_model.OrthogonalMatchingPursuit(),
            linear_model.OrthogonalMatchingPursuitCV(), linear_model.PassiveAggressiveClassifier(),
            linear_model.PassiveAggressiveRegressor(), linear_model.Perceptron(),
            linear_model.RANSACRegressor(), linear_model.Ridge(), linear_model.RidgeClassifier(),
            linear_model.RidgeClassifierCV(),
            linear_model.RidgeCV(), linear_model.SGDClassifier(), linear_model.SGDRegressor(),
            linear_model.TheilSenRegressor(),
            linear_model.enet_path(xTrain, yTrain),
            linear_model.lars_path(xTrain, yTrain), linear_model.lasso_path(xTrain, yTrain),
            # linear_model.LogisticRegression()
            # ,linear_model.LogisticRegressionCV(),linear_model.logistic_regression_path(xTrain, yTrain), linear_model.orthogonal_mp(xTrain, yTrain), linear_model.orthogonal_mp_gram(), linear_model.ridge_regression()
        ]
    for model in LinerModels:
        modelName: str = model.__class__.__name__
        try:
            # print(f"Preparing Model {modelName}")
            if modelName == "LogisticRegression":
                model = linear_model.LogisticRegression(random_state=0)
            model.fit(xTrain, yTrain)
            yTrainPredict = model.predict(xTrain)
            yTestPredict = model.predict(xTest)
            errorList = calculate_prediction_error(modelName, yTestPredict,
                                                   yTest, yTrainPredict,
                                                   yTrain)

            if errorList["Test Average Error"][0] < 30 and errorList[
                    "Train Average Error"][0] < 30:
                try:
                    # DataFrame.append was removed in pandas 2.0; use pd.concat
                    modelForConsideration = pd.concat(
                        [modelForConsideration, errorList])
                except (Exception) as e:
                    print(e)

        except (Exception, ArithmeticError) as e:
            print(f"Error occurred while preparing Model {modelName}")
    return modelForConsideration
Example #17
def ModelSelection(train_data, features, label):
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)
    x_train, x_test, y_train, y_test = train_test_split(train_data[features],
                                                        train_data[label],
                                                        test_size=0.2)
    row_index = 0
    MLA_predict = {}  # collect each algorithm's test-set predictions, keyed by model name
    for alg in MLA:

        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)
        row_index += 1

    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
def all_classifiers():
    # Model Data
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),  # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]
    return MLA
Example #19
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    #gaussian_process.GaussianProcessClassifier(),

    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.LogisticRegression(C=1000, random_state=0,
                                    solver='liblinear'),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #Naive Bayes
    naive_bayes.BernoulliNB(),
    #naive_bayes.GaussianNB(),

    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
Example #20
                                                    ).fit(self.tr_data, self.tr_label)
        return True
    
    #TODO: the other models have not been hyperparameter-tuned
    def train_with_LassoCV(self):
        if self.tr_data is None or self.tr_label is None:
            print ("lack of train data or train label")
            return False
        self.model = linear_model.LassoCV().fit(self.tr_data, self.tr_label)
        return True
    
    def train_with_RidgeCV(self):
        if self.tr_data is None or self.tr_label is None:
            print ("lack of train data or train label")
            return False
        self.model = linear_model.RidgeClassifierCV().fit(self.tr_data, self.tr_label)
        return True

    def train_with_ElasticNetCV(self):
        if self.tr_data is None or self.tr_label is None:
            print ("lack of train data or train label")
            return False
        self.model = linear_model.MultiTaskElasticNetCV().fit(self.tr_data, self.tr_label)
        return True
        
    def set_default_params(self):
        self.params = {
            'penalty': 'l2',
            'C': 1.0,
            'solver':'lbfgs'
        }
Example #21
# validate_score_clf(linear_model.PassiveAggressiveClassifier(max_iter=3000), 'linear_model.PassiveAggressiveClassifier-2000')
validate_score_clf(
    linear_model.PassiveAggressiveClassifier(max_iter=5000,
                                             early_stopping=True),
    'linear_model.PassiveAggressiveClassifier-earlyStopping')
validate_score_clf(linear_model.SGDClassifier(max_iter=800),
                   'linear_model.SGDClassifier800')
# validate_score_clf(linear_model.SGDClassifier(max_iter=1200), 'linear_model.SGDClassifier1200')
# validate_score_clf(linear_model.SGDClassifier(max_iter=3200), 'linear_model.SGDClassifier3200')
# # # validate_score_clf(linear_model.LarsCV(max_iter=1200), 'linear_model.LarsCV')
# # # validate_score_clf(linear_model.LassoLarsCV(max_iter=1200), 'linear_model.LassoLarsCV')
# # # validate_score_clf(linear_model.LassoCV(max_iter=1200), 'linear_model.LassoCV')
# # # validate_score_clf(linear_model.ElasticNetCV(max_iter=1200), 'linear_model.ElasticNetCV')
# validate_score_clf(linear_model.OrthogonalMatchingPursuitCV(), 'linear_model.OrthogonalMatchingPursuitCV')
# # validate_score_clf(ensemble.GradientBoostingClassifier(n_estimators=15, verbose=1), 'GradientBoostingClassifier')
validate_score_clf(linear_model.RidgeClassifierCV(class_weight='balanced'),
                   'linear_model.RidgeClassifierCV-balanced')
# validate_score_clf(linear_model.RidgeClassifierCV(), 'linear_model.RidgeClassifierCV')

validate_score_clf(ensemble.RandomForestClassifier(n_estimators=200),
                   'RandomForestClassifier')

# validate_score_clf(linear_model.LogisticRegressionCV(max_iter=550, Cs=np.geomspace(1e-1, 1e-7, 15), class_weight='balanced'), 'LogisticRegressionCV_maxiter550')

# validate_score_clf(linear_model.LogisticRegressionCV(max_iter=900, Cs=np.geomspace(1e-1, 1e-7, 15), class_weight='balanced'), 'LogisticRegressionCV_maxiter900')
validate_score_clf(
    linear_model.LogisticRegressionCV(max_iter=1000,
                                      Cs=np.geomspace(1e-1, 1e-7, 15)),
    'LogisticRegressionCV_imbalanced')

# clf = linear_model.LogisticRegressionCV(
Example #22
def model_comparison(x, y, show=True):
    """ Copy from : https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy/notebook
	Compare with various machine learning model
    """
    from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
    from xgboost import XGBClassifier

    MLA = [
        #Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        #Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        #GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        #Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        #Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        #SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        #Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        #Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]

    #split dataset in cross-validation with this splitter class: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
    #note: this is an alternative to train_test_split
    cv_split = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=.3,
                                            train_size=.7,
                                            random_state=0)  # run model

    #create table to compare MLA metrics
    MLA_columns = [
        'MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean',
        'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD', 'MLA Time'
    ]

    MLA_compare = pd.DataFrame(columns=MLA_columns)

    #create table to compare MLA predictions
    MLA_predict = y.copy()

    #index through MLA and save performance to table
    row_index = 0

    for alg in MLA:
        #set name and parameters
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

        #score model with cross validation:
        cv_results = model_selection.cross_validate(alg, x, y, cv=cv_split,
                                                    return_train_score=True)

        MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
        MLA_compare.loc[
            row_index,
            'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
        MLA_compare.loc[
            row_index,
            'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
        #if this is an unbiased random sample, then +/-3 standard deviations (std) from the mean should statistically capture 99.7% of the subsets
        MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results[
            'test_score'].std() * 3  #let's know the worst that can happen!

        #save MLA predictions - see section 6 for usage
        alg.fit(x, y)
        MLA_predict[MLA_name] = alg.predict(x)

        row_index += 1

    MLA_compare.sort_values(by=['MLA Test Accuracy Mean'],
                            ascending=False,
                            inplace=True)

    if show:
        plt.figure(figsize=(15, 6))
        sns.barplot(x='MLA Test Accuracy Mean',
                    y='MLA Name',
                    data=MLA_compare,
                    color='m')
        plt.show()

    return MLA_compare
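
A hypothetical driver for model_comparison, assuming pandas (pd), sklearn's model_selection, seaborn/matplotlib and xgboost are available as the function expects. The target is passed as a one-column DataFrame so the MLA_predict bookkeeping keeps working, and the iris dataset is purely illustrative.

import pandas as pd
from sklearn import model_selection
from sklearn.datasets import load_iris

data = load_iris()
x = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.DataFrame({'target': data.target})

MLA_compare = model_comparison(x, y, show=False)
print(MLA_compare[['MLA Name', 'MLA Test Accuracy Mean']].head())
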
Example #23
        regression(linear_model.OrthogonalMatchingPursuitCV()),
        regression(linear_model.Ridge(random_state=RANDOM_SEED)),
        regression(linear_model.RidgeCV()),
        regression(linear_model.BayesianRidge()),
        regression(linear_model.ARDRegression()),
        regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)),
        regression(
            linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)),

        # Logistic Regression
        classification(
            linear_model.LogisticRegression(random_state=RANDOM_SEED)),
        classification(
            linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
        classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
        classification(linear_model.RidgeClassifierCV()),
        classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.LogisticRegression(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
        classification_binary(linear_model.RidgeClassifierCV()),
        classification_binary(
            linear_model.SGDClassifier(random_state=RANDOM_SEED)),

        # Decision trees
        regression(tree.DecisionTreeRegressor(**TREE_PARAMS)),
        regression(tree.ExtraTreeRegressor(**TREE_PARAMS)),
        classification(tree.DecisionTreeClassifier(**TREE_PARAMS)),
Example #24
CLF = [
    #Ensemble Methods
    ('ada', ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier())),
    ('bc', ensemble.BaggingClassifier()),
    ('etc', ensemble.ExtraTreesClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('xgbc', xgb.XGBClassifier(max_depth=3)),  # xgb.XGBClassifier()),    #
    ('rfc', ensemble.RandomForestClassifier(n_estimators=50)),

    #Gaussian Processes
    ('gpc', gaussian_process.GaussianProcessClassifier()),

    #GLM - remove linear models, since this is a classifier algorithm
    ('lr', linear_model.LogisticRegressionCV()),
    ('pac', linear_model.PassiveAggressiveClassifier()),
    ('rc', linear_model.RidgeClassifierCV()),
    ('sgd', linear_model.SGDClassifier()),
    ('pct', linear_model.Perceptron()),

    #Naive Bayes
    ('gnb', naive_bayes.GaussianNB()),

    #Nearest Neighbor
    ('knn', neighbors.KNeighborsClassifier(n_neighbors=3)),

    #SVM
    ('svc', svm.SVC(probability=True)),
    ('lsvc', svm.LinearSVC()),

    #Trees
    ('dtc', tree.DecisionTreeClassifier()),
Example #25
def compare_algorithm(data, target):
    x_train, x_cross, y_train, y_cross = train_test_split(data, target)
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=0.001),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(max_iter=1000, tol=0.001),
        linear_model.Perceptron(max_iter=1000, tol=0.001),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        xgb.XGBClassifier()
    ]
    MLA_columns = []
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    row_index = 0
    for alg in MLA:
        predicted = alg.fit(x_train, y_train).predict(x_cross)
        fp, tp, th = roc_curve(y_cross, predicted)
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Train Accuracy'] = round(
            alg.score(x_train, y_train), 4)
        MLA_compare.loc[row_index, 'MLA Test Accuracy'] = round(
            alg.score(x_cross, y_cross), 4)
        MLA_compare.loc[row_index, 'MLA Precision'] = precision_score(
            y_cross, predicted)
        MLA_compare.loc[row_index,
                        'MLA Recall'] = recall_score(y_cross, predicted)
        MLA_compare.loc[row_index, 'MLA AUC'] = auc(fp, tp)
        row_index = row_index + 1

    MLA_compare.sort_values(by=['MLA Test Accuracy'],
                            ascending=False,
                            inplace=True)
    print(MLA_compare)
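
A hypothetical call for compare_algorithm, assuming pandas, xgboost (xgb), train_test_split and the sklearn.metrics functions are imported at module level as the snippet expects. A binary dataset is required because roc_curve and precision_score are used without averaging; breast cancer is only an illustration.

from sklearn.datasets import load_breast_cancer

data, target = load_breast_cancer(return_X_y=True)
compare_algorithm(data, target)
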
Example #26
    .format(movement_required))
print('Decisions: {}'.format(list(decisions)))

###### Tree
clf_DTC = tree.DecisionTreeClassifier()
clf_DTC = clf_DTC.fit(x_train, y_scikit_train)

clf_ETC = tree.ExtraTreeClassifier()
clf_ETC = clf_ETC.fit(x_train, y_scikit_train)

###### Neighbors
clf_KNC = neighbors.KNeighborsClassifier()
clf_KNC = clf_KNC.fit(x_train, y_scikit_train)

###### linear Model
clf_RCCV = linear_model.RidgeClassifierCV()
clf_RCCV = clf_RCCV.fit(x_train, y_scikit_train)

###### Ensemble
clf_RFC = ensemble.RandomForestClassifier()
clf_RFC = clf_RFC.fit(x_train, y_scikit_train)

clf_ETC_ens = ensemble.ExtraTreesClassifier()
clf_ETC_ens = clf_ETC_ens.fit(x_train, y_scikit_train)

clf_ABC = ensemble.AdaBoostClassifier()
clf_ABC = clf_ABC.fit(x_train, y_scikit_train)

clf_GBC = ensemble.GradientBoostingClassifier()
clf_GBC = clf_GBC.fit(x_train, y_scikit_train)
Example #27
def get_regression_estimators(r, regression_models):
    if r == 'ARDRegression':
        regression_models[r] = linear_model.ARDRegression()
    elif r == 'BayesianRidge':
        regression_models[r] = linear_model.BayesianRidge()
    elif r == 'ElasticNet':
        regression_models[r] = linear_model.ElasticNet()
    elif r == 'ElasticNetCV':
        regression_models[r] = linear_model.ElasticNetCV()
    elif r == 'HuberRegressor':
        regression_models[r] = linear_model.HuberRegressor()
    elif r == 'Lars':
        regression_models[r] = linear_model.Lars()
    elif r == 'LarsCV':
        regression_models[r] = linear_model.LarsCV()
    elif r == 'Lasso':
        regression_models[r] = linear_model.Lasso()
    elif r == 'LassoCV':
        regression_models[r] = linear_model.LassoCV()
    elif r == 'LassoLars':
        regression_models[r] = linear_model.LassoLars()
    elif r == 'LassoLarsCV':
        regression_models[r] = linear_model.LassoLarsCV()
    elif r == 'LassoLarsIC':
        regression_models[r] = linear_model.LassoLarsIC()
    elif r == 'LinearRegression':
        regression_models[r] = linear_model.LinearRegression()
    elif r == 'LogisticRegression':
        regression_models[r] = linear_model.LogisticRegression()
    elif r == 'LogisticRegressionCV':
        regression_models[r] = linear_model.LogisticRegressionCV()
    elif r == 'MultiTaskElasticNet':
        regression_models[r] = linear_model.MultiTaskElasticNet()
    elif r == 'MultiTaskElasticNetCV':
        regression_models[r] = linear_model.MultiTaskElasticNetCV()
    elif r == 'MultiTaskLasso':
        regression_models[r] = linear_model.MultiTaskLasso()
    elif r == 'MultiTaskLassoCV':
        regression_models[r] = linear_model.MultiTaskLassoCV()
    elif r == 'OrthogonalMatchingPursuit':
        regression_models[r] = linear_model.OrthogonalMatchingPursuit()
    elif r == 'OrthogonalMatchingPursuitCV':
        regression_models[r] = linear_model.OrthogonalMatchingPursuitCV()
    elif r == 'PassiveAggressiveClassifier':
        regression_models[r] = linear_model.PassiveAggressiveClassifier()
    elif r == 'PassiveAggressiveRegressor':
        regression_models[r] = linear_model.PassiveAggressiveRegressor()
    elif r == 'Perceptron':
        regression_models[r] = linear_model.Perceptron()
    elif r == 'RANSACRegressor':
        regression_models[r] = linear_model.RANSACRegressor()
    elif r == 'Ridge':
        regression_models[r] = linear_model.Ridge()
    elif r == 'RidgeClassifier':
        regression_models[r] = linear_model.RidgeClassifier()
    elif r == 'RidgeClassifierCV':
        regression_models[r] = linear_model.RidgeClassifierCV()
    elif r == 'RidgeCV':
        regression_models[r] = linear_model.RidgeCV()
    elif r == 'SGDClassifier':
        regression_models[r] = linear_model.SGDClassifier()
    elif r == 'SGDRegressor':
        regression_models[r] = linear_model.SGDRegressor()
    elif r == 'TheilSenRegressor':
        regression_models[r] = linear_model.TheilSenRegressor()
    else:
        print(
            r +
            " is an unsupported regression type. Check if you have misspelled the name."
        )
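
The same dispatch could also be written as a lookup rather than an elif chain. A loose sketch of that design choice; getattr-based resolution is an assumption, not the original author's code, and it accepts any attribute of linear_model rather than the curated list above.

def get_regression_estimators_by_name(r, regression_models):
    # Resolve the estimator class by name on sklearn.linear_model and instantiate it.
    estimator_cls = getattr(linear_model, r, None)
    if estimator_cls is None:
        print(r + " is an unsupported regression type."
                  " Check if you have misspelled the name.")
        return
    regression_models[r] = estimator_cls()
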
Example #28
    predicts = model.predict(test_features)
    print('Test result:')
    print(classification_report(test_labels, predicts, labels=[1]))


if __name__ == "__main__":
    worddir = load_worddir(WORDPATH)
    features, labels = get_dataset()
    print("Numbers of train data: %d" % (len(features)))
    cnt = 0
    for label in labels:
        cnt += (label == 1)
    print("Numbers of positive data: %d" % (cnt))

    test_features, test_labels = get_dataset(train=False)
    print("Numbers of test data: %d" % (len(test_features)))

    models = [
        linear_model.RidgeClassifierCV(normalize=True),
        linear_model.LogisticRegressionCV(n_jobs=-1),
        tree.DecisionTreeClassifier(criterion='entropy'),
        ensemble.RandomForestClassifier(n_jobs=-1),
        svm.SVC(kernel='rbf')
    ]

    for model in models:
        print("start", model)
        cross_validation(model, features, labels)
        model.fit(features, labels)
        test_score(model, test_features, test_labels)
        print("end", model)
Example #29
f2 = open('test_set', 'r')
J = f2.readlines()
for j in range(0, len(J)):
    J[j] = J[j].rstrip('\n')
f2.close()

features_train = []
labels_train = []

for filename in I:
    file = open(directory + filename, 'r')
    examples = get_features_labels(get_examples(file))
    for ex in examples:
        features_train.append(ex[0])
        labels_train.append(ex[1])

classifiers = [
    linear_model.RidgeClassifierCV(
        normalize=True),  # Ridge classifier (linear model)
    linear_model.LogisticRegressionCV(),  # Logistic Regression
    tree.DecisionTreeClassifier(criterion='entropy'),  # Decision Tree
    ensemble.RandomForestClassifier(),  # Random Forest
    svm.SVC(kernel='rbf')  # SVM
]

for clf in classifiers:
    clf.fit(features_train, labels_train)
    cross_validation(clf, features_train, labels_train)
    test_score(clf, J)
Example #30
def main():
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    combine = [train_df, test_df]

    for df in combine:
        df.info()
        standardize_data(df)
        create_columns(df)
        create_bins(df)
        encode_data(df)
    # Define target (Y variable)
    target = ["Survived"]

    # Define features (X variables)
    train_df_x = [
        "Pclass",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked",
        "FamilySize",
        "IsAlone",
        "Title",
    ]

    # Define numerical features (binned and encoded)
    train_df_x_bin = [
        "Pclass",
        "Sex_Code",
        "AgeBin_Code",
        "FareBin_Code",
        "Embarked_Code",
        "FamilySize",
        "IsAlone",
        "Title_Code",
    ]

    # Analyze feature correlation with target
    for x in train_df_x:
        if train_df[x].dtype != "float64":
            print(train_df[[x, target[0]]].groupby(x).mean())

    # Graph individual features by survival
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.histplot(x="Fare",
                 data=train_df,
                 hue="Survived",
                 multiple="stack",
                 ax=axis[0])
    sns.histplot(x="Age",
                 data=train_df,
                 hue="Survived",
                 multiple="stack",
                 ax=axis[1])
    sns.histplot(x="FamilySize",
                 data=train_df,
                 hue="Survived",
                 multiple="stack",
                 ax=axis[2])

    fig, axis = plt.subplots(2, 3, figsize=(16, 12))
    sns.barplot(x="Pclass", y="Survived", data=train_df, ax=axis[0, 0])
    sns.barplot(x="Sex", y="Survived", data=train_df, ax=axis[0, 1])
    sns.barplot(x="Embarked", y="Survived", data=train_df, ax=axis[0, 2])
    sns.barplot(x="IsAlone", y="Survived", data=train_df, ax=axis[1, 0])
    sns.barplot(x="Title", y="Survived", data=train_df, ax=axis[1, 1])

    # Compare class with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="Sex", ax=axis[0])
    sns.barplot(x="Pclass",
                y="Survived",
                data=train_df,
                hue="IsAlone",
                ax=axis[1])
    sns.barplot(x="Pclass",
                y="Survived",
                data=train_df,
                hue="Embarked",
                ax=axis[2])

    # Compare Sex with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="Pclass", ax=axis[0])
    sns.barplot(x="Sex",
                y="Survived",
                data=train_df,
                hue="IsAlone",
                ax=axis[1])
    sns.barplot(x="Sex",
                y="Survived",
                data=train_df,
                hue="Embarked",
                ax=axis[2])

    # Correlation heatmap of dataset
    fig, ax = plt.subplots(figsize=(14, 12))
    fig = sns.heatmap(
        train_df.corr(numeric_only=True),
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        annot=True,
        ax=ax,
    )

    # Machine Learning Algorithm (MLA) selection and initialization
    mla = [
        linear_model.LogisticRegressionCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(dual=False),
        neighbors.KNeighborsClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        naive_bayes.GaussianNB(),
        naive_bayes.BernoulliNB(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
    ]

    mla_compare = test_models(mla, train_df, train_df_x_bin, target)

    best_estimator = optimize_params(mla, mla_compare, train_df,
                                     train_df_x_bin, target)

    generate_submission_csv(test_df, train_df_x_bin, best_estimator)