def instanciate_estimators(clf_type, y=None, **kw):
    """Build the list of unfitted estimators to benchmark for a task type.

    Parameters
    ----------
    clf_type : str
        One of ``'binary-clf'``, ``'multiclass-clf'`` or ``'regression'``.
    y : sequence, optional
        Target labels. Only read by the two classification modes, which
        print a class-balance diagnostic before building the estimators.
    **kw
        Accepted for the currently disabled neural-network estimators;
        not used by any active estimator.

    Returns
    -------
    list
        Freshly constructed (unfitted) scikit-learn estimators.

    Raises
    ------
    ValueError
        If ``clf_type`` is not one of the recognised task types.
    """
    from collections import Counter

    if clf_type == 'binary-clf':
        labels = list(y)
        n_labels = len(labels)
        print(('Fraction by class: True: %0.2f; False: %0.2f'
               % (labels.count(True) / n_labels,
                  labels.count(False) / n_labels)))
        cw = 'balanced'
        # Disabled candidates: LogisticRegressionCV, RandomForestClassifier,
        # MLPClassifier, NNetBinaryClassifier(**kw) (the latter is waiting
        # for data preprocessing to get configs).
        clfs = [
            linear_model.RidgeClassifierCV(class_weight=cw, cv=3),
            ensemble.GradientBoostingClassifier(n_estimators=100),
        ]
    elif clf_type == 'multiclass-clf':
        labels = list(y)
        # Single pass over the labels instead of one list.count per class.
        most_frequent = max(Counter(labels).values())
        print('fraction of the most frequent class:',
              most_frequent / len(labels))
        # Disabled candidates: LogisticRegressionCV, RandomForestClassifier,
        # MLPClassifier, NNetMultiClassifier(**kw).
        clfs = [
            linear_model.RidgeClassifierCV(cv=3),
            ensemble.GradientBoostingClassifier(n_estimators=100),
        ]
    elif clf_type == 'regression':
        # Disabled candidates: MLPRegressor, NNetRegressor(**kw) (waiting
        # for data preprocessing to get configs).
        clfs = [
            linear_model.RidgeCV(cv=3),
            ensemble.GradientBoostingRegressor(n_estimators=100),
            ensemble.RandomForestRegressor(n_estimators=100),
        ]
    else:
        raise ValueError("{} not recognized".format(clf_type))
    return clfs
def get_algorithms():
    """Return a dict mapping short keys to fresh, unfitted classifiers."""
    catalogue = [
        # Ensemble methods
        ("ada", ensemble.AdaBoostClassifier()),
        ("bc", ensemble.BaggingClassifier()),
        ("etc", ensemble.ExtraTreesClassifier()),
        ("gbc", ensemble.GradientBoostingClassifier()),
        ("rfc", ensemble.RandomForestClassifier()),
        # Gaussian processes
        ("gpc", gaussian_process.GaussianProcessClassifier()),
        # Linear models
        ("lr", linear_model.LogisticRegressionCV()),
        ("pac", linear_model.PassiveAggressiveClassifier()),
        ("rcc", linear_model.RidgeClassifierCV()),
        ("sgd", linear_model.SGDClassifier()),
        ("per", linear_model.Perceptron()),
        # Naive Bayes
        ("bnb", naive_bayes.BernoulliNB()),
        ("gnb", naive_bayes.GaussianNB()),
        # Nearest neighbour
        ("knn", neighbors.KNeighborsClassifier()),
        # SVM
        ("svc", svm.SVC(probability=True)),
        ("nvc", svm.NuSVC(probability=True)),
        ("lvc", svm.LinearSVC()),
        # Trees
        ("dtc", tree.DecisionTreeClassifier()),
        ("ets", tree.ExtraTreeClassifier()),
        # Discriminant analysis
        ("lda", discriminant_analysis.LinearDiscriminantAnalysis()),
        ("qda", discriminant_analysis.QuadraticDiscriminantAnalysis()),
    ]
    return dict(catalogue)
def test(sdata, classifier=None, verbose=True, verboseverbose=False): digits = sdata X_digits = digits.data y_digits = digits.target n_samples = len(X_digits) # data X_train = X_digits[:.85 * n_samples] y_train = y_digits[:.85 * n_samples] # truths/target X_test = X_digits[.85 * n_samples:] y_test = y_digits[.85 * n_samples:] if not classifier: classifier = linear_model.RidgeClassifierCV() classifier_fit = classifier.fit(X_train, y_train) pred = classifier_fit.predict(X_test) score = classifier_fit.score(X_test, y_test) if verboseverbose: # print the matrix of feature scores big_matrix = np.array([ np.hstack((X_test[i], y_test[i])) for i in range(len(X_test)) ]) print(['Tr0Rhyt','Tr0TopL','Tr1Rhyt','Tr1TopL','Truth']) print(big_matrix) if verbose: print('TRUTH:', y_test) print('PREDN:', pred) print('Classifier score: %f' % score) return score, pred, y_test
def test_sk_RidgeClassifierCV():
    """Fit a RidgeClassifierCV on ``iris_data`` and upload it with one sample."""
    print("Testing sklearn, RidgeClassifierCV...")
    features, labels = iris_data
    classifier = linear_model.RidgeClassifierCV()
    classifier.fit(features, labels)
    metadata = {'name': "RidgeClassifierCV test"}
    # The first row of the feature matrix serves as the example vector.
    first_sample = features[0, :]
    upload(classifier, first_sample, metadata)
def ridge_learn(scale_param,
                dim,
                depth,
                data,
                labels,
                CV=None,
                reg=[np.array((0.1, 1, 10))],
                cpu_number=1):
    """Scale signatures, grid-search a RidgeClassifierCV and report accuracy.

    Every entry of ``data`` is rescaled with
    ``sig_scale_depth_ratio(entry, dim, depth, scale_param)``, using
    ``cpu_number`` parallel jobs. A ``GridSearchCV`` over
    ``RidgeClassifierCV()`` with ``param_grid={'alphas': reg}`` and
    cross-validation strategy ``CV`` is then fitted on the scaled data, and
    the best estimator is scored on that same scaled data.

    Parameters
    ----------
    scale_param : float
        Scale factor handed to ``sig_scale_depth_ratio``.
    dim, depth : int
        Stream dimension and truncation level of the signatures.
    data : list
        Signatures to scale, one per sample.
    labels : list
        Class labels, same length as ``data``.
    CV : optional
        Passed through as the ``cv`` argument of ``GridSearchCV``.
    reg : list
        Candidate values for the ``alphas`` parameter; the default is a
        single candidate, the array ``(0.1, 1, 10)``.
    cpu_number : int
        Number of parallel jobs for both the scaling and the grid search.

    Returns
    -------
    tuple or None
        ``(scale_param, best_model, acc)``. When ``depth == 0`` or
        ``dim == 1`` an error message is printed and ``None`` is returned.
    """
    if depth == 0:
        print(
            "Error: Depth 0 term of signature is always 1 and will not change under scaling"
        )
        return None
    if dim == 1:
        print("Error: One-dimensionl signatures are trivial")
        return None

    scaled = Parallel(n_jobs=cpu_number)(
        [delayed(sig_scale_depth_ratio)(data[idx], dim, depth, scale_param)
         for idx in range(len(data))])

    search = GridSearchCV(estimator=linear_model.RidgeClassifierCV(),
                          param_grid={'alphas': reg},
                          cv=CV,
                          n_jobs=cpu_number)
    search.fit(scaled, labels)

    best_model = search.best_estimator_
    # Accuracy is measured on the same scaled data used for the search.
    acc = accuracy_score(best_model.predict(scaled), labels)
    return scale_param, best_model, acc
def train(volt_values, target_output, split_ratio=0.2):
    """
    Fit one cross-validated ridge classifier per module on membrane-potential
    snapshots and report its accuracy and cross-validation error.

    The number of modules is read from the module-level ``module_depth``.

    :param volt_values: np.arr, indexed by module on the first axis; each
                        module's slice is transposed before the split so that
                        rows are samples (snapshots at stimulus offset).
    :param target_output: np.arr, shape: num. of stimuli x len. of stim.
                          presentation; transposed and cast to int to serve as
                          the 0/1 target matrix. @sym_seq in the main.py
    :param split_ratio: float, fraction of the data held out for the test
    :return: tuple ``(scores, MSE)`` of two arrays of length ``module_depth``:
             the test accuracy and the mean cross-validation value at the
             selected regularization parameter, for each module
    """
    scores = np.zeros(module_depth)  # accuracy score for each module
    MSE = np.zeros(module_depth)

    for mod_i in range(module_depth):
        # split the data into training and test sets
        # x_train dim: #train_sample(#screenshots) x #features(#neurons)
        # y_train dim: #train_sample * #classes(stimuli)
        print(np.transpose(np.int_(target_output)).shape)
        x_train, x_test, y_train, y_test = train_test_split(
            np.transpose(volt_values[mod_i, :]),  # for each module
            np.transpose(np.int_(target_output)),
            test_size=split_ratio,
        )

        # ridge classifier with cross-validation over the regularization
        # parameter
        # deltas = [0.01, 0.1, 1, 10, 100]
        deltas = [1e0, 1e3, 1e4, 2e4, 5e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10]
        # NOTE(review): newer scikit-learn releases rename store_cv_values /
        # cv_values_ to store_cv_results / cv_results_ - confirm against the
        # pinned version.
        fit_model = lm.RidgeClassifierCV(alphas=deltas,
                                         fit_intercept=True,
                                         store_cv_values=True).fit(X=x_train,
                                                                   y=y_train)

        # predict the class of each test sample (winner-take-all, no
        # confidence level)
        predicted = fit_model.predict(x_test)

        # count how many samples of y_test get classified correctly;
        # entries of y_test are 0 and 1
        n_correct = 0
        for sample_index, class_predicted in enumerate(predicted):
            n_correct += y_test[sample_index, class_predicted]
        # normalize to 1 and save the accuracy for each module
        scores[mod_i] = n_correct / y_test.shape[0]

        # print("weights: ", fit_model.coef_[:4, :10])
        # print("intercepts: ", fit_model.intercept_[:10])
        print("reg.params.: ", fit_model.alpha_)

        # MSE: pick the delta that was actually chosen. The comparison must
        # be element-wise, so the candidate list is converted to an array
        # first (list == float is a single False, i.e. an empty index).
        deltaindex = np.where(np.asarray(deltas) == fit_model.alpha_)[0]
        # average over all samples & targets
        MSE[mod_i] = np.mean(fit_model.cv_values_[:, :, deltaindex],
                             axis=(0, 1, 2))

    return scores, MSE
def test_model_ridge_classifier_cv_bool(self):
    """Convert a binary RidgeClassifierCV fitted on boolean input to ONNX."""
    estimator = linear_model.RidgeClassifierCV()
    model, X = fit_classification_model(estimator, 2, is_bool=True)
    # Single boolean input whose width matches the feature matrix.
    input_types = [("input", BooleanTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(model,
                                 "binary ridge classifier cv",
                                 input_types,
                                 target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(X,
                        model,
                        model_onnx,
                        basename="SklearnRidgeClassifierCVBool")
def PrepareModel(self, savedmodel = None):
    """Install a previously saved model, or build and fit a new ridge model.

    Parameters
    ----------
    savedmodel : estimator, optional
        When given, it is stored as ``self.clf`` unchanged. Otherwise a
        ``RidgeCV`` (regression) or ``RidgeClassifierCV`` (classification)
        is created from ``self.alphas`` according to ``self.mlmethod`` and
        fitted on ``self.traindata`` / ``self.trainlabel``.
    """
    # Identity check: an estimator may overload comparison operators.
    if savedmodel is not None:
        self.clf = savedmodel
    else:
        if self.mlmethod == Constants.MACHINE_LEARNING_METHOD_REGRESSION:
            self.clf = linear_model.RidgeCV(alphas=self.alphas)
        elif self.mlmethod == Constants.MACHINE_LEARNING_METHOD_CLASSIFICATION:
            self.clf = linear_model.RidgeClassifierCV(alphas=self.alphas)
        # NOTE(review): for any other mlmethod no new model is created here,
        # so this fits whatever self.clf already holds - confirm intended.
        self.clf.fit(self.traindata, self.trainlabel)
def test_model_ridge_classifier_cv_multi_class(self):
    """Convert a five-class RidgeClassifierCV to ONNX and dump the result."""
    estimator = linear_model.RidgeClassifierCV()
    model, X = fit_classification_model(estimator, 5)
    # Single float input whose width matches the feature matrix.
    input_types = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(model,
                                 "multi-class ridge classifier cv",
                                 input_types,
                                 target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(X,
                        model,
                        model_onnx,
                        basename="SklearnRidgeClassifierCVMulti")
def test_model_ridge_classifier_cv_multilabel(self):
    """Convert a multilabel RidgeClassifierCV to ONNX and dump the result."""
    # RidgeClassifierCV has no random_state parameter (unlike
    # RidgeClassifier); passing one raises TypeError at construction.
    model, X_test = fit_multilabel_classification_model(
        linear_model.RidgeClassifierCV())
    model_onnx = convert_sklearn(
        model,
        "scikit-learn RidgeClassifierCV",
        [("input", FloatTensorType([None, X_test.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X_test,
        model,
        model_onnx,
        basename="SklearnRidgeClassifierCVMultiLabel")
def cross_validated_estimators_tests():
    """Run ``cross_validated_estimators`` on each built-in CV linear model."""
    estimator_classes = (
        linear_model.ElasticNetCV,
        linear_model.LarsCV,
        linear_model.LassoCV,
        linear_model.LassoLarsCV,
        linear_model.LogisticRegressionCV,
        linear_model.OrthogonalMatchingPursuitCV,
        linear_model.RidgeClassifierCV,
        linear_model.RidgeCV,
    )
    # Instantiate every estimator up front, then exercise them in order.
    models = [make_estimator() for make_estimator in estimator_classes]
    for candidate in models:
        cross_validated_estimators(candidate)
def test_classification_bootstrap(self):
    """Bootstrap with ridge classifiers and check the result's type and width."""
    plain_ridge = linear_model.RidgeClassifier()
    cv_ridge = linear_model.RidgeClassifierCV()
    outcome = bootstrap.regression_bootstrap(data=self.data,
                                             target=self.class_target,
                                             regressor=plain_ridge,
                                             regressor_cv=cv_ridge,
                                             verbose=False,
                                             bootstraps=5)
    self.assertIsInstance(outcome, pd.DataFrame)
    # One more column than the input data has.
    n_expected_columns = self.data.shape[1] + 1
    self.assertEqual(outcome.shape[1], n_expected_columns)
def test_model_ridge_classifier_cv_bool(self):
    """Convert a binary RidgeClassifierCV fitted on boolean input to ONNX."""
    estimator = linear_model.RidgeClassifierCV()
    model, X = fit_classification_model(estimator, 2, is_bool=True)
    # Single boolean input whose width matches the feature matrix.
    input_types = [("input", BooleanTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(model, "binary ridge classifier cv",
                                 input_types)
    self.assertIsNotNone(model_onnx)
    # Condition string handed to the dump helper as allow_failure.
    failure_condition = ("StrictVersion(onnxruntime.__version__)"
                         " <= StrictVersion('0.2.1')")
    dump_data_and_model(X,
                        model,
                        model_onnx,
                        basename="SklearnRidgeClassifierCVBool",
                        allow_failure=failure_condition)
def test_model_ridge_classifier_cv_multi_class(self):
    """Convert a five-class RidgeClassifierCV to ONNX and dump the result."""
    estimator = linear_model.RidgeClassifierCV()
    model, X = fit_classification_model(estimator, 5)
    # Single float input whose width matches the feature matrix.
    input_types = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(model, "multi-class ridge classifier cv",
                                 input_types)
    self.assertIsNotNone(model_onnx)
    # Condition string handed to the dump helper as allow_failure.
    failure_condition = ("StrictVersion(onnxruntime.__version__)"
                         " <= StrictVersion('0.2.1')")
    dump_data_and_model(X,
                        model,
                        model_onnx,
                        basename="SklearnRidgeClassifierCVMulti",
                        allow_failure=failure_condition)
def test_model_ridge_classifier_cv_multilabel(self):
    """Convert a multilabel RidgeClassifierCV to ONNX and dump the result."""
    # RidgeClassifierCV has no random_state parameter (unlike
    # RidgeClassifier); passing one raises TypeError at construction.
    model, X_test = fit_multilabel_classification_model(
        linear_model.RidgeClassifierCV())
    model_onnx = convert_sklearn(
        model,
        "scikit-learn RidgeClassifierCV",
        [("input", FloatTensorType([None, X_test.shape[1]]))],
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X_test,
        model,
        model_onnx,
        basename="SklearnRidgeClassifierCVMultiLabel",
        allow_failure="StrictVersion("
        "onnxruntime.__version__)<= StrictVersion('0.2.1')",
    )
def sklearn_liner_model_regressions(xTrain, xTest, yTrain, yTest):
    """Fit every ``sklearn.linear_model`` estimator and keep the accurate ones.

    Each estimator is fitted on the training data; its train and test
    predictions are passed to ``calculate_prediction_error``. Results whose
    "Test Average Error" and "Train Average Error" are both below 30 are
    collected.

    Parameters
    ----------
    xTrain, xTest : array-like
        Training and test features.
    yTrain, yTest : array-like
        Training and test targets.

    Returns
    -------
    DataFrame
        The error reports of the estimators that passed the threshold;
        empty when none did.
    """
    modelForConsideration: DataFrame = pd.DataFrame()
    # Only estimator objects belong here. The path/solver functions
    # (enet_path, lars_path, lasso_path, orthogonal_mp, orthogonal_mp_gram,
    # ridge_regression) are not estimators, and calling the last two without
    # arguments raised TypeError while this list was being built.
    LinerModels = [
        linear_model.ARDRegression(),
        linear_model.BayesianRidge(),
        linear_model.ElasticNet(),
        linear_model.ElasticNetCV(),
        linear_model.HuberRegressor(),
        linear_model.Lars(),
        linear_model.LarsCV(),
        linear_model.Lasso(),
        linear_model.LassoCV(),
        linear_model.LassoLars(),
        linear_model.LassoLarsCV(),
        linear_model.LassoLarsIC(),
        linear_model.LinearRegression(),
        linear_model.MultiTaskLasso(),
        linear_model.MultiTaskElasticNet(),
        linear_model.MultiTaskLassoCV(),
        linear_model.MultiTaskElasticNetCV(),
        linear_model.OrthogonalMatchingPursuit(),
        linear_model.OrthogonalMatchingPursuitCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.PassiveAggressiveRegressor(),
        linear_model.Perceptron(),
        linear_model.RANSACRegressor(),
        linear_model.Ridge(),
        linear_model.RidgeClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.RidgeCV(),
        linear_model.SGDClassifier(),
        linear_model.SGDRegressor(),
        linear_model.TheilSenRegressor(),
        # linear_model.LogisticRegression()
        # ,linear_model.LogisticRegressionCV()
    ]

    for model in LinerModels:
        modelName: str = model.__class__.__name__
        try:
            # print(f"Preparing Model {modelName}")
            if modelName == "LogisticRegression":
                model = linear_model.LogisticRegression(random_state=0)
            model.fit(xTrain, yTrain)
            yTrainPredict = model.predict(xTrain)
            yTestPredict = model.predict(xTest)
            errorList = calculate_prediction_error(modelName, yTestPredict,
                                                   yTest, yTrainPredict,
                                                   yTrain)

            if (errorList["Test Average Error"][0] < 30
                    and errorList["Train Average Error"][0] < 30):
                try:
                    # DataFrame.append no longer exists in pandas 2.x.
                    # NOTE(review): assumes errorList is a DataFrame (or
                    # converts cleanly to one) - confirm against
                    # calculate_prediction_error.
                    modelForConsideration = pd.concat(
                        [modelForConsideration, pd.DataFrame(errorList)])
                except Exception as e:
                    print(e)
        except Exception as e:
            # Best effort: report the failing model and move on to the next.
            print(f"Error occurred while preparing Model {modelName}")
            print(e)
    return modelForConsideration
def ModelSelection(test_data, features, label):
    """Fit a battery of classifiers on one train/test split and rank them.

    Parameters
    ----------
    test_data
        Not read by this function.
        NOTE(review): the split below uses the module-level ``train_data``
        instead - confirm which frame was intended.
    features
        Column selector for the predictors in ``train_data``.
    label
        Column selector for the target in ``train_data``.

    Returns
    -------
    tuple
        ``(MLA_compare, x_train, x_test, y_train, y_test)`` where
        ``MLA_compare`` lists each algorithm's name, parameters and test
        score, sorted by score in descending order.
    """
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    x_train, x_test, y_train, y_test = train_test_split(train_data[features],
                                                        train_data[label],
                                                        test_size=0.2)

    # Predictions only cover the held-out rows, so they are kept in their
    # own frame aligned with x_test. Writing them into train_data[label]
    # would both mismatch in length and modify the training data.
    MLA_predict = pd.DataFrame(index=x_test.index)

    for row_index, alg in enumerate(MLA):
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)

    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
def all_classifiers():
    """Return a list with one fresh, unfitted instance of each classifier."""
    ensemble_methods = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
    ]
    gaussian_processes = [gaussian_process.GaussianProcessClassifier()]
    generalized_linear = [
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
    ]
    bayes = [naive_bayes.BernoulliNB(), naive_bayes.GaussianNB()]
    nearest_neighbor = [neighbors.KNeighborsClassifier()]
    support_vector = [
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
    ]
    trees = [tree.DecisionTreeClassifier(), tree.ExtraTreeClassifier()]
    discriminant = [
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]
    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    boosted = [XGBClassifier()]

    return (ensemble_methods + gaussian_processes + generalized_linear +
            bayes + nearest_neighbor + support_vector + trees + discriminant +
            boosted)
#Ensemble Methods ensemble.AdaBoostClassifier(), ensemble.BaggingClassifier(), ensemble.ExtraTreesClassifier(), ensemble.GradientBoostingClassifier(), ensemble.RandomForestClassifier(), #Gaussian Processes #gaussian_process.GaussianProcessClassifier(), #GLM linear_model.LogisticRegressionCV(), linear_model.LogisticRegression(C=1000, random_state=0, solver='liblinear'), linear_model.PassiveAggressiveClassifier(), linear_model.RidgeClassifierCV(), linear_model.SGDClassifier(), linear_model.Perceptron(), #Navies Bayes naive_bayes.BernoulliNB(), #naive_bayes.GaussianNB(), #Nearest Neighbor neighbors.KNeighborsClassifier(), #SVM svm.SVC(probability=True), svm.NuSVC(probability=True), svm.LinearSVC(),
).fit(self.tr_data, self.tr_label) return True #TODO: 其他模型没有调参 def train_with_LassoCV(self): if self.tr_data == None or self.tr_label = None: print ("lack of train data or train label") return False self.model = linear_model.LassoCV().fit(self.tr_data, self.tr_label) return True def train_with_RidgeCV(self): if self.tr_data == None or self.tr_label = None: print ("lack of train data or train label") return False self.model = linear_model.RidgeClassifierCV().fit(self.tr_data, self.tr_label) return True def train_with_ElasticNetCV(self): if self.tr_data == None or self.tr_label = None: print ("lack of train data or train label") return False self.model = linear_model.MultiTaskElasticNetCV().fit(self.tr_data, self.tr_label) return True def set_default_params(self): self.params = { 'penalty': 'l2', 'C': 1.0, 'solver':'lbfgs' }
# validate_score_clf(linear_model.PassiveAggressiveClassifier(max_iter=3000), 'linear_model.PassiveAggressiveClassifier-2000') validate_score_clf( linear_model.PassiveAggressiveClassifier(max_iter=5000, early_stopping=True), 'linear_model.PassiveAggressiveClassifier-earlyStopping') validate_score_clf(linear_model.SGDClassifier(max_iter=800), 'linear_model.SGDClassifier800') # validate_score_clf(linear_model.SGDClassifier(max_iter=1200), 'linear_model.SGDClassifier1200') # validate_score_clf(linear_model.SGDClassifier(max_iter=3200), 'linear_model.SGDClassifier3200') # # # validate_score_clf(linear_model.LarsCV(max_iter=1200), 'linear_model.LarsCV') # # # validate_score_clf(linear_model.LassoLarsCV(max_iter=1200), 'linear_model.LassoLarsCV') # # # validate_score_clf(linear_model.LassoCV(max_iter=1200), 'linear_model.LassoCV') # # # validate_score_clf(linear_model.ElasticNetCV(max_iter=1200), 'linear_model.ElasticNetCV') # validate_score_clf(linear_model.OrthogonalMatchingPursuitCV(), 'linear_model.OrthogonalMatchingPursuitCV') # # validate_score_clf(ensemble.GradientBoostingClassifier(n_estimators=15, verbose=1), 'GradientBoostingClassifier') validate_score_clf(linear_model.RidgeClassifierCV(class_weight='balanced'), 'linear_model.RidgeClassifierCV-balanced') # validate_score_clf(linear_model.RidgeClassifierCV(), 'linear_model.RidgeClassifierCV') validate_score_clf(ensemble.RandomForestClassifier(n_estimators=200), 'RandomForestClassifier') # validate_score_clf(linear_model.LogisticRegressionCV(max_iter=550, Cs=np.geomspace(1e-1, 1e-7, 15), class_weight='balanced'), 'LogisticRegressionCV_maxiter550') # validate_score_clf(linear_model.LogisticRegressionCV(max_iter=900, Cs=np.geomspace(1e-1, 1e-7, 15), class_weight='balanced'), 'LogisticRegressionCV_maxiter900') validate_score_clf( linear_model.LogisticRegressionCV(max_iter=1000, Cs=np.geomspace(1e-1, 1e-7, 15)), 'LogisticRegressionCV_imbalanced') # clf = linear_model.LogisticRegressionCV(
def model_comparison(x, y, show=True):
    """
    Copy from : https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy/notebook

    Cross-validate a battery of classifiers on ``(x, y)`` and compare them.

    Parameters
    ----------
    x : features passed to every estimator.
    y : targets; must support ``.copy()``.
    show : bool
        When True, draw a bar plot of the mean test accuracy per algorithm.

    Returns
    -------
    DataFrame
        One row per algorithm (name, parameters, mean train/test accuracy,
        3 * std of the test accuracy, mean fit time), sorted by mean test
        accuracy in descending order.
    """
    from sklearn import (svm, tree, linear_model, neighbors, naive_bayes,
                         ensemble, discriminant_analysis, gaussian_process,
                         model_selection)
    from xgboost import XGBClassifier

    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]

    # split dataset in cross-validation with this splitter class:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
    # note: this is an alternative to train_test_split
    cv_split = model_selection.ShuffleSplit(n_splits=10,
                                            test_size=.3,
                                            train_size=.7,
                                            random_state=0)

    # table to compare MLA metrics
    MLA_columns = [
        'MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean',
        'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD', 'MLA Time'
    ]
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    # table to compare MLA predictions
    MLA_predict = y.copy()

    for row_index, alg in enumerate(MLA):
        # set name and parameters
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())

        # score model with cross validation; train scores are only part of
        # the result when explicitly requested, and 'train_score' is read
        # just below.
        cv_results = model_selection.cross_validate(alg,
                                                    x,
                                                    y,
                                                    cv=cv_split,
                                                    return_train_score=True)

        MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
        MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results[
            'train_score'].mean()
        MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results[
            'test_score'].mean()
        # if this is a non-bias random sample, then +/-3 standard deviations
        # (std) from the mean should statistically capture 99.7% of the
        # subsets - i.e. the worst that can happen
        MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results[
            'test_score'].std() * 3

        # save MLA predictions
        alg.fit(x, y)
        MLA_predict[MLA_name] = alg.predict(x)

    MLA_compare.sort_values(by=['MLA Test Accuracy Mean'],
                            ascending=False,
                            inplace=True)

    if show:
        plt.figure(figsize=(15, 6))
        sns.barplot(x='MLA Test Accuracy Mean',
                    y='MLA Name',
                    data=MLA_compare,
                    color='m')
        plt.show()

    return MLA_compare
regression(linear_model.OrthogonalMatchingPursuitCV()), regression(linear_model.Ridge(random_state=RANDOM_SEED)), regression(linear_model.RidgeCV()), regression(linear_model.BayesianRidge()), regression(linear_model.ARDRegression()), regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)), regression( linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)), # Logistic Regression classification( linear_model.LogisticRegression(random_state=RANDOM_SEED)), classification( linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)), classification(linear_model.RidgeClassifierCV()), classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegression(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)), classification_binary( linear_model.RidgeClassifier(random_state=RANDOM_SEED)), classification_binary(linear_model.RidgeClassifierCV()), classification_binary( linear_model.SGDClassifier(random_state=RANDOM_SEED)), # Decision trees regression(tree.DecisionTreeRegressor(**TREE_PARAMS)), regression(tree.ExtraTreeRegressor(**TREE_PARAMS)), classification(tree.DecisionTreeClassifier(**TREE_PARAMS)),
CLF = [ #Ensemble Methods ('ada', ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier())), ('bc', ensemble.BaggingClassifier()), ('etc', ensemble.ExtraTreesClassifier()), ('gbc', ensemble.GradientBoostingClassifier()), ('xgbc', xgb.XGBClassifier(max_depth=3)), # xgb.XGBClassifier()), # ('rfc', ensemble.RandomForestClassifier(n_estimators=50)), #Gaussian Processes ('gpc', gaussian_process.GaussianProcessClassifier()), #GLM - remove linear models, since this is a classifier algorithm ('lr', linear_model.LogisticRegressionCV()), ('pac', linear_model.PassiveAggressiveClassifier()), ('rc', linear_model.RidgeClassifierCV()), ('sgd', linear_model.SGDClassifier()), ('pct', linear_model.Perceptron()), #Navies Bayes ('gnb', naive_bayes.GaussianNB()), #Nearest Neighbor ('knn', neighbors.KNeighborsClassifier(n_neighbors=3)), #SVM ('svc', svm.SVC(probability=True)), ('lsvc', svm.LinearSVC()), #Trees ('dtc', tree.DecisionTreeClassifier()),
def compare_algorithm(data, target):
    """Fit a battery of classifiers on one split and print a metrics table.

    ``data`` / ``target`` are split with ``train_test_split``. For each
    classifier the train and test accuracy, precision, recall and AUC are
    recorded; the table is sorted by test accuracy (descending) and printed.
    """
    x_train, x_cross, y_train, y_cross = train_test_split(data, target)

    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),
        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=0.001),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(max_iter=1000, tol=0.001),
        linear_model.Perceptron(max_iter=1000, tol=0.001),
        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),
        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        xgb.XGBClassifier()
    ]

    MLA_compare = pd.DataFrame(columns=[])

    for row_index, alg in enumerate(MLA):
        fitted = alg.fit(x_train, y_train)
        predicted = fitted.predict(x_cross)
        fp, tp, _ = roc_curve(y_cross, predicted)

        # Values are computed in the order the columns are written.
        row = {
            'MLA Name': alg.__class__.__name__,
            'MLA Train Accuracy': round(alg.score(x_train, y_train), 4),
            'MLA Test Accuracy': round(alg.score(x_cross, y_cross), 4),
            'MLA Precission': precision_score(y_cross, predicted),
            'MLA Recall': recall_score(y_cross, predicted),
            'MLA AUC': auc(fp, tp),
        }
        for column, value in row.items():
            MLA_compare.loc[row_index, column] = value

    MLA_compare.sort_values(by=['MLA Test Accuracy'],
                            ascending=False,
                            inplace=True)
    print(MLA_compare)
.format(movement_required)) print('Decisions: {}'.format(list(decisions))) ###### Tree clf_DTC = tree.DecisionTreeClassifier() clf_DTC = clf_DTC.fit(x_train, y_scikit_train) clf_ETC = tree.ExtraTreeClassifier() clf_ETC = clf_ETC.fit(x_train, y_scikit_train) ###### Neighbhors clf_KNC = neighbors.KNeighborsClassifier() clf_KNC = clf_KNC.fit(x_train, y_scikit_train) ###### linear Model clf_RCCV = linear_model.RidgeClassifierCV() clf_RCCV = clf_RCCV.fit(x_train, y_scikit_train) ###### Ensemble clf_RFC = ensemble.RandomForestClassifier() clf_RFC = clf_RFC.fit(x_train, y_scikit_train) clf_ETC_ens = ensemble.ExtraTreesClassifier() clf_ETC_ens = clf_ETC_ens.fit(x_train, y_scikit_train) clf_ABC = ensemble.AdaBoostClassifier() clf_ABC = clf_ABC.fit(x_train, y_scikit_train) clf_GBC = ensemble.GradientBoostingClassifier() clf_GBC = clf_GBC.fit(x_train, y_scikit_train)
def get_regression_estimators(r, regression_models):
    """Store a default-constructed ``linear_model`` estimator under its name.

    When ``r`` is one of the supported estimator names,
    ``regression_models[r]`` is set to a new ``linear_model.<r>()``
    instance. Otherwise a message is printed and ``regression_models`` is
    left untouched.
    """
    supported = (
        'ARDRegression',
        'BayesianRidge',
        'ElasticNet',
        'ElasticNetCV',
        'HuberRegressor',
        'Lars',
        'LarsCV',
        'Lasso',
        'LassoCV',
        'LassoLars',
        'LassoLarsCV',
        'LassoLarsIC',
        'LinearRegression',
        'LogisticRegression',
        'LogisticRegressionCV',
        'MultiTaskElasticNet',
        'MultiTaskElasticNetCV',
        'MultiTaskLasso',
        'MultiTaskLassoCV',
        'OrthogonalMatchingPursuit',
        'OrthogonalMatchingPursuitCV',
        'PassiveAggressiveClassifier',
        'PassiveAggressiveRegressor',
        'Perceptron',
        'RANSACRegressor',
        'Ridge',
        'RidgeClassifier',
        'RidgeClassifierCV',
        'RidgeCV',
        'SGDClassifier',
        'SGDRegressor',
        'TheilSenRegressor',
    )
    if r not in supported:
        print(
            r +
            " is an unsupported regression type. Check if you have misspelled the name."
        )
        return
    # Every supported name is also the attribute name of its class.
    regression_models[r] = getattr(linear_model, r)()
predicts = model.predict(test_features) print('Test result:') print(classification_report(test_labels, predicts, labels=[1])) if __name__ == "__main__": worddir = load_worddir(WORDPATH) features, labels = get_dataset() print("Numbers of train data: %d" % (len(features))) cnt = 0 for label in labels: cnt += (label == 1) print("Numbers of positive data: %d" % (cnt)) test_features, test_labels = get_dataset(train=False) print("Numbers of test data: %d" % (len(test_features))) models = [ linear_model.RidgeClassifierCV(normalize=True), linear_model.LogisticRegressionCV(n_jobs=-1), tree.DecisionTreeClassifier(criterion='entropy'), ensemble.RandomForestClassifier(n_jobs=-1), svm.SVC(kernel='rbf') ] for model in models: print("start", model) cross_validation(model, features, labels) model.fit(features, labels) test_score(model, test_features, test_labels) print("end", model)
# Read the test-set entries, one per line, without the trailing newline.
with open('test_set', 'r') as f2:
    J = [line.rstrip('\n') for line in f2.readlines()]

# Collect features and labels from every training file listed in I.
features_train = []
labels_train = []
for filename in I:
    # The context manager closes each training file; previously these
    # handles were left open. The examples are consumed inside the block in
    # case the helpers read the file lazily.
    with open(directory + filename, 'r') as file:
        examples = get_features_labels(get_examples(file))
        for ex in examples:
            features_train.append(ex[0])
            labels_train.append(ex[1])

classifiers = [
    # NOTE(review): the `normalize` argument has been dropped from
    # scikit-learn's linear models in recent releases - confirm against the
    # pinned version.
    linear_model.RidgeClassifierCV(
        normalize=True),  # Linear Regression (Ridge regression)
    linear_model.LogisticRegressionCV(),  # Logistic Regression
    tree.DecisionTreeClassifier(criterion='entropy'),  # Decision Tree
    ensemble.RandomForestClassifier(),  # Random Forest
    svm.SVC(kernel='rbf')  # SVM
]

# Fit each classifier, cross-validate it, then score it on the test set.
for clf in classifiers:
    clf.fit(features_train, labels_train)
    cross_validation(clf, features_train, labels_train)
    test_score(clf, J)
def main():
    """Run the pipeline: load, prepare, plot, compare models, write submission.

    Reads ``train.csv`` and ``test.csv``, applies the preparation helpers to
    both frames, prints and plots feature/target relationships for the
    training frame, evaluates a list of classifiers, tunes them and hands
    the best estimator to ``generate_submission_csv``.
    """
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")

    # Both frames go through the same preparation steps, in place.
    for frame in (train_df, test_df):
        frame.info()
        standardize_data(frame)
        create_columns(frame)
        create_bins(frame)
        encode_data(frame)

    # Target (Y variable)
    target = ["Survived"]

    # Features (X variables)
    train_df_x = [
        "Pclass",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked",
        "FamilySize",
        "IsAlone",
        "Title",
    ]

    # Numerical features (binned and encoded)
    train_df_x_bin = [
        "Pclass",
        "Sex_Code",
        "AgeBin_Code",
        "FareBin_Code",
        "Embarked_Code",
        "FamilySize",
        "IsAlone",
        "Title_Code",
    ]

    # Mean of the target per value of each non-float feature
    for feature in train_df_x:
        if train_df[feature].dtype != "float64":
            print(train_df[[feature, target[0]]].groupby(feature).mean())

    # Stacked histograms of individual features by survival
    _, axes = plt.subplots(1, 3, figsize=(9, 6))
    for position, feature in enumerate(("Fare", "Age", "FamilySize")):
        sns.histplot(x=feature,
                     data=train_df,
                     hue="Survived",
                     multiple="stack",
                     ax=axes[position])

    # Survival rate per category; the last cell of the 2x3 grid stays empty
    _, axes = plt.subplots(2, 3, figsize=(16, 12))
    grid_cells = ((0, 0), (0, 1), (0, 2), (1, 0), (1, 1))
    categories = ("Pclass", "Sex", "Embarked", "IsAlone", "Title")
    for cell, feature in zip(grid_cells, categories):
        sns.barplot(x=feature, y="Survived", data=train_df, ax=axes[cell])

    # Compare class with a 2nd feature
    _, axes = plt.subplots(1, 3, figsize=(9, 6))
    for position, second in enumerate(("Sex", "IsAlone", "Embarked")):
        sns.barplot(x="Pclass",
                    y="Survived",
                    data=train_df,
                    hue=second,
                    ax=axes[position])

    # Compare Sex with a 2nd feature
    _, axes = plt.subplots(1, 3, figsize=(9, 6))
    for position, second in enumerate(("Pclass", "IsAlone", "Embarked")):
        sns.barplot(x="Sex",
                    y="Survived",
                    data=train_df,
                    hue=second,
                    ax=axes[position])

    # Correlation heatmap of dataset
    _, heatmap_ax = plt.subplots(figsize=(14, 12))
    sns.heatmap(
        train_df.corr(),
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        annot=True,
        ax=heatmap_ax,
    )

    # Machine Learning Algorithm (MLA) selection and initialization
    mla = [
        linear_model.LogisticRegressionCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(dual=False),
        neighbors.KNeighborsClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        naive_bayes.GaussianNB(),
        naive_bayes.BernoulliNB(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
    ]

    mla_compare = test_models(mla, train_df, train_df_x_bin, target)
    best_estimator = optimize_params(mla, mla_compare, train_df,
                                     train_df_x_bin, target)
    generate_submission_csv(test_df, train_df_x_bin, best_estimator)