from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    # Column 0 of the CSV is used as the target, all remaining columns as features.
    data = pd.read_csv('wine.data', header=None)
    x, y = data.iloc[:, 1:], data[0]
    # The scaler is fitted on the full dataset, i.e. before the train/test split.
    x = MinMaxScaler().fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.7)

    # Ridge classifier; alpha is chosen by 3-fold CV among 10 log-spaced values in [1e-3, 1e3].
    # NOTE(review): the variable name and the printed labels say "Logistic" although the
    # model is RidgeClassifierCV - confirm which wording is intended.
    lr = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), cv=3)
    lr.fit(x_train, y_train.ravel())
    print(u'参数alpha=%.2f' % lr.alpha_)
    y_train_pred = lr.predict(x_train)
    y_test_pred = lr.predict(x_test)
    print(u'Logistic回归训练集准确率:', accuracy_score(y_train, y_train_pred))
    print(u'Logistic回归测试集准确率:', accuracy_score(y_test, y_test_pred))

    # Random forest with out-of-bag scoring enabled; the OOB score is printed after fitting.
    rf = RandomForestClassifier(n_estimators=100, max_depth=8, min_samples_split=5, oob_score=True)
    rf.fit(x_train, y_train.ravel())
    print(u'OOB Score=%.5f' % rf.oob_score_)
    # Predictions overwrite the ridge predictions computed above.
    y_train_pred = rf.predict(x_train)
    y_test_pred = rf.predict(x_test)
assert flt_features == (all_features - {f}) return True return False @pytest.mark.parametrize(['clf'], [ [LogisticRegression(random_state=42)], [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')], [LogisticRegression(random_state=42, fit_intercept=False)], [LogisticRegressionCV(random_state=42)], [SGDClassifier(**SGD_KWARGS)], [SGDClassifier(loss='log', **SGD_KWARGS)], [PassiveAggressiveClassifier(random_state=42)], [Perceptron(random_state=42)], [RidgeClassifier(random_state=42)], [RidgeClassifierCV()], [LinearSVC(random_state=42)], [OneVsRestClassifier(LogisticRegression(random_state=42))], ]) def test_explain_linear(newsgroups_train, clf): assert_multiclass_linear_classifier_explained(newsgroups_train, clf, explain_prediction) if isinstance(clf, OneVsRestClassifier): assert_multiclass_linear_classifier_explained( newsgroups_train, clf, explain_prediction_sklearn) @pytest.mark.parametrize(['clf'], [ [LogisticRegression(random_state=42)], [LogisticRegressionCV(random_state=42)], [OneVsRestClassifier(LogisticRegression(random_state=42))],
# These are default parameters, # we initialize an instance here just to suppress warnings LogisticCV = LogisticRegressionCV(solver='lbfgs', multi_class='auto', n_jobs=-1, max_iter=200, Cs=(0.0001, 0.001, 0.01, 0.1), cv=5, class_weight='balanced') Logistic = LogisticRegression(solver='lbfgs', multi_class='auto', n_jobs=-1, max_iter=200, class_weight='balanced') Ridge = RidgeClassifierCV(alphas=(0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0)) LDA = LinearDiscriminantAnalysis() QDA = QuadraticDiscriminantAnalysis(reg_param=0.0001) RBF = SVC(kernel='rbf', gamma='scale', C=0.1, class_weight='balanced', cache_size=1000, probability=True) SVM_Poly2 = SVC(kernel='poly', degree=2, gamma='scale', C=0.1, class_weight='balanced', cache_size=1000, probability=True)
def build_classifiers(exclude, scale, feature_selection, nCols):
    '''
    Build the candidate classifiers together with their grid-search parameters.

    Input:
    - exclude: list of names of classifiers to exclude from the analysis
    - scale: True or False. Scale data before fitting classifier
    - feature_selection: True or False. Run feature selection before fitting classifier
    - nCols: Number of columns in input dataset to classifiers
    Output:
    Ordered dictionary with classifier name as keys. Each value is a dictionary with:
    - 'clf': Classifier object (wrapped in a pipeline when scale or
      feature_selection is True)
    - 'parameters': Dictionary with parameters of 'clf' as keys (prefixed with
      the pipeline step name when 'clf' is a pipeline)
    '''
    classifiers = collections.OrderedDict()

    # Neural Networks
    if 'Multilayer Perceptron' not in exclude:
        classifiers['Multilayer Perceptron'] = {
            'clf': MLPClassifier(),
            'parameters': {'hidden_layer_sizes': [(100, 50), (50, 25)],
                           'max_iter': [500]}
        }

    # Neighbor Methods
    if 'Nearest Neighbors' not in exclude:
        classifiers['Nearest Neighbors'] = {
            'clf': KNeighborsClassifier(),
            'parameters': {'n_neighbors': [1, 5, 10, 20]}}
    if 'Radius Neighbors' not in exclude:
        classifiers['Radius Neighbors'] = {
            'clf': RadiusNeighborsClassifier(outlier_label=0),
            'parameters': {}}

    # SVM
    if 'SVM' not in exclude:
        classifiers['SVM'] = {
            'clf': SVC(C=1, probability=True, cache_size=10000,
                       class_weight='balanced'),
            'parameters': {'kernel': ['rbf', 'poly'],
                           'C': [0.01, 0.1, 1]}}
    if 'Linear SVM' not in exclude:
        classifiers['Linear SVM'] = {
            'clf': LinearSVC(dual=False, class_weight='balanced'),
            'parameters': {'C': [0.01, 0.1, 1],
                           'penalty': ['l1', 'l2']}}

    # Tree Methods
    if 'Decision Tree' not in exclude:
        classifiers['Decision Tree'] = {
            'clf': DecisionTreeClassifier(max_depth=None,
                                          max_features='auto'),
            'parameters': {}}
    if 'Random Forest' not in exclude:
        classifiers['Random Forest'] = {
            'clf': RandomForestClassifier(max_depth=None,
                                          n_estimators=10,
                                          max_features='auto'),
            'parameters': {'n_estimators': list(range(5, 20))}}

    # Ensemble Methods
    if 'Ada Boost' not in exclude:
        classifiers['Ada Boost'] = {
            'clf': AdaBoostClassifier(),
            'parameters': {}}
    if 'Bagging' not in exclude:
        # Stored under its own key; it used to be written to 'Ada Boost',
        # which silently replaced the AdaBoost entry.
        classifiers['Bagging'] = {
            'clf': BaggingClassifier(),
            'parameters': {}}
    if 'Gradient Boost' not in exclude:
        classifiers['Gradient Boost'] = {
            'clf': GradientBoostingClassifier(),
            'parameters': {}}

    # Linear Models
    if 'Logistic Regression' not in exclude:
        classifiers['Logistic Regression'] = {
            'clf': LogisticRegression(fit_intercept=True, solver='lbfgs',
                                      penalty='l2'),
            'parameters': {'C': [0.001, 0.1, 1]}}
    if 'Ridge Classification' not in exclude:
        classifiers['Ridge Classification'] = {
            'clf': RidgeClassifier(fit_intercept=True),
            'parameters': {}}
    if 'Ridge Classification CV' not in exclude:
        classifiers['Ridge Classification CV'] = {
            'clf': RidgeClassifierCV(fit_intercept=True),
            'parameters': {}}
    # NOTE(review): the exclude name ('Passive Aggressive') differs from the
    # stored key ('Passive Aggressive Classifier'); kept as-is for callers.
    if 'Passive Aggressive' not in exclude:
        classifiers['Passive Aggressive Classifier'] = {
            'clf': PassiveAggressiveClassifier(),
            'parameters': {}}
    if 'Perceptron' not in exclude:
        classifiers['Perceptron'] = {
            'clf': Perceptron(),
            'parameters': {}}

    # Naive Bayes
    if 'Gaussian Naive Bayes' not in exclude:
        classifiers['Gaussian Naive Bayes'] = {
            'clf': GaussianNB(),
            'parameters': {}}
    if 'Bernoulli Naive Bayes' not in exclude:
        classifiers['Bernoulli Naive Bayes'] = {
            'clf': BernoulliNB(),
            'parameters': {}}

    # Discriminant Analysis
    if 'LDA' not in exclude:
        classifiers['LDA'] = {
            'clf': LinearDiscriminantAnalysis(),
            'parameters': {}}
    if 'QDA' not in exclude:
        classifiers['QDA'] = {
            'clf': QuadraticDiscriminantAnalysis(),
            'parameters': {}}

    # NOTE(review): the stored key is spelled 'Guassian Process'; kept as-is
    # because callers may look the entry up by that exact string.
    if 'Gaussian Process' not in exclude:
        classifiers['Guassian Process'] = {
            'clf': GaussianProcessClassifier(),
            'parameters': {}}

    # classifiers['Voting'] = {}

    def name(x):
        """
        :param x: A classifier entry whose 'clf' is a pipeline
        :return: The class name of the pipeline's final estimator, lower-cased
        """
        return x['clf']._final_estimator.__class__.__name__.lower()

    # Without scaling or feature selection the bare estimators are returned.
    if not (scale or feature_selection):
        return classifiers

    for entry in classifiers.values():
        steps = []
        if scale:
            steps.append(StandardScaler())
        if feature_selection:
            # NOTE(review): f_regression is used as the score function for
            # classifiers - confirm this is intended.
            steps.append(SelectKBest(f_regression, k='all'))
        steps.append(entry['clf'])
        entry['clf'] = make_pipeline(*steps)

        # Reorganize parameter list for grid search: prefix each parameter
        # with the lower-cased class name of the final estimator.
        prefix = name(entry)
        entry['parameters'] = {
            prefix + '__' + keyp: values
            for keyp, values in entry['parameters'].items()
        }

        if nCols > 5 and feature_selection:
            # Five evenly spaced candidate values of k between nCols/5 and nCols.
            entry['parameters']['selectkbest__k'] = np.linspace(
                np.round(nCols / 5), nCols, 5).astype('int').tolist()

    return classifiers
# Score the current classifier on the mean-imputed test set
# (the matching fit call is not visible in this part of the file).
y_pred = np.round(classifier.predict(X_test_mean)).flatten()
print(accuracy_score(y_test, y_pred))

# Refit and score on the median- and mode-imputed variants.
for _X_fit, _X_eval in ((X_train_median, X_test_median),
                        (X_train_mode, X_test_mode)):
    classifier.fit(_X_fit, y_train)
    y_pred = np.round(classifier.predict(_X_eval)).flatten()
    print(accuracy_score(y_test, y_pred))

from sklearn.linear_model import RidgeClassifierCV

# Same evaluation with a ridge classifier on the 0-, mean- and median-filled data.
classifier = RidgeClassifierCV()
for _X_fit, _X_eval in ((X_train_0, X_test_0),
                        (X_train_mean, X_test_mean),
                        (X_train_median, X_test_median)):
    classifier.fit(_X_fit, y_train)
    y_pred = np.round(classifier.predict(_X_eval)).flatten()
    print(accuracy_score(y_test, y_pred))
verbose=True), "shapelet": make_pipeline(TruncationTransformer(lower=1000), ContractedShapeletTransform( time_contract_in_mins=10, num_candidates_to_sample_per_case=10, verbose=2, random_state=1), RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1), verbose=True), "rocket": make_pipeline(TruncationTransformer(lower=MAX_LENGTH), Rocket(random_state=1), RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True), verbose=True), "mr-seql": make_pipeline(TruncationTransformer(lower=MAX_LENGTH), MrSEQLClassifier(symrep=['sax', 'sfa']), verbose=True), "full_features": make_pipeline( TruncationTransformer(lower=MAX_LENGTH), ColumnEnsembleClassifier([ ("features_0", make_pipeline(TSFreshFeatureExtractor( default_fc_parameters="efficient", show_warnings=False, n_jobs=-1), RandomForestClassifier(n_jobs=-1, random_state=1),
class PipelineComponents:
    """Key-value pairs used by the ExtendedPipeline class to determine which components to use in the pipeline"""

    # Candidate estimators, keyed by the name used to select them.
    models = {
        'XGBClassifier': XGBClassifier(),
        'ElasticNetClassifier': SGDClassifier(penalty="elasticnet", l1_ratio=0.5, loss='log', tol=0.0001),
        'RidgeClassifierCV': RidgeClassifierCV(),
        'Perceptron': Perceptron(max_iter=2500, penalty='l2'),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=50),
        'RandomForestClassifier': RandomForestClassifier(n_estimators=1000),
        'LinearSVC': LinearSVC(dual=False, penalty='l2', tol=1e-3),
        'SGDClassifier': SGDClassifier(alpha=.0001, penalty='l2'),
        'SGDClassifier_elasticnet': SGDClassifier(alpha=.0001, penalty="elasticnet"),
        'NearestCentroid': NearestCentroid(),
        'MultinomialNB': MultinomialNB(alpha=.01),
        'BernoulliNB': BernoulliNB(alpha=.01),
        'ComplementNB': ComplementNB(alpha=.1),
        'SVC': SVC(),
        'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=5000, penalty='l2'),
        'LogisticRegressionCV': LogisticRegressionCV(max_iter=5000, n_jobs=-1)
    }

    # Text vectorizers; both are given `dummy` as tokenizer and preprocessor.
    # presumably the input is already tokenized - verify against `dummy`.
    vectorizers = {
        'hashing': HashingVectorizer(tokenizer=dummy, preprocessor=dummy),
        'count': CountVectorizer(min_df=5, tokenizer=dummy, preprocessor=dummy, max_df=0.5, ngram_range=(1, 2), max_features=1000),
        #'dummy':GloveTokenize()
    }

    # Stemming / lemmatizing tokenizers.
    stemmers = {
        'porter': StemTokenizer(),
        'snowball': SnowballTokenizer(),
        'lemma': LemmaTokenizer(),
        # interchangeable: both keys below map to `dummy`
        "No Stemmer": dummy,
        None: dummy
    }

    # Feature transformers; None and 'passthrough' both map to the string 'passthrough'.
    transformers = {
        'tfidf': TfidfTransformer(norm='l2', use_idf=True),
        'minmax': MinMaxScaler(),
        'normal': Normalizer(norm='l2'),
        'robust': RobustScaler(),
        'max': MaxAbsScaler(),
        None: 'passthrough',
        'passthrough': 'passthrough'
    }
# from sklearn.neighbors import NearestCentroid from sklearn.cluster import KMeans # from sklearn.linear_model import SGDClassifier # from sklearn.feature_selection import SelectKBest, chi2 from sklearn.linear_model import RidgeClassifierCV from sklearn.linear_model import LassoCV # from sklearn.svm import LinearSVC # from sklearn.svm import SVC from sklearn.linear_model import SGDClassifier, LogisticRegression # from sklearn.linear_model import Perceptron # from sklearn.linear_model import PassiveAggressiveClassifier # from sklearn.utils.extmath import density models = [ ('ridge', RidgeClassifierCV(normalize=True)), ('lasso', LassoCV()), ] parameters = { 'ridge__C': (0.3, 1, 3, 10), 'lasso__C': (1, 3), } def train_data(X, y, refit=False, test_size=0.1): print("\n[ start training: {} ]".format(datetime.now())) # Display progress logs on stdout logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
# Train a multi-label ridge classifier from a CSV file given on the command
# line and pickle both the classifier and the label binarizer.
input_df = pandas.read_csv(sys.argv[1])
# `.values` replaces `get_values()`, which newer pandas no longer provides.
labels = input_df['FEATURE_LABELS'].values
features = input_df.drop(columns=["FEATURE_LABELS"]).values


def string_to_vector(s):
    """Split a comma-separated label string into a list of labels.

    Spaces are removed before splitting. Input that is not a string
    (presumably a missing value read from the CSV) yields an empty list.
    """
    try:
        cleaned_s = re.sub(r" ", '', s)
    except TypeError:
        # re.sub raises TypeError for non-string input.
        return []
    return cleaned_s.split(",")


labels = list(map(string_to_vector, labels))
y = MultiLabelBinarizer()
# Binarize once and reuse the result for both fitting and scoring.
targets = y.fit_transform(labels)
clf = RidgeClassifierCV()
# NOTE(review): the former `average="samples"` keyword is not accepted by the
# estimator's score(); if a sample-averaged F1 was intended, compute it with
# sklearn.metrics.f1_score instead.
score = clf.fit(features, targets).score(features, targets)

# Pickle output is bytes, so the files must be opened in binary mode.
with open("ridge_clf.txt", 'wb') as f:
    pickle.dump(clf, f)
with open("binarizer.txt", 'wb') as f:
    pickle.dump(y, f)
# Columns of `data` used as model inputs.
feature_cols = ['size','pole','mean','stddev','b_mean','g_mean','r_mean','b_stddev','g_stddev','r_stddev','square','ratiowh','ratioarea','approxlen','numangle','numangle90','numangle70']
X = data[feature_cols]
# Standardize the features; `scaler` keeps the fitted statistics for later use.
scaler = StandardScaler()
X = scaler.fit_transform(X)  # Features
y = data.label  # Target variable
# from sklearn.model_selection import train_test_split
# X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
# No hold-out split: the whole dataset is used for training.
X_train = X
y_train = y
from sklearn.linear_model import RidgeClassifierCV
# NOTE(review): despite its name, `svclassifier` is a ridge classifier.
svclassifier = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1],class_weight='balanced')
# fit() returns the estimator itself, so `model` and `svclassifier` are the
# same fitted object; the former second, identical fit call was redundant.
model = svclassifier.fit(X_train, y_train)
from sklearn import metrics
# Load the evaluation data and drop rows with missing values.
datatest = pd.read_csv("./feature_810_all.csv")
datatest = datatest.dropna()
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd

# list of tuples: the first element is a string, the second is an object
estimators = [
    ('LogisticRegression', LogisticRegression()),
    ('RidgeClassifier', RidgeClassifier()),
    ('RidgeClassifierCV', RidgeClassifierCV()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
]

from sklearn.model_selection import train_test_split

data = pd.read_csv('HR_Data.csv')

# Convert all nominal to numeric.
# The result is assigned back to the column: an in-place replace on
# `data[col]` acts on a temporary selection and is not guaranteed to
# update `data` itself.
data['sales'] = data['sales'].replace([
    'sales', 'accounting', 'hr', 'technical', 'support', 'management', 'IT',
    'product_mng', 'marketing', 'RandD'
], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
data['salary'] = data['salary'].replace(['low', 'medium', 'high'], [0, 1, 2])

###########################################
# Train & Test Data
# 'left' is the target; every other column is a feature.
data_X = data.copy()
data_y = data_X['left']
del data_X['left']
train_X, test_X, train_y, test_y = train_test_split(data_X, data_y, test_size=0.2, random_state=1234)
def set_classifier(cls, resampleId):
    """
    Construct the classifier selected by a (case-insensitive) short name.

    To add a differently configured classifier, add another branch for a new
    name and set it up there.

    :param cls: String indicating which classifier you want
    :param resampleId: Value passed as random_state to the classifiers that
        accept one in this function
    :return: A classifier.
    :raises Exception: if the name is not recognised
    """
    key = cls.lower()

    if key == "pf":
        return pf.ProximityForest(random_state=resampleId)
    if key == "pt":
        return pf.ProximityTree(random_state=resampleId)
    if key == "ps":
        return pf.ProximityStump(random_state=resampleId)
    if key == "rise":
        return fb.RandomIntervalSpectralForest(random_state=resampleId)
    if key == "tsf":
        return ib.TimeSeriesForest(random_state=resampleId)
    if key == "cif":
        return CanonicalIntervalForest(random_state=resampleId)
    if key == "boss":
        return BOSSEnsemble(random_state=resampleId)
    if key == "cboss":
        return ContractableBOSS(random_state=resampleId)
    if key == "tde":
        return TemporalDictionaryEnsemble(random_state=resampleId)
    if key == "st":
        return st.ShapeletTransformClassifier(time_contract_in_mins=1500)
    if key == "dtwcv":
        return nn.KNeighborsTimeSeriesClassifier(metric="dtwcv")
    if key in ("ee", "elasticensemble"):
        return dist.ElasticEnsemble()
    if key == "tsfcomposite":
        # It defaults to TSF
        return ensemble.TimeSeriesForestClassifier()
    if key == "risecomposite":
        # Per-interval pipeline: segment, extract ACF and power-spectrum
        # features, tabularise, then classify with a decision tree.
        acf_step = (
            "acf",
            make_row_transformer(
                FunctionTransformer(func=acf_coefs, validate=False)),
        )
        ps_step = (
            "ps",
            make_row_transformer(
                FunctionTransformer(func=powerspectrum, validate=False)),
        )
        base_estimator = Pipeline([
            ("segment", RandomIntervalSegmenter(n_intervals=1, min_length=5)),
            ("transform", FeatureUnion([acf_step, ps_step])),
            ("tabularise", Tabularizer()),
            ("clf", DecisionTreeClassifier()),
        ])
        return ensemble.TimeSeriesForestClassifier(estimator=base_estimator,
                                                   n_estimators=100)
    if key == "rocket":
        ridge = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
        return make_pipeline(Rocket(random_state=resampleId), ridge)

    raise Exception("UNKNOWN CLASSIFIER")
# NOTE(review): `skf` is not used in the lines visible here; the
# cross-validation below passes cv=3 instead - confirm which is intended.
skf = StratifiedKFold(n_splits=10, shuffle = True, random_state = 8)

# Candidate classifiers, all with (mostly) default settings.
lr = LogisticRegression(n_jobs=-1)
rf = RandomForestClassifier(n_jobs=-1)
ada = AdaBoostClassifier()
lgbm = LGBMClassifier()
xgb = XGBClassifier(n_jobs=-1)
cat = CatBoostClassifier(verbose = False)
etc=ExtraTreesClassifier()
gbc =GradientBoostingClassifier()
nb = GaussianNB()
mnb=MultinomialNB()
cnb1=ComplementNB()
bnb=BernoulliNB()
cnb2=CategoricalNB()
qda = QuadraticDiscriminantAnalysis()
lda = LinearDiscriminantAnalysis()
rccv = RidgeClassifierCV()
rc = RidgeClassifier()

# Optional preprocessing steps; currently only referenced in the
# commented-out pipeline below.
pf = PolynomialFeatures(interaction_only=False, degree=1, include_bias=False,)
sc = StandardScaler()

# Subset of the classifiers above that is actually evaluated.
classifiers = [rc, rccv,lda, cat, lgbm, xgb, ada, rf, lr, etc, gbc]
for clf in classifiers:
    #pipe = Pipeline([('impute',si),('extract features',pf), ('scale', sc), ('classify', clf)]) #, ('extract features',pf), ('scale', sc)
    pipe = Pipeline([('classify',clf)])#, ('std',sc)
    # 3-fold ROC-AUC on columns 'a', 'b', 'c'.
    cvs = cross_val_score(pipe, X[['a','b','c']], y, scoring='roc_auc', cv=3, n_jobs=-1)
    # Prints the standard deviation first, then the mean, then the estimator.
    print(np.std(cvs), np.mean(cvs), clf)
    print()
def ridge_classifier_with_cross_validation(x_train, y_train):
    """Fit a RidgeClassifierCV and return its weights and its score on the same data.

    The regularization strength is selected from the candidates
    1e-3, 1e-2, 1e-1 and 1.

    :param x_train: training features
    :param y_train: training targets
    :return: tuple ``((coef_, intercept_), score)`` where ``score`` is the
        fitted model's ``score`` evaluated on ``x_train`` / ``y_train``
    """
    candidate_alphas = [1e-3, 1e-2, 1e-1, 1]
    classifier = RidgeClassifierCV(alphas=candidate_alphas)
    classifier.fit(x_train, y_train)
    fitted_weights = (classifier.coef_, classifier.intercept_)
    return fitted_weights, classifier.score(x_train, y_train)
from sklearn.svm import SVR, LinearSVC
# check_scoring is imported from whichever location the installed
# scikit-learn provides.
try:
    from sklearn.metrics import check_scoring
except ImportError:
    # for scikit-learn 0.18 and 0.19
    from sklearn.metrics.scorer import check_scoring

# Regression
ridge = RidgeCV()
svr = SVR(kernel='linear')
# Classification
svc = LinearSVC()
logistic_l1 = LogisticRegression(penalty='l1')
logistic_l2 = LogisticRegression(penalty='l2')
ridge_classifier = RidgeClassifierCV()
random_forest = RandomForestClassifier()

# Each entry maps a name to (estimator, second element). The second element is
# 'C' for the SVM and logistic models and an empty list for the ridge models;
# presumably it names the hyper-parameter(s) to tune - verify against usage.
# NOTE(review): `random_forest` is not included in `classifiers`.
regressors = {'ridge': (ridge, []),
              'svr': (svr, 'C')}
classifiers = {'svc': (svc, 'C'),
               'logistic_l1': (logistic_l1, 'C'),
               'logistic_l2': (logistic_l2, 'C'),
               'ridge_classifier': (ridge_classifier, [])}

# Create a test dataset
rng = np.random.RandomState(0)
X = rng.rand(100, 10)
# Create different targets
y_regression = rng.rand(100)
# Two balanced classes, once as -1/+1 and once as string labels.
y_classification = np.hstack([[-1] * 50, [1] * 50])
y_classification_str = np.hstack([['face'] * 50, ['house'] * 50])
train_features_reduced = pd.DataFrame(data, columns=columns) #Reduce validation features data = model.transform(val_set) val_set_reduced = pd.DataFrame(data, columns=columns) # # **Selecting classifiers** # In[ ]: import warnings warnings.filterwarnings("ignore") #InteractiveShell.ast_node_interactivity = 'last_expr' classifiers = { 'LogReg': LogisticRegression(), 'RidgeClassifier': RidgeClassifierCV(), 'KNN': KNeighborsClassifier(), 'SVC': SVC(gamma='auto'), 'GaussianNB': GaussianNB(), 'DecisionTree': DecisionTreeClassifier(), 'RandomForest': RandomForestClassifier(n_estimators=100), 'AdaBoost': AdaBoostClassifier(n_estimators=100), 'GradientBoosting': GradientBoostingClassifier(n_estimators=100), 'ExtraTrees': ExtraTreesClassifier(n_estimators=100), 'BaggingClassifier': BaggingClassifier(n_estimators=100), 'XGB': XGBClassifier(), 'LDA': LinearDiscriminantAnalysis() } scoring = { 'accuracy': make_scorer(accuracy_score),
# The logistic regression
from sklearn.linear_model import LogisticRegression, RidgeClassifier, \
    RidgeClassifierCV
logistic = LogisticRegression(C=1., penalty="l1")
logistic_50 = LogisticRegression(C=50., penalty="l1")
logistic_l2 = LogisticRegression(C=1., penalty="l2")

# Grid searches over C, scored with F1.
logistic_cv = GridSearchCV(LogisticRegression(C=1., penalty="l1"),
                           param_grid={'C': [.1, .5, 1., 5., 10., 50., 100.]},
                           scoring='f1')
# The l2 search must wrap an l2-penalized model; it previously wrapped an
# l1 model, making 'log l2 cv' a duplicate of 'log l1 cv'.
logistic_l2_cv = GridSearchCV(LogisticRegression(C=1., penalty="l2"),
                              param_grid={'C': [.1, .5, 1., 5., 10., 50., 100.]},
                              scoring='f1')

ridge = RidgeClassifier()
ridge_cv = RidgeClassifierCV()

# Make a data splitting object for cross validation
from sklearn.cross_validation import LeaveOneLabelOut, cross_val_score
cv = LeaveOneLabelOut(session_labels)

# Estimators to compare, keyed by display name.
classifiers = {'SVC': svm,
               'SVC cv': svm_cv,
               'log l1': logistic,
               'log l1 50': logistic_50,
               'log l1 cv': logistic_cv,
               'log l2': logistic_l2,
               'log l2 cv': logistic_l2_cv,
               'ridge': ridge,
               'ridge cv': ridge_cv}
# Features are every column except 'Class'; the target is 'Class' as a 1-D array.
X = data.loc[:, data.columns != 'Class'].values
y = data.loc[:, data.columns == 'Class'].values.reshape((len(X), ))

# train-test split
# NOTE(review): the loop overwrites the split variables on every iteration, so
# only the last split produced by `split.split` is kept - confirm intended.
split = StratifiedShuffleSplit(test_size=0.1, random_state=random_seed)
for train_index, test_index in split.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# initialize sampling models
# Resampling is applied to the training portion only.
# NOTE(review): `fit_sample` is presumably the older imbalanced-learn spelling
# of `fit_resample` - confirm against the installed version.
smt = SMOTETomek(random_state=random_seed)
X_res, y_res = smt.fit_sample(X_train, y_train)

# Ridge classifier: alpha chosen by 10-fold CV among 100 geometrically
# spaced values between 1e-5 and 10.
ridge = RidgeClassifierCV(alphas=np.geomspace(1e-5, 10, 100), cv=10, class_weight=None)
ridge.fit(X_res, y_res)
print('\n', classification_report_imbalanced(y_test, ridge.predict(X_test), target_names=['normal', 'fraud']))

# SVM
svm = LinearSVC(dual=False, verbose=1, random_state=random_seed, max_iter=int(1e6), class_weight=None)
svm.fit(X_res, y_res)
print('\n', classification_report_imbalanced(y_test, svm.predict(X_test), target_names=['normal', 'fraud']))

# Random Forest
# Per-class precision of the KNN predictions.
prec_knn = precision_score(test_group, pred_knn, average=None)
print("The accuracy of KNN Classier : " + str(acc_knn))
for i, j in zip(np.nditer(prec_knn), groups):
    print("The precision of KNN Classifier for ALL subtype " + j + ": " +
          str(round(float(i), 3)))
print("\n\n\n")
print(
    "##############################################################################"
)
##############################################################################
##############################################################################
##############################################################################
##############################################################################
# Ridge classifier with alpha selected by 10-fold cross-validation.
ridge = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 0.5, 1.0],
                          cv=10).fit(z_train_param, train_group)
# score() returns a number, so it must be converted before concatenation.
print("Ridge Classifier score train: " +
      str(ridge.score(z_train_param, train_group)))
pred_ridge = ridge.predict(z_test_param)
print("Ridge Classifier score test: " +
      str(ridge.score(z_test_param, test_group)))
acc_ridge = accuracy_score(test_group, pred_ridge, normalize=True)
# Precision is computed from the ridge predictions (it previously used the
# predictions of a different model, `pred_naive`).
prec_ridge = precision_score(test_group, pred_ridge, average=None)
print("The accuracy of Ridge Classifier : " + str(acc_ridge))
for i, j in zip(np.nditer(prec_ridge), groups):
    print("The precision of Ridge Classifier for ALL subtype " + j + ": " +
          str(round(float(i), 3)))
print("\n\n\n")
#print(labels) #print(counts_test) tfidf_transformer = TfidfTransformer(use_idf=False).fit(counts_train) X_train_tfidf = tfidf_transformer.fit_transform(counts_train) X_test_tfidf = tfidf_transformer.fit_transform(counts_test) clf1 = MultinomialNB() clf2 = OneVsRestClassifier(svm.SVC(gamma='scale', decision_function_shape='ovo')) clf3 = svm.LinearSVC(multi_class='ovr', max_iter=3000) clf4 = OneVsRestClassifier(MLPClassifier()) clf5 = SGDClassifier(n_jobs=7, loss="hinge", penalty="l2", max_iter=3000) #clf6 = KNeighborsClassifier(n_neighbors=3) clf7 = OneVsRestClassifier(XGBClassifier(max_depth=10,colsample_bytree=0.9)) #clf7 = XGBClassifier(learning_rate =0.01,n_estimators=5000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.8,colsample_bytree=0.8,reg_alpha=0.005,objective= 'binary:logistic',nthread=4,scale_pos_weight=1,seed=27) clf8 = RidgeClassifierCV() clf = [clf1, clf2, clf3, clf4, clf5, clf7, clf8] arr = ['MultinomialNB','SVC','LinearSVC','MLPClassifier','SGDClassifier','XGBClassifier','RidgeClassifierCV'] #predictors = [('nb',clf1), ('svc',clf2), ('lsvc',clf3), ('mlp',clf4), ('sgd',clf5), ('xgbc',clf7)] result=[] maxpred = [] maxp = 0; maxx = 0; predarr = [] for x in range(len(clf)): clf[x].fit(counts_train,labels) pred=clf[x].predict(counts_test) predarr.append(pred) print(arr[x]) #print('predict:',pred)
def __init__(self, **kw_args):
    """Create a one-vs-rest ridge classifier.

    The method was previously spelled ``__int__``, so it was never run when
    the class was instantiated and ``alpha`` / ``model`` were never set.

    :param kw_args: optional keyword arguments; only ``alpha`` is read
        (default 1) and passed to RidgeClassifierCV as its first argument.
    """
    super(Ridge, self).__init__()
    self.alpha = kw_args.get("alpha", 1)
    # NOTE(review): RidgeClassifierCV's first parameter is the collection of
    # candidate alphas; confirm a scalar is accepted by the installed version.
    self.model = OneVsRestClassifier(RidgeClassifierCV(self.alpha))
nearestCentroidParams = { 'shrink_threshold': [None] + list(map(lambda x: x / 10., range(1, 10))) } svcParams = { 'C': uniform(scale=10), 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': geom(p=.5) } dtParams = { 'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_features': ['sqrt', 'log2', None] } models1 = [GaussianNB(), LogisticRegressionCV(), RidgeClassifierCV()] n_iter = 25 print('Making models...', end='', flush=True) models2 = [ GridSearchCV(AdaBoostClassifier(), adaBoostParams), RandomizedSearchCV(MLPClassifier(), mlpParams, n_iter=n_iter, n_jobs=2), RandomizedSearchCV(PassiveAggressiveClassifier(), passiveAggParams, n_iter=n_iter, n_jobs=2), GridSearchCV(SGDClassifier(), sgdParams), GridSearchCV(BaggingClassifier(), baggingParams), RandomizedSearchCV(ExtraTreesClassifier(), extraTreesParams, n_iter=n_iter,
def _make_estimator(num_kernels, random_state):
    """Build a two-step pipeline: Rocket transform followed by RidgeClassifierCV.

    :param num_kernels: forwarded to Rocket as ``num_kernels``
    :param random_state: forwarded to Rocket as ``random_state``
    :return: the (unfitted) pipeline
    """
    transformer = Rocket(num_kernels=num_kernels, random_state=random_state)
    # Alpha candidates: 10 log-spaced values between 1e-3 and 1e3.
    classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
    return make_pipeline(transformer, classifier)
def _fit(self, X, y):
    """Fit a pipeline made of a Rocket-family transformer and RidgeClassifierCV.

    The transformer is chosen by ``self.rocket_transform`` ("rocket",
    "minirocket" or "multirocket"); for the latter two, the multivariate
    variant is used when the data has more than one dimension.

    Parameters
    ----------
    X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
        The training data.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self :
        Reference to self.

    Raises
    ------
    ValueError
        If ``self.rocket_transform`` is not one of the supported names.

    Notes
    -----
    The fitted pipeline is stored in ``self._pipeline``.
    """
    _, n_dims, _ = X.shape
    kind = self.rocket_transform

    # Reject unknown transformer names before reading any other setting.
    if kind not in ("rocket", "minirocket", "multirocket"):
        raise ValueError(
            f"Invalid Rocket transformer: {self.rocket_transform}")

    if kind == "rocket":
        transformer = Rocket(
            num_kernels=self.num_kernels,
            random_state=self.random_state,
            n_jobs=self._threads_to_use,
        )
    elif kind == "minirocket":
        transformer_cls = MiniRocketMultivariate if n_dims > 1 else MiniRocket
        transformer = transformer_cls(
            num_kernels=self.num_kernels,
            max_dilations_per_kernel=self.max_dilations_per_kernel,
            random_state=self.random_state,
            n_jobs=self._threads_to_use,
        )
    else:  # "multirocket"
        transformer_cls = MultiRocketMultivariate if n_dims > 1 else MultiRocket
        transformer = transformer_cls(
            num_kernels=self.num_kernels,
            max_dilations_per_kernel=self.max_dilations_per_kernel,
            n_features_per_kernel=self.n_features_per_kernel,
            random_state=self.random_state,
            n_jobs=self._threads_to_use,
        )

    # Ridge alpha candidates: 10 log-spaced values between 1e-3 and 1e3.
    ridge = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
    self._pipeline = make_pipeline(transformer, ridge)
    self._pipeline.fit(X, y)

    return self
train_time_saste_ridge = [] test_time_saste_ridge = [] # free some memory del train_ds del test_ds print('Executing:', dataset) for _ in range(nb_run_per_dataset): gc.collect() # ----------------- SAST with ridge ----------------------- if f'SASTE-Ridge{dataset}' not in to_skip: clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)) saste = SASTEnsemble(cand_length_list=combination_list, nb_inst_per_class=nb_inst_per_class, random_state=None, classifier=clf, n_jobs=-1) train_start = time.time() saste.fit(X_train, y_train) train_time_saste_ridge.append(time.time() - train_start) test_start = time.time() acc = saste.score(X_test, y_test)
assert len(ridge_cv.coef_.shape) == 1 assert type(ridge_cv.intercept_) == np.float64 cv = KFold(5) ridge_cv.set_params(cv=cv) ridge_cv.fit(filter_(X_diabetes), y_diabetes) ridge_cv.predict(filter_(X_diabetes)) assert len(ridge_cv.coef_.shape) == 1 assert type(ridge_cv.intercept_) == np.float64 @pytest.mark.parametrize( "ridge, make_dataset", [(RidgeCV(store_cv_values=False), make_regression), (RidgeClassifierCV(store_cv_values=False), make_classification)]) def test_ridge_gcv_cv_values_not_stored(ridge, make_dataset): # Check that `cv_values_` is not stored when store_cv_values is False X, y = make_dataset(n_samples=6, random_state=42) ridge.fit(X, y) assert not hasattr(ridge, "cv_values_") @pytest.mark.parametrize("ridge, make_dataset", [(RidgeCV(), make_regression), (RidgeClassifierCV(), make_classification)]) @pytest.mark.parametrize("cv", [None, 3]) def test_ridge_best_score(ridge, make_dataset, cv): # check that the best_score_ is store X, y = make_dataset(n_samples=6, random_state=42) ridge.set_params(store_cv_values=False, cv=cv)
def __init__(self, loss_function='logistic', l2_regularization=None,
             weight_estimator='lr', smoothing=True, clip_max_value=-1,
             kernel_type='rbf', bandwidth=1):
    """
    Select a particular type of importance-weighted classifier.

    Parameters
    ----------
    loss_function : str
        loss function for weighted classifier; accepted values are
        'lr'/'logr'/'logistic', 'squared'/'qd'/'quadratic' and
        'hinge'/'linsvm'/'linsvc' (def: 'logistic')
    l2_regularization : float
        l2-regularization parameter value; when falsy, the logistic and
        quadratic models use their cross-validated variants instead
        (def: None)
    weight_estimator : str
        importance weight estimator, options: 'lr', 'nn', 'rg', 'kmm', 'kde'
        (def: 'lr')
    smoothing : bool
        whether to apply Laplace smoothing to the nearest-neighbour
        importance-weight estimator (def: True)
    clip_max_value : float
        maximum allowable importance-weight value; if set to -1, then
        the weights are not clipped (def: -1)
    kernel_type : str
        what type of kernel to use for kernel density estimation or kernel
        mean matching, options: 'diste', 'rbf' (def: 'rbf')
    bandwidth : float
        kernel bandwidth parameter value for kernel-based weight
        estimators (def: 1)

    Returns
    -------
    None

    Raises
    ------
    NotImplementedError
        If the loss function is not one of the accepted values.

    """
    # Store the configuration under the attribute names used by the class.
    self.loss = loss_function
    self.l2 = l2_regularization
    self.iwe = weight_estimator
    self.smoothing = smoothing
    self.clip = clip_max_value
    self.kernel_type = kernel_type
    self.bandwidth = bandwidth

    # A truthy regularization value selects the fixed-penalty model;
    # otherwise the penalty is chosen by 5-fold cross-validation.
    fixed_penalty = bool(l2_regularization)

    # Initialize the untrained classifier matching the loss function
    if self.loss in ('lr', 'logr', 'logistic'):
        # Logistic regression model
        self.clf = (LogisticRegression(C=self.l2, solver='lbfgs')
                    if fixed_penalty
                    else LogisticRegressionCV(cv=5, solver='lbfgs'))
    elif self.loss in ('squared', 'qd', 'quadratic'):
        # Least-squares model
        self.clf = (RidgeClassifier(alpha=self.l2)
                    if fixed_penalty
                    else RidgeClassifierCV(cv=5))
    elif self.loss in ('hinge', 'linsvm', 'linsvc'):
        # Linear support vector machine
        self.clf = LinearSVC()
    else:
        # Other loss functions are not implemented
        raise NotImplementedError('Loss function not implemented.')

    # The model has not been trained yet
    self.is_trained = False

    # Importance weights start out empty
    self.iw = []
# Split dataset to 8:2 X_train, X_test, Y_train ,Y_test = train_test_split(X, y1, test_size=0.3) clf_rf = RandomForestClassifier().fit(X_train, Y_train) print('==== RandomForest ====') print(clf_rf.score(X_train, Y_train) ) print(clf_rf.score(X_test, Y_test) ) print('-' * 30) clf_et = ExtraTreesClassifier().fit(X_train, Y_train) print('==== ExtraTrees ====') print(clf_et.score(X_train, Y_train) ) print(clf_et.score(X_test, Y_test) ) print('-' * 30) clf_rl = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X_train, Y_train) print('==== Ridge ====') print(clf_rl.score(X_train, Y_train) ) print(clf_rl.score(X_test, Y_test) ) print('-' * 30) clf_svm = SVC().fit(X_train, Y_train) print('==== SVC ====') print(clf_svm.score(X_train, Y_train) ) print(clf_svm.score(X_test, Y_test) ) print('-' * 30) Y_pred = clf_rl.predict(X_test) wrong_case = []
LogisticRegression(multi_class="multinomial", solver="newton-cg", max_iter=500), "MultinomialLogisticRegressionAudit") build_audit(LogisticRegressionCV(multi_class="ovr"), "OvRLogisticRegressionAudit") build_audit( BaggingClassifier(LogisticRegression(), random_state=13, n_estimators=3, max_features=0.5), "LogisticRegressionEnsembleAudit") build_audit(GaussianNB(), "NaiveBayesAudit") build_audit(OneVsRestClassifier(LogisticRegression()), "OneVsRestAudit") build_audit(RandomForestClassifier(random_state=13, min_samples_leaf=3), "RandomForestAudit", flat=True) build_audit(RidgeClassifierCV(), "RidgeAudit", with_proba=False) build_audit( BaggingClassifier(RidgeClassifier(random_state=13), random_state=13, n_estimators=3, max_features=0.5), "RidgeEnsembleAudit") build_audit(SVC(), "SVCAudit", with_proba=False) build_audit( VotingClassifier([("dt", DecisionTreeClassifier(random_state=13)), ("nb", GaussianNB()), ("lr", LogisticRegression())], voting="soft", weights=[3, 1, 2]), "VotingEnsembleAudit") build_audit(OptimalXGBClassifier(objective="binary:logistic", ntree_limit=71, random_state=13), "XGBAudit",
data_pix, spacial_pix, data, spacial_data = silly_gen(denoise=True) # mb = MultiLabelBinarizer() # spacial_pix_L = spacial_pix.astype('int') # spacial_pix_L = spacial_pix_L.tolist() # spacial_pix = mb.fit_transform(spacial_pix_L) indices = np.random.permutation(data_pix.shape[0]) training_idx, test_idx = indices[:1900], indices[1900:] X_train, X_test = data_pix[training_idx, :], data_pix[test_idx, :] y_train, y_test = spacial_pix[training_idx], spacial_pix[test_idx] # X_train, X_test, y_train, y_test = train_test_split(data_pix, spacial_pix, test_size=.23, random_state=seed) est_l1 = [('etr', ExtraTreesClassifier(n_jobs=1)), ('rfr', RandomForestClassifier(n_jobs=1)), ('mlp', MLPClassifier(tol=1e-4)), ('svc', SVC(tol=1e-4, degree=9)), ('rdc', RidgeClassifierCV()), ('gbc', GradientBoostingClassifier()), ('ada', AdaBoostClassifier()), ('svc', SVC(tol=1e-4, degree=7, kernel='linear')), ('bag', BaggingClassifier(n_jobs=1))] ests_1 = { 'case-1': est_l1, # 'case-2': est_l1, # 'case-3': est_l1, # 'case-4': est_l1 } r = uniform(0, 30) d = randint(2, 10) f = randint(100, 200) e = uniform(0, 3)