Example #1
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    data = pd.read_csv('wine.data', header=None)
    x, y = data.iloc[:, 1:], data[0]
    x = MinMaxScaler().fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        random_state=1,
                                                        train_size=0.7)

    lr = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), cv=3)
    lr.fit(x_train, y_train.ravel())
    print('alpha = %.2f' % lr.alpha_)
    y_train_pred = lr.predict(x_train)
    y_test_pred = lr.predict(x_test)
    print('Ridge classifier training accuracy:', accuracy_score(y_train, y_train_pred))
    print('Ridge classifier test accuracy:', accuracy_score(y_test, y_test_pred))

    rf = RandomForestClassifier(n_estimators=100,
                                max_depth=8,
                                min_samples_split=5,
                                oob_score=True)
    rf.fit(x_train, y_train.ravel())
    print(u'OOB Score=%.5f' % rf.oob_score_)
    y_train_pred = rf.predict(x_train)
    y_test_pred = rf.predict(x_test)
Example #2
        assert flt_features == (all_features - {f})
        return True
    return False


@pytest.mark.parametrize(['clf'], [
    [LogisticRegression(random_state=42)],
    [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')],
    [LogisticRegression(random_state=42, fit_intercept=False)],
    [LogisticRegressionCV(random_state=42)],
    [SGDClassifier(**SGD_KWARGS)],
    [SGDClassifier(loss='log', **SGD_KWARGS)],
    [PassiveAggressiveClassifier(random_state=42)],
    [Perceptron(random_state=42)],
    [RidgeClassifier(random_state=42)],
    [RidgeClassifierCV()],
    [LinearSVC(random_state=42)],
    [OneVsRestClassifier(LogisticRegression(random_state=42))],
])
def test_explain_linear(newsgroups_train, clf):
    assert_multiclass_linear_classifier_explained(newsgroups_train, clf,
                                                  explain_prediction)
    if isinstance(clf, OneVsRestClassifier):
        assert_multiclass_linear_classifier_explained(
            newsgroups_train, clf, explain_prediction_sklearn)


@pytest.mark.parametrize(['clf'], [
    [LogisticRegression(random_state=42)],
    [LogisticRegressionCV(random_state=42)],
    [OneVsRestClassifier(LogisticRegression(random_state=42))],
Example #3
# These are default parameters;
# we initialize an instance here just to suppress warnings
LogisticCV = LogisticRegressionCV(solver='lbfgs',
                                  multi_class='auto',
                                  n_jobs=-1,
                                  max_iter=200,
                                  Cs=(0.0001, 0.001, 0.01, 0.1),
                                  cv=5,
                                  class_weight='balanced')
Logistic = LogisticRegression(solver='lbfgs',
                              multi_class='auto',
                              n_jobs=-1,
                              max_iter=200,
                              class_weight='balanced')
Ridge = RidgeClassifierCV(alphas=(0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0))
LDA = LinearDiscriminantAnalysis()
QDA = QuadraticDiscriminantAnalysis(reg_param=0.0001)
RBF = SVC(kernel='rbf',
          gamma='scale',
          C=0.1,
          class_weight='balanced',
          cache_size=1000,
          probability=True)
SVM_Poly2 = SVC(kernel='poly',
                degree=2,
                gamma='scale',
                C=0.1,
                class_weight='balanced',
                cache_size=1000,
                probability=True)
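The estimators above are only configured, never fitted, in this snippet. A minimal sketch of how such a collection might be compared (the synthetic data and loop below are illustrative and assume the estimator definitions above are in scope):

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=300, n_features=20, random_state=0)  # toy data
for name, est in [('LogisticCV', LogisticCV), ('Logistic', Logistic),
                  ('Ridge', Ridge), ('LDA', LDA), ('QDA', QDA),
                  ('RBF', RBF), ('SVM_Poly2', SVM_Poly2)]:
    scores = cross_val_score(est, X, y, cv=5)
    print('%-10s %.3f +/- %.3f' % (name, scores.mean(), scores.std()))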
Example #4
def build_classifiers(exclude, scale, feature_selection, nCols):
    '''
    Input:
    - exclude: list of names of classifiers to exclude from the analysis
    - scale: True or False. Scale data before fitting classifier
    - feature_selection: True or False. Run feature selection before
    fitting classifier
    - nCols: Number of columns in input dataset to classifiers

    Output:
    Dictionary with classifier name as keys.
    - 'clf': Classifier object
    - 'parameters': Dictionary with parameters of 'clf' as keys
    '''
    classifiers = collections.OrderedDict()

    '''Neural Networks'''
    if 'Multilayer Perceptron' not in exclude:
        classifiers['Multilayer Perceptron'] = {
            'clf': MLPClassifier(),
            'parameters': {'hidden_layer_sizes': [(100, 50), (50, 25)],
                           'max_iter': [500]}
        }

    '''Neighbor Methods'''
    if 'Nearest Neighbors' not in exclude:
        classifiers['Nearest Neighbors'] = {
            'clf': KNeighborsClassifier(),
            'parameters': {'n_neighbors': [1, 5, 10, 20]}}

    if 'Radius Neighbors' not in exclude:
        classifiers['Radius Neighbors'] = {
            'clf': RadiusNeighborsClassifier(outlier_label=0),
            'parameters': {}}

    '''SVM'''
    if 'SVM' not in exclude:
        classifiers['SVM'] = {
            'clf': SVC(C=1, probability=True, cache_size=10000,
                       class_weight='balanced'),
            'parameters': {'kernel': ['rbf', 'poly'],
                           'C': [0.01, 0.1, 1]}}

    if 'Linear SVM' not in exclude:
        classifiers['Linear SVM'] = {
            'clf': LinearSVC(dual=False, class_weight='balanced'),
            'parameters': {'C': [0.01, 0.1, 1],
                           'penalty': ['l1', 'l2']}}

    '''Tree Methods'''
    if 'Decision Tree' not in exclude:
        classifiers['Decision Tree'] = {
            'clf': DecisionTreeClassifier(max_depth=None,
                                          max_features='auto'),
            'parameters': {}}

    if 'Random Forest' not in exclude:
        classifiers['Random Forest'] = {
            'clf': RandomForestClassifier(max_depth=None,
                                          n_estimators=10,
                                          max_features='auto'),
            'parameters': {'n_estimators': list(range(5, 20))}}

    '''Ensemble Methods'''
    if 'Ada Boost' not in exclude:
        classifiers['Ada Boost'] = {
            'clf': AdaBoostClassifier(),
            'parameters': {}}

    if 'Bagging' not in exclude:
        classifiers['Bagging'] = {
            'clf': BaggingClassifier(),
            'parameters': {}}

    if 'Gradient Boost' not in exclude:
        classifiers['Gradient Boost'] = {
            'clf': GradientBoostingClassifier(),
            'parameters': {}}

    ''' Linear Models '''
    if 'Logistic Regression' not in exclude:
        classifiers['Logistic Regression'] = {
            'clf': LogisticRegression(fit_intercept=True, solver='lbfgs',
                                      penalty='l2'),
            'parameters': {'C': [0.001, 0.1, 1]}}

    if 'Ridge Classification' not in exclude:
        classifiers['Ridge Classification'] = {
            'clf': RidgeClassifier(fit_intercept=True),
            'parameters': {}}

    if 'Ridge Classification CV' not in exclude:
        classifiers['Ridge Classification CV'] = {
            'clf': RidgeClassifierCV(fit_intercept=True),
            'parameters': {}}

    if 'Passive Aggressive' not in exclude:
        classifiers['Passive Aggressive Classifier'] = {
            'clf': PassiveAggressiveClassifier(),
            'parameters': {}}

    if 'Perceptron' not in exclude:
        classifiers['Perceptron'] = {
            'clf': Perceptron(),
            'parameters': {}}

    '''Naive Bayes'''
    if 'Gaussian Naive Bayes' not in exclude:
        classifiers['Gaussian Naive Bayes'] = {
            'clf': GaussianNB(),
            'parameters': {}}

    if 'Bernoulli Naive Bayes' not in exclude:
        classifiers['Bernoulli Naive Bayes'] = {
            'clf': BernoulliNB(),
            'parameters': {}}

    '''Discriminant Analysis'''
    if 'LDA' not in exclude:
        classifiers['LDA'] = {
            'clf': LinearDiscriminantAnalysis(),
            'parameters': {}}

    if 'QDA' not in exclude:
        classifiers['QDA'] = {
            'clf': QuadraticDiscriminantAnalysis(),
            'parameters': {}}

    if 'Gaussian Process' not in exclude:
        classifiers['Gaussian Process'] = {
            'clf': GaussianProcessClassifier(),
            'parameters': {}}


    # classifiers['Voting'] = {}

    def name(x):
        """
        :param x: A classifier entry (dict whose 'clf' is a pipeline)
        :return: The class name of the pipeline's final estimator, lower-cased
        """
        return x['clf']._final_estimator.__class__.__name__.lower()

    for key, val in classifiers.items():
        if not scale and not feature_selection:
            break
        steps = []
        if scale:
            steps.append(StandardScaler())
        if feature_selection:
            steps.append(SelectKBest(f_regression, k='all'))
        steps.append(classifiers[key]['clf'])
        classifiers[key]['clf'] = make_pipeline(*steps)
        # Reorganize parameter list for grid search
        new_dict = {}
        for keyp in classifiers[key]['parameters']:
            new_dict[name(classifiers[key]) + '__' +
                     keyp] = classifiers[key]['parameters'][keyp]
        classifiers[key]['parameters'] = new_dict
        if nCols > 5 and feature_selection:
            classifiers[key]['parameters']['selectkbest__k'] = np.linspace(
                np.round(nCols / 5), nCols, 5).astype('int').tolist()

    return classifiers
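A minimal usage sketch for build_classifiers (it assumes the module-level imports of the original file, e.g. numpy and the scikit-learn estimators, are in scope); here only the RidgeClassifierCV entry is grid-searched on synthetic data:

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
clfs = build_classifiers(exclude=[], scale=True, feature_selection=True,
                         nCols=X.shape[1])
entry = clfs['Ridge Classification CV']
search = GridSearchCV(entry['clf'], entry['parameters'], cv=3)
search.fit(X, y)
print('best params:', search.best_params_, 'score:', round(search.best_score_, 3))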
Example #5
y_pred = classifier.predict(X_test_mean)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_median,y_train)
y_pred = classifier.predict(X_test_median)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_mode,y_train)
y_pred = classifier.predict(X_test_mode)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

from sklearn.linear_model import RidgeClassifierCV
classifier = RidgeClassifierCV()
classifier.fit(X_train_0,y_train)
y_pred = classifier.predict(X_test_0)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_mean,y_train)
y_pred = classifier.predict(X_test_mean)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))

classifier.fit(X_train_median,y_train)
y_pred = classifier.predict(X_test_median)
y_pred = np.round(y_pred).flatten()
print(accuracy_score(y_test, y_pred))
Example #6
               verbose=True),
 "shapelet":
 make_pipeline(TruncationTransformer(lower=1000),
               ContractedShapeletTransform(
                   time_contract_in_mins=10,
                   num_candidates_to_sample_per_case=10,
                   verbose=2,
                   random_state=1),
               RandomForestClassifier(n_estimators=100,
                                      n_jobs=-1,
                                      random_state=1),
               verbose=True),
 "rocket":
 make_pipeline(TruncationTransformer(lower=MAX_LENGTH),
               Rocket(random_state=1),
               RidgeClassifierCV(alphas=np.logspace(-3, 3, 10),
                                 normalize=True),
               verbose=True),
 "mr-seql":
 make_pipeline(TruncationTransformer(lower=MAX_LENGTH),
               MrSEQLClassifier(symrep=['sax', 'sfa']),
               verbose=True),
 "full_features":
 make_pipeline(
     TruncationTransformer(lower=MAX_LENGTH),
     ColumnEnsembleClassifier([
         ("features_0",
          make_pipeline(TSFreshFeatureExtractor(
              default_fc_parameters="efficient",
              show_warnings=False,
              n_jobs=-1),
                        RandomForestClassifier(n_jobs=-1, random_state=1),
Example #7
class PipelineComponents:
    """Key-value pairs used by the ExtendedPipeline class to determine which components to use in the pipeline"""
    models = {
        'XGBClassifier':
        XGBClassifier(),
        'ElasticNetClassifier':
        SGDClassifier(penalty="elasticnet",
                      l1_ratio=0.5,
                      loss='log',
                      tol=0.0001),
        'RidgeClassifierCV':
        RidgeClassifierCV(),
        'Perceptron':
        Perceptron(max_iter=2500, penalty='l2'),
        'PassiveAggressiveClassifier':
        PassiveAggressiveClassifier(),
        'KNeighborsClassifier':
        KNeighborsClassifier(n_neighbors=50),
        'RandomForestClassifier':
        RandomForestClassifier(n_estimators=1000),
        'LinearSVC':
        LinearSVC(dual=False, penalty='l2', tol=1e-3),
        'SGDClassifier':
        SGDClassifier(alpha=.0001, penalty='l2'),
        'SGDClassifier_elasticnet':
        SGDClassifier(alpha=.0001, penalty="elasticnet"),
        'NearestCentroid':
        NearestCentroid(),
        'MultinomialNB':
        MultinomialNB(alpha=.01),
        'BernoulliNB':
        BernoulliNB(alpha=.01),
        'ComplementNB':
        ComplementNB(alpha=.1),
        'SVC':
        SVC(),
        'LogisticRegression':
        LogisticRegression(solver='lbfgs', max_iter=5000, penalty='l2'),
        'LogisticRegressionCV':
        LogisticRegressionCV(max_iter=5000, n_jobs=-1)
    }

    vectorizers = {
        'hashing':
        HashingVectorizer(tokenizer=dummy, preprocessor=dummy),
        'count':
        CountVectorizer(min_df=5,
                        tokenizer=dummy,
                        preprocessor=dummy,
                        max_df=0.5,
                        ngram_range=(1, 2),
                        max_features=1000),
        #'dummy':GloveTokenize()
    }

    stemmers = {
        'porter': StemTokenizer(),
        'snowball': SnowballTokenizer(),
        'lemma': LemmaTokenizer(),
        # interchangeable
        "No Stemmer": dummy,
        None: dummy
    }
    transformers = {
        'tfidf': TfidfTransformer(norm='l2', use_idf=True),
        'minmax': MinMaxScaler(),
        'normal': Normalizer(norm='l2'),
        'robust': RobustScaler(),
        'max': MaxAbsScaler(),
        None: 'passthrough',
        'passthrough': 'passthrough'
    }
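The ExtendedPipeline class that consumes these lookups is not shown in this example. A hedged sketch of the kind of assembly it presumably performs (the helper below is illustrative, not the project's actual code):

from sklearn.pipeline import Pipeline

def build_text_pipeline(model='RidgeClassifierCV', vectorizer='count',
                        transformer='tfidf'):
    """Assemble vectorizer -> transformer -> classifier from the lookup tables."""
    return Pipeline([
        ('vectorize', PipelineComponents.vectorizers[vectorizer]),
        ('transform', PipelineComponents.transformers[transformer]),
        ('classify', PipelineComponents.models[model]),
    ])

# e.g. build_text_pipeline().fit(train_texts, train_labels)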
Example #8
# from sklearn.neighbors import NearestCentroid
from sklearn.cluster import KMeans

# from sklearn.linear_model import SGDClassifier
# from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import LassoCV
# from sklearn.svm import LinearSVC
# from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
# from sklearn.linear_model import Perceptron
# from sklearn.linear_model import PassiveAggressiveClassifier
# from sklearn.utils.extmath import density

models = [
    ('ridge', RidgeClassifierCV(normalize=True)),
    ('lasso', LassoCV()),
]

parameters = {
    # RidgeClassifierCV and LassoCV are tuned via `alphas` (a sequence of
    # candidate values), not `C`
    'ridge__alphas': [(0.3, 1, 3, 10)],
    'lasso__alphas': [(1, 3)],
}


def train_data(X, y, refit=False, test_size=0.1):
    print("\n[ start training: {} ]".format(datetime.now()))
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
Example #9
import pickle
import re
import sys

import pandas
from sklearn.linear_model import RidgeClassifierCV
from sklearn.preprocessing import MultiLabelBinarizer

input_df = pandas.read_csv(sys.argv[1])
labels = input_df['FEATURE_LABELS'].to_numpy()
features = input_df.drop(columns=["FEATURE_LABELS"]).to_numpy()


def string_to_vector(s):
    try:
        cleaned_s = s.replace(" ", "")
    except AttributeError:
        # non-string (e.g. missing) labels become an empty label set
        return []
    return cleaned_s.split(",")


labels = list(map(string_to_vector, labels))

y = MultiLabelBinarizer()
clf = RidgeClassifierCV()

Y = y.fit_transform(labels)
# RidgeClassifierCV.score takes no `average` argument; for multilabel
# targets it reports subset accuracy.
score = clf.fit(features, Y).score(features, Y)

f = open("ridge_clf.txt", 'w')
f.write(pickle.dumps(clf))
f.close()

f = open("binarizer.txt", 'w')
f.write(pickle.dumps(y))
f.close()
Example #10
import pandas as pd
from sklearn.preprocessing import StandardScaler

feature_cols = ['size', 'pole', 'mean', 'stddev', 'b_mean', 'g_mean', 'r_mean',
                'b_stddev', 'g_stddev', 'r_stddev', 'square', 'ratiowh',
                'ratioarea', 'approxlen', 'numangle', 'numangle90', 'numangle70']
X = data[feature_cols]

scaler = StandardScaler()
X = scaler.fit_transform(X)  # Features

y = data.label # Target variable

# from sklearn.model_selection import train_test_split
# X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
X_train = X
y_train = y


from sklearn.linear_model import RidgeClassifierCV

# instantiate the model (balanced class weights, small alpha grid)
ridge_clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1], class_weight='balanced')

# fit the model with data
model = ridge_clf.fit(X_train, y_train)


from sklearn import metrics



datatest = pd.read_csv("./feature_810_all.csv")
datatest = datatest.dropna()
Example #11
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV, SGDClassifier, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd
# list of tuples: the first element is a string, the second is an object
estimators = [
    ('LogisticRegression', LogisticRegression()),
    ('RidgeClassifier', RidgeClassifier()),
    ('RidgeClassifierCV', RidgeClassifierCV()),
    ('RandomForestClassifier', RandomForestClassifier()),
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
]
from sklearn.model_selection import train_test_split

data = pd.read_csv('HR_Data.csv')

# Convert all nominal to numeric.
data['sales'].replace([
    'sales', 'accounting', 'hr', 'technical', 'support', 'management', 'IT',
    'product_mng', 'marketing', 'RandD'
], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                      inplace=True)
data['salary'].replace(['low', 'medium', 'high'], [0, 1, 2], inplace=True)
###########################################
# Train & Test Data

data_X = data.copy()
data_y = data_X['left']
del data_X['left']

train_X, test_X, train_y, test_y = train_test_split(data_X,
                                                    data_y,
                                                    test_size=0.2,
                                                    random_state=1234)
Example #12
def set_classifier(cls, resampleId):
    """
    Basic way of determining the classifier to build. To differentiate settings just and another elif. So, for example, if
    you wanted tuned TSF, you just pass TuneTSF and set up the tuning mechanism in the elif.
    This may well get superceded, it is just how e have always done it
    :param cls: String indicating which classifier you want
    :return: A classifier.

    """
    if cls.lower() == "pf":
        return pf.ProximityForest(random_state=resampleId)
    elif cls.lower() == "pt":
        return pf.ProximityTree(random_state=resampleId)
    elif cls.lower() == "ps":
        return pf.ProximityStump(random_state=resampleId)
    elif cls.lower() == "rise":
        return fb.RandomIntervalSpectralForest(random_state=resampleId)
    elif cls.lower() == "tsf":
        return ib.TimeSeriesForest(random_state=resampleId)
    elif cls.lower() == "cif":
        return CanonicalIntervalForest(random_state=resampleId)
    elif cls.lower() == "boss":
        return BOSSEnsemble(random_state=resampleId)
    elif cls.lower() == "cboss":
        return ContractableBOSS(random_state=resampleId)
    elif cls.lower() == "tde":
        return TemporalDictionaryEnsemble(random_state=resampleId)
    elif cls.lower() == "st":
        return st.ShapeletTransformClassifier(time_contract_in_mins=1500)
    elif cls.lower() == "dtwcv":
        return nn.KNeighborsTimeSeriesClassifier(metric="dtwcv")
    elif cls.lower() == "ee" or cls.lower() == "elasticensemble":
        return dist.ElasticEnsemble()
    elif cls.lower() == "tsfcomposite":
        # It defaults to TSF
        return ensemble.TimeSeriesForestClassifier()
    elif cls.lower() == "risecomposite":
        steps = [
            ("segment", RandomIntervalSegmenter(n_intervals=1, min_length=5)),
            (
                "transform",
                FeatureUnion([
                    (
                        "acf",
                        make_row_transformer(
                            FunctionTransformer(func=acf_coefs,
                                                validate=False)),
                    ),
                    (
                        "ps",
                        make_row_transformer(
                            FunctionTransformer(func=powerspectrum,
                                                validate=False)),
                    ),
                ]),
            ),
            ("tabularise", Tabularizer()),
            ("clf", DecisionTreeClassifier()),
        ]
        base_estimator = Pipeline(steps)
        return ensemble.TimeSeriesForestClassifier(estimator=base_estimator,
                                                   n_estimators=100)
    elif cls.lower() == "rocket":
        rocket_pipeline = make_pipeline(
            Rocket(random_state=resampleId),
            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
        )
        return rocket_pipeline
    else:
        raise Exception("UNKNOWN CLASSIFIER")
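A minimal sketch of how such a factory is typically driven; the data loading is assumed (X_train, y_train, X_test, y_test would be sktime-style nested DataFrames or 3D arrays), so this is illustration rather than the original benchmarking script:

# Illustrative driver, not part of the original snippet
clf = set_classifier("rocket", resampleId=0)
clf.fit(X_train, y_train)
print("test accuracy:", clf.score(X_test, y_test))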
Example #13
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)

lr = LogisticRegression(n_jobs=-1)

rf = RandomForestClassifier(n_jobs=-1)
ada = AdaBoostClassifier()
lgbm = LGBMClassifier()
xgb = XGBClassifier(n_jobs=-1)
cat = CatBoostClassifier(verbose=False)
etc = ExtraTreesClassifier()
gbc = GradientBoostingClassifier()
nb = GaussianNB()
mnb = MultinomialNB()
cnb1 = ComplementNB()
bnb = BernoulliNB()
cnb2 = CategoricalNB()
qda = QuadraticDiscriminantAnalysis()
lda = LinearDiscriminantAnalysis()
rccv = RidgeClassifierCV()
rc = RidgeClassifier()
pf = PolynomialFeatures(interaction_only=False, degree=1, include_bias=False)
sc = StandardScaler()

classifiers = [rc, rccv, lda, cat, lgbm, xgb, ada, rf, lr, etc, gbc]
for clf in classifiers:
    #pipe = Pipeline([('impute',si),('extract features',pf), ('scale', sc), ('classify', clf)]) #, ('extract features',pf), ('scale', sc)
    pipe = Pipeline([('classify', clf)])  #, ('std',sc)
    cvs = cross_val_score(pipe, X[['a', 'b', 'c']], y, scoring='roc_auc', cv=3, n_jobs=-1)
    print(np.std(cvs), np.mean(cvs), clf)
    print()
Example #14
def ridge_classifier_with_cross_validation(x_train, y_train):
    model = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1])
    model.fit(x_train, y_train)
    weights = model.coef_, model.intercept_
    score = model.score(x_train, y_train)
    return weights, score
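An illustrative call on a toy dataset (the iris data below is only an example, not from the original code):

from sklearn.datasets import load_iris
from sklearn.linear_model import RidgeClassifierCV  # needed by the function above

X, y = load_iris(return_X_y=True)
(coefs, intercepts), score = ridge_classifier_with_cross_validation(X, y)
print(coefs.shape, score)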
Example #15
from sklearn.svm import SVR, LinearSVC

try:
    from sklearn.metrics import check_scoring
except ImportError:
    # for scikit-learn 0.18 and 0.19
    from sklearn.metrics.scorer import check_scoring

# Regression
ridge = RidgeCV()
svr = SVR(kernel='linear')
# Classification
svc = LinearSVC()
logistic_l1 = LogisticRegression(penalty='l1')
logistic_l2 = LogisticRegression(penalty='l2')
ridge_classifier = RidgeClassifierCV()
random_forest = RandomForestClassifier()

regressors = {'ridge': (ridge, []),
              'svr': (svr, 'C')}
classifiers = {'svc': (svc, 'C'),
               'logistic_l1': (logistic_l1, 'C'),
               'logistic_l2': (logistic_l2, 'C'),
               'ridge_classifier': (ridge_classifier, [])}
# Create a test dataset
rng = np.random.RandomState(0)
X = rng.rand(100, 10)
# Create different targets
y_regression = rng.rand(100)
y_classification = np.hstack([[-1] * 50, [1] * 50])
y_classification_str = np.hstack([['face'] * 50, ['house'] * 50])
Example #16
train_features_reduced = pd.DataFrame(data, columns=columns)
#Reduce validation features
data = model.transform(val_set)
val_set_reduced = pd.DataFrame(data, columns=columns)

# # **Selecting classifiers**

# In[ ]:

import warnings
warnings.filterwarnings("ignore")
#InteractiveShell.ast_node_interactivity = 'last_expr'

classifiers = {
    'LogReg': LogisticRegression(),
    'RidgeClassifier': RidgeClassifierCV(),
    'KNN': KNeighborsClassifier(),
    'SVC': SVC(gamma='auto'),
    'GaussianNB': GaussianNB(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'AdaBoost': AdaBoostClassifier(n_estimators=100),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100),
    'BaggingClassifier': BaggingClassifier(n_estimators=100),
    'XGB': XGBClassifier(),
    'LDA': LinearDiscriminantAnalysis()
}

scoring = {
    'accuracy': make_scorer(accuracy_score),
Example #17
# The logistic regression
from sklearn.linear_model import LogisticRegression, RidgeClassifier, \
    RidgeClassifierCV
logistic = LogisticRegression(C=1., penalty="l1")
logistic_50 = LogisticRegression(C=50., penalty="l1")
logistic_l2 = LogisticRegression(C=1., penalty="l2")

logistic_cv = GridSearchCV(LogisticRegression(C=1., penalty="l1"),
                           param_grid={'C': [.1, .5, 1., 5., 10., 50., 100.]},
                           scoring='f1')
logistic_l2_cv = GridSearchCV(LogisticRegression(C=1., penalty="l2"),
                              param_grid={'C': [.1, .5, 1., 5., 10., 50., 100.]},
                              scoring='f1')

ridge = RidgeClassifier()
ridge_cv = RidgeClassifierCV()


# Make a data splitting object for cross validation
from sklearn.cross_validation import LeaveOneLabelOut, cross_val_score
cv = LeaveOneLabelOut(session_labels)

classifiers = {'SVC': svm,
               'SVC cv': svm_cv,
               'log l1': logistic,
               'log l1 50': logistic_50,
               'log l1 cv': logistic_cv,
               'log l2': logistic_l2,
               'log l2 cv': logistic_l2_cv,
               'ridge': ridge,
               'ridge cv': ridge_cv}
Example #18
X = data.loc[:, data.columns != 'Class'].values
y = data.loc[:, data.columns == 'Class'].values.reshape((len(X), ))

# train-test split
split = StratifiedShuffleSplit(test_size=0.1, random_state=random_seed)
for train_index, test_index in split.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# initialize sampling models
smt = SMOTETomek(random_state=random_seed)
X_res, y_res = smt.fit_sample(X_train, y_train)

# Ridge classifier (regularization strength chosen by cross-validation)
ridge = RidgeClassifierCV(alphas=np.geomspace(1e-5, 10, 100), 
                          cv=10, 
                          class_weight=None)
ridge.fit(X_res, y_res)
print('\n', classification_report_imbalanced(y_test, ridge.predict(X_test), 
                                             target_names=['normal', 'fraud']))

# SVM
svm = LinearSVC(dual=False, verbose=1, 
                random_state=random_seed, 
                max_iter=int(1e6), 
                class_weight=None)
svm.fit(X_res, y_res)
print('\n', classification_report_imbalanced(y_test, svm.predict(X_test), 
                                             target_names=['normal', 'fraud']))

# Random Forest
Example #19
prec_knn = precision_score(test_group, pred_knn, average=None)
print("The accuracy of KNN Classier : " + str(acc_knn))
for i, j in zip(np.nditer(prec_knn), groups):
    print("The precision of KNN Classifier for ALL subtype " + j + ": " +
          str(round(float(i), 3)))
print("\n\n\n")
print(
    "##############################################################################"
)

##############################################################################
##############################################################################
##############################################################################
##############################################################################

ridge = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 0.5, 1.0],
                          cv=10).fit(z_train_param, train_group)
print("Ridge Classifier score train: " +
      str(ridge.score(z_train_param, train_group)))

pred_ridge = ridge.predict(z_test_param)
print("Ridge Classifier score test: " +
      str(ridge.score(z_test_param, test_group)))

acc_ridge = accuracy_score(test_group, pred_ridge, normalize=True)
prec_ridge = precision_score(test_group, pred_ridge, average=None)

print("The accuracy of Ridge Classifier : " + str(acc_ridge))
for i, j in zip(np.nditer(prec_ridge), groups):
    print("The precision of Ridge Classifier for ALL subtype " + j + ": " +
          str(round(float(i), 3)))

print("\n\n\n")
Example #20
#print(labels)
#print(counts_test)

tfidf_transformer = TfidfTransformer(use_idf=False).fit(counts_train)
X_train_tfidf = tfidf_transformer.transform(counts_train)
X_test_tfidf = tfidf_transformer.transform(counts_test)

clf1 = MultinomialNB()
clf2 = OneVsRestClassifier(svm.SVC(gamma='scale', decision_function_shape='ovo'))
clf3 = svm.LinearSVC(multi_class='ovr', max_iter=3000)
clf4 = OneVsRestClassifier(MLPClassifier())
clf5 = SGDClassifier(n_jobs=7, loss="hinge", penalty="l2", max_iter=3000)
#clf6 = KNeighborsClassifier(n_neighbors=3)
clf7 = OneVsRestClassifier(XGBClassifier(max_depth=10,colsample_bytree=0.9))
#clf7 = XGBClassifier(learning_rate =0.01,n_estimators=5000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.8,colsample_bytree=0.8,reg_alpha=0.005,objective= 'binary:logistic',nthread=4,scale_pos_weight=1,seed=27) 
clf8 = RidgeClassifierCV()
clf = [clf1, clf2, clf3, clf4, clf5, clf7, clf8]
arr = ['MultinomialNB','SVC','LinearSVC','MLPClassifier','SGDClassifier','XGBClassifier','RidgeClassifierCV']
#predictors = [('nb',clf1), ('svc',clf2), ('lsvc',clf3), ('mlp',clf4), ('sgd',clf5), ('xgbc',clf7)]

result = []
maxpred = []
maxp = 0
maxx = 0
predarr = []
for x in range(len(clf)):
    clf[x].fit(counts_train,labels)
    pred=clf[x].predict(counts_test)
    predarr.append(pred)
    print(arr[x])
    #print('predict:',pred)
Example #21
    def __init__(self, **kw_args):

        super(Ridge, self).__init__()
        self.alpha = kw_args.get("alpha", 1)
        # RidgeClassifierCV expects a sequence of candidate alphas
        self.model = OneVsRestClassifier(RidgeClassifierCV(alphas=[self.alpha]))
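RidgeClassifierCV's first argument is alphas, a sequence of candidate regularization strengths, so a standalone sketch of the same one-vs-rest wrapping idea might look like this (the class name and defaults below are illustrative, not from the original project):

from sklearn.linear_model import RidgeClassifierCV
from sklearn.multiclass import OneVsRestClassifier

class OvRRidge:
    """One-vs-rest wrapper around a cross-validated ridge classifier."""

    def __init__(self, alphas=(0.1, 1.0, 10.0)):
        self.model = OneVsRestClassifier(RidgeClassifierCV(alphas=alphas))

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)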
Example #22
nearestCentroidParams = {
    'shrink_threshold': [None] + list(map(lambda x: x / 10., range(1, 10)))
}

svcParams = {
    'C': uniform(scale=10),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': geom(p=.5)
}
dtParams = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_features': ['sqrt', 'log2', None]
}

models1 = [GaussianNB(), LogisticRegressionCV(), RidgeClassifierCV()]

n_iter = 25
print('Making models...', end='', flush=True)
models2 = [
    GridSearchCV(AdaBoostClassifier(), adaBoostParams),
    RandomizedSearchCV(MLPClassifier(), mlpParams, n_iter=n_iter, n_jobs=2),
    RandomizedSearchCV(PassiveAggressiveClassifier(),
                       passiveAggParams,
                       n_iter=n_iter,
                       n_jobs=2),
    GridSearchCV(SGDClassifier(), sgdParams),
    GridSearchCV(BaggingClassifier(), baggingParams),
    RandomizedSearchCV(ExtraTreesClassifier(),
                       extraTreesParams,
                       n_iter=n_iter,
Example #23
def _make_estimator(num_kernels, random_state):
    return make_pipeline(
        Rocket(num_kernels=num_kernels, random_state=random_state),
        RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
    )
Example #24
    def _fit(self, X, y):
        """Build a pipeline containing the Rocket transformer and RidgeClassifierCV.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        _, n_dims, _ = X.shape

        if self.rocket_transform == "rocket":
            rocket = Rocket(
                num_kernels=self.num_kernels,
                random_state=self.random_state,
                n_jobs=self._threads_to_use,
            )
        elif self.rocket_transform == "minirocket":
            if n_dims > 1:
                rocket = MiniRocketMultivariate(
                    num_kernels=self.num_kernels,
                    max_dilations_per_kernel=self.max_dilations_per_kernel,
                    random_state=self.random_state,
                    n_jobs=self._threads_to_use,
                )
            else:
                rocket = MiniRocket(
                    num_kernels=self.num_kernels,
                    max_dilations_per_kernel=self.max_dilations_per_kernel,
                    random_state=self.random_state,
                    n_jobs=self._threads_to_use,
                )
        elif self.rocket_transform == "multirocket":
            if n_dims > 1:
                rocket = MultiRocketMultivariate(
                    num_kernels=self.num_kernels,
                    max_dilations_per_kernel=self.max_dilations_per_kernel,
                    n_features_per_kernel=self.n_features_per_kernel,
                    random_state=self.random_state,
                    n_jobs=self._threads_to_use,
                )
            else:
                rocket = MultiRocket(
                    num_kernels=self.num_kernels,
                    max_dilations_per_kernel=self.max_dilations_per_kernel,
                    n_features_per_kernel=self.n_features_per_kernel,
                    random_state=self.random_state,
                    n_jobs=self._threads_to_use,
                )
        else:
            raise ValueError(
                f"Invalid Rocket transformer: {self.rocket_transform}")

        self._pipeline = rocket_pipeline = make_pipeline(
            rocket,
            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
        )
        rocket_pipeline.fit(X, y)

        return self
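A minimal standalone sketch of the same pattern (a ROCKET transform feeding RidgeClassifierCV), assuming sktime is installed; the import path below matches recent sktime releases and the data is a toy example:

import numpy as np
from sklearn.linear_model import RidgeClassifierCV
from sklearn.pipeline import make_pipeline
from sktime.transformations.panel.rocket import Rocket  # assumed import path

X = np.random.default_rng(0).normal(size=(20, 1, 50))  # (instances, dims, length)
y = np.repeat([0, 1], 10)

pipe = make_pipeline(Rocket(num_kernels=500, random_state=0),
                     RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)))
pipe.fit(X, y)
print(pipe.score(X, y))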
Example #25
            train_time_saste_ridge = []
            test_time_saste_ridge = []

            # free some memory
            del train_ds
            del test_ds

            print('Executing:', dataset)
            for _ in range(nb_run_per_dataset):

                gc.collect()

                # ----------------- SAST with ridge -----------------------
                if f'SASTE-Ridge{dataset}' not in to_skip:

                    clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
                    saste = SASTEnsemble(cand_length_list=combination_list,
                                         nb_inst_per_class=nb_inst_per_class,
                                         random_state=None,
                                         classifier=clf,
                                         n_jobs=-1)

                    train_start = time.time()

                    saste.fit(X_train, y_train)

                    train_time_saste_ridge.append(time.time() - train_start)

                    test_start = time.time()

                    acc = saste.score(X_test, y_test)
Example #26
    assert len(ridge_cv.coef_.shape) == 1
    assert type(ridge_cv.intercept_) == np.float64

    cv = KFold(5)
    ridge_cv.set_params(cv=cv)
    ridge_cv.fit(filter_(X_diabetes), y_diabetes)
    ridge_cv.predict(filter_(X_diabetes))

    assert len(ridge_cv.coef_.shape) == 1
    assert type(ridge_cv.intercept_) == np.float64


@pytest.mark.parametrize(
    "ridge, make_dataset",
    [(RidgeCV(store_cv_values=False), make_regression),
     (RidgeClassifierCV(store_cv_values=False), make_classification)])
def test_ridge_gcv_cv_values_not_stored(ridge, make_dataset):
    # Check that `cv_values_` is not stored when store_cv_values is False
    X, y = make_dataset(n_samples=6, random_state=42)
    ridge.fit(X, y)
    assert not hasattr(ridge, "cv_values_")


@pytest.mark.parametrize("ridge, make_dataset",
                         [(RidgeCV(), make_regression),
                          (RidgeClassifierCV(), make_classification)])
@pytest.mark.parametrize("cv", [None, 3])
def test_ridge_best_score(ridge, make_dataset, cv):
    # check that the best_score_ is stored
    X, y = make_dataset(n_samples=6, random_state=42)
    ridge.set_params(store_cv_values=False, cv=cv)
Example #27
File: iw.py Project: BorgwardtLab/libTLDA
    def __init__(self,
                 loss_function='logistic',
                 l2_regularization=None,
                 weight_estimator='lr',
                 smoothing=True,
                 clip_max_value=-1,
                 kernel_type='rbf',
                 bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        Parameters
        ----------
        loss_function : str
            loss function for weighted classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2_regularization : float
            l2-regularization parameter value; if None, the regularization
            strength is chosen by cross-validation (def: None)
        weight_estimator : str
            importance weight estimator, options: 'lr', 'nn', 'rg', 'kmm',
            'kde' (def: 'lr')
        smoothing : bool
            whether to apply Laplace smoothing to the nearest-neighbour
            importance-weight estimator (def: True)
        clip_max_value : float
            maximum allowable importance-weight value; if set to -1, then the
            weights are not clipped (def: -1)
        kernel_type : str
            what type of kernel to use for kernel density estimation or kernel
            mean matching, options: 'diste', 'rbf' (def: 'rbf')
        bandwidth : float
            kernel bandwidth parameter value for kernel-based weight
            estimators (def: 1)

        Returns
        -------
        None

        """
        self.loss = loss_function
        self.l2 = l2_regularization
        self.iwe = weight_estimator
        self.smoothing = smoothing
        self.clip = clip_max_value
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss in ('lr', 'logr', 'logistic'):

            if l2_regularization:

                # Logistic regression model
                self.clf = LogisticRegression(C=self.l2, solver='lbfgs')

            else:
                # Logistic regression model
                self.clf = LogisticRegressionCV(cv=5, solver='lbfgs')

        elif self.loss in ('squared', 'qd', 'quadratic'):

            if l2_regularization:

                # Least-squares model with fixed regularization
                self.clf = RidgeClassifier(alpha=self.l2)

            else:
                # Least-squares model, cross-validated for regularization
                self.clf = RidgeClassifierCV(cv=5)

        elif self.loss in ('hinge', 'linsvm', 'linsvc'):

            # Linear support vector machine
            self.clf = LinearSVC()

        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Whether model has been trained
        self.is_trained = False

        # Initialize empty weight attribute
        self.iw = []
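For example, choosing the quadratic loss without an explicit l2_regularization value makes the wrapper fall back on cross-validated ridge; the call below is illustrative (ImportanceWeightedClassifier is the class this __init__ belongs to in libTLDA):

# Quadratic loss + no fixed l2 value -> self.clf becomes RidgeClassifierCV(cv=5)
clf = ImportanceWeightedClassifier(loss_function='quadratic',
                                   weight_estimator='lr')
print(type(clf.clf).__name__)  # RidgeClassifierCV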
Example #28
# Split the dataset into train and test sets (7:3)
X_train, X_test, Y_train, Y_test = train_test_split(X, y1, test_size=0.3)

clf_rf = RandomForestClassifier().fit(X_train, Y_train)
print('====  RandomForest  ====')
print(clf_rf.score(X_train, Y_train) )
print(clf_rf.score(X_test, Y_test) )
print('-' * 30)

clf_et = ExtraTreesClassifier().fit(X_train, Y_train)
print('====  ExtraTrees  ====')
print(clf_et.score(X_train, Y_train) )
print(clf_et.score(X_test, Y_test) )
print('-' * 30)

clf_rl = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X_train, Y_train)
print('====  Ridge  ====')
print(clf_rl.score(X_train, Y_train) )
print(clf_rl.score(X_test, Y_test) )
print('-' * 30)

clf_svm = SVC().fit(X_train, Y_train)
print('====  SVC  ====')
print(clf_svm.score(X_train, Y_train) )
print(clf_svm.score(X_test, Y_test) )
print('-' * 30)

Y_pred = clf_rl.predict(X_test)


wrong_case = []
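The snippet is cut off at this point. Purely as an illustration of where it is headed (this is not the original code), wrong_case might be filled by comparing the ridge predictions with the held-out labels:

# Illustrative continuation only: record indices the ridge model got wrong
for i, (pred, true) in enumerate(zip(Y_pred, Y_test)):
    if pred != true:
        wrong_case.append(i)
print(len(wrong_case), "misclassified test samples")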
Example #29
     LogisticRegression(multi_class="multinomial",
                        solver="newton-cg",
                        max_iter=500), "MultinomialLogisticRegressionAudit")
 build_audit(LogisticRegressionCV(multi_class="ovr"),
             "OvRLogisticRegressionAudit")
 build_audit(
     BaggingClassifier(LogisticRegression(),
                       random_state=13,
                       n_estimators=3,
                       max_features=0.5), "LogisticRegressionEnsembleAudit")
 build_audit(GaussianNB(), "NaiveBayesAudit")
 build_audit(OneVsRestClassifier(LogisticRegression()), "OneVsRestAudit")
 build_audit(RandomForestClassifier(random_state=13, min_samples_leaf=3),
             "RandomForestAudit",
             flat=True)
 build_audit(RidgeClassifierCV(), "RidgeAudit", with_proba=False)
 build_audit(
     BaggingClassifier(RidgeClassifier(random_state=13),
                       random_state=13,
                       n_estimators=3,
                       max_features=0.5), "RidgeEnsembleAudit")
 build_audit(SVC(), "SVCAudit", with_proba=False)
 build_audit(
     VotingClassifier([("dt", DecisionTreeClassifier(random_state=13)),
                       ("nb", GaussianNB()), ("lr", LogisticRegression())],
                      voting="soft",
                      weights=[3, 1, 2]), "VotingEnsembleAudit")
 build_audit(OptimalXGBClassifier(objective="binary:logistic",
                                  ntree_limit=71,
                                  random_state=13),
             "XGBAudit",
Example #30
data_pix, spacial_pix, data, spacial_data = silly_gen(denoise=True)
# mb = MultiLabelBinarizer()
# spacial_pix_L = spacial_pix.astype('int')
# spacial_pix_L = spacial_pix_L.tolist()
# spacial_pix = mb.fit_transform(spacial_pix_L)
indices = np.random.permutation(data_pix.shape[0])
training_idx, test_idx = indices[:1900], indices[1900:]
X_train, X_test = data_pix[training_idx, :], data_pix[test_idx, :]
y_train, y_test = spacial_pix[training_idx], spacial_pix[test_idx]

# X_train, X_test, y_train, y_test = train_test_split(data_pix, spacial_pix, test_size=.23, random_state=seed)

est_l1 = [('etr', ExtraTreesClassifier(n_jobs=1)),
          ('rfr', RandomForestClassifier(n_jobs=1)),
          ('mlp', MLPClassifier(tol=1e-4)), ('svc', SVC(tol=1e-4, degree=9)),
          ('rdc', RidgeClassifierCV()), ('gbc', GradientBoostingClassifier()),
          ('ada', AdaBoostClassifier()),
          ('svc', SVC(tol=1e-4, degree=7, kernel='linear')),
          ('bag', BaggingClassifier(n_jobs=1))]

ests_1 = {
    'case-1': est_l1,
    # 'case-2': est_l1,
    # 'case-3': est_l1,
    # 'case-4': est_l1
}

r = uniform(0, 30)
d = randint(2, 10)
f = randint(100, 200)
e = uniform(0, 3)