Example #1
# Reconstructed from the truncated original: scale Age and Fare
# (assumes `import sklearn.preprocessing as preprocessing`).
# StandardScaler expects 2D input, so pass single-column frames and flatten the result.
scaler = preprocessing.StandardScaler()
df['Age_scaled'] = scaler.fit_transform(df[['Age']]).ravel()
df['Fare_scaled'] = scaler.fit_transform(df[['Fare']]).ravel()

train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
train_np = train_df.values  # as_matrix() was removed in pandas 1.0

y = train_np[:, 0]
X = train_np[:, 1:]

clf = linear_model.LogisticRegression(C=1.0,
                                      random_state=0,
                                      penalty='l1',
                                      solver='liblinear',  # the default lbfgs does not support l1
                                      tol=0.000001)
clf.fit(X, y)
#-------------------------------------------------------------------------------------
data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0

tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values

test_X = null_age[:, 1:]
predictedAges = rfr.predict(test_X)
data_test.loc[(data_test.Age.isnull()), 'Age'] = predictedAges

data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
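# The snippet ends before scoring the test set. A minimal sketch of the
# remaining step (my addition, assuming data_test ends up with the same
# scaled/dummy columns as the training frame):
test_df = data_test.filter(
    regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test_df.values)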
Example #2

import pickle


def get_data(filename):
    with open('grantData_hw3/' + filename + '.pickle', 'rb') as f:
        return pickle.Unpickler(f).load()


train = get_data('training')
testing = get_data('testing')
reduced_set = get_data('reduced')

#Training:
x = train[reduced_set['x']]
y = train['Class']

classifier = linear_model.LogisticRegression(solver='liblinear')
classifier.fit(x, y)

#Testing:
test_x = testing[reduced_set['x']]
test_y = testing['Class']

predic_y = classifier.predict(test_x)

assertion = [test_y[n] == item for n, item in enumerate(predic_y)]

print('Logistic Regression:')

corrects = assertion.count(True)
wrongs = assertion.count(False)
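# Hypothetical readout (not in the original): accuracy from the two counts.
print('accuracy:', corrects / (corrects + wrongs))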
Example #3
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

np.set_printoptions(suppress=True)

train_df = pd.read_csv("train.csv")
y = train_df["author"]
logreg = linear_model.LogisticRegression(C=1e5)

# work out how many tokens we should keep
token_range = [
    10, 50, 75, 100, 300, 500, 750, 1000, 1250, 1500, 1750, 2000, 2500, 3000
]
token_scores = []
for tokens in token_range:
    vectorizer = TfidfVectorizer(max_df=0.95,
                                 min_df=2,
                                 max_features=tokens,
                                 stop_words='english')
    X = vectorizer.fit_transform(train_df["text"])
    scores = cross_val_score(logreg, X, y, cv=10, scoring='neg_log_loss')
    token_scores.append(abs(scores.mean()))

import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

plt.plot(token_range, token_scores)
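# Optional follow-up (not in the original): label the axes and report the
# token count with the lowest cross-validated log loss.
plt.xlabel('max_features')
plt.ylabel('mean 10-fold log loss')
plt.show()
print('best token count:', token_range[int(np.argmin(token_scores))])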
Example #4
for file_name in file_list:
    df = pd.read_table(file_path + file_name, index_col=0)
    target = 'Disease'
    cells = df.columns.tolist()
    train_score_dict = {}
    test_score_dict, test_fscore_dict = {}, {}
    fpr, tpr, roc_auc = {}, {}, {}  # assumed initialization; referenced below but not set up in the original
    for i in range(0, len(cells) - 1):
        # for i in range(0, 5):
        features = cells[i:i + 1]  # can use for loop to do one cell iteration
        print(''.join(features))
        train_df, test_df = train_test_split(df,
                                             train_size=0.8,
                                             random_state=1)
        # print(train_df.shape, test_df.shape)
        regularization = 1.0  # 1e5
        logreg = linear_model.LogisticRegression(C=regularization)
        logreg.fit(train_df[features], train_df[target])
        preds = logreg.predict_proba(test_df[features])
        preds_1 = preds[:, 1]
        cell = ''.join(features)
        fpr[cell], tpr[cell], _ = metrics.roc_curve(test_df[target], preds_1)
        roc_auc[cell] = metrics.auc(fpr[cell], tpr[cell])

        p_list = logreg.predict(test_df[features]).tolist()
        t_list = test_df[target].tolist()
        print(p_list, t_list, sep="\n")
        score_Train = logreg.score(train_df[features], train_df[target])
        score_Test = logreg.score(test_df[features], test_df[target])
        # scoreTestA = metrics.accuracy_score(t_list,p_list)   # fraction of correctly classified samples
        scoreTestF = metrics.f1_score(t_list, p_list)
        print(features, score_Train, score_Test, scoreTestF)
Example #5
ridge_classifier.fit(train_data, train_labels)

# Use the classifier:

ridge_prediction = ridge_classifier.predict(test_data)

# Quality control:

print(metrics.accuracy_score(test_labels, ridge_prediction))  # 0.8666666666666667  good!
print(ridge_classifier.coef_)  # weights: [[-0.0854443  -0.07273219]]
print(ridge_classifier.intercept_)  # intercept (the bias term): [-0.31250723]
plt.show()

# LogisticRegression:

log_regressor = linear_model.LogisticRegression(random_state=1)
log_regressor.fit(train_data, train_labels)

lr_predictions = log_regressor.predict(test_data)
lr_predictions_proba = log_regressor.predict_proba(test_data) # probability of prediction

# print(test_labels)
# print(lr_predictions)
# print(lr_predictions_proba)

accuracy_score = metrics.accuracy_score(test_labels,lr_predictions) # 0.8

# Quality control on cross-validation:

ridge_scoring1 = cross_val_score(ridge_classifier, blobs[0], blobs[1], scoring="accuracy", cv=10)
lr_scoring = cross_val_score(log_regressor, blobs[0], blobs[1], scoring="accuracy", cv=10)
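# Hypothetical readout (not in the original): mean accuracy across the 10 folds.
print('ridge CV accuracy:', ridge_scoring1.mean())
print('logreg CV accuracy:', lr_scoring.mean())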
Example #6
        print "PARAMS:"
        print "  targets - target labels which are as positive, in \"s0,s1,s2,...\" format"
        print "  config_file - config file of learning & architecture parameters"
        print "  patts_file - training patterns file in metis format"
        print "  model_file - out model file as \"Perceptron\" in metis format"
        print ""
        exit(-1)

    learn_params = MetisParams.LearnParams()
    arch_params = MetisParams.ArchParams()

    targets = [int(x) for x in sys.argv[1].split(",")]
    learn_params.readFromConfig(sys.argv[2])

    clf = linear_model.LogisticRegression(penalty=learn_params._regula,
                                          C=learn_params._alpha,
                                          max_iter=learn_params._max_iter,
                                          tol=learn_params._epsilon)

    # read patterns from file
    time0 = datetime.datetime.now()
    X_train, Y_train = MetisParams.readPatts(sys.argv[3], targets)
    arch_params._input = len(X_train[0])
    arch_params._output = 2
    print "Load %d patterns from %s" % (len(Y_train), sys.argv[3])

    # train model
    time1 = datetime.datetime.now()
    clf.fit(X_train, Y_train)
    print "Training Completed, number of iterations is %d" % clf.n_iter_

    # save model
Example #7
                                        test_data[:, 0] - 1)) / float(num_test)
        ctr += 1

    dw[ind] = 0
    if mode == 1:
        ind = np.argmax(loss_change)
    elif mode == 2:
        ind = np.random.randint(0, 42)  # random_integers was removed from numpy; randint's high is exclusive

print('w = ', w)
print('loss = ', loss(w, train_data, reg_term))
print('accuracy = ', accuracy[ctr - 1])

regr = linear_model.LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial',
                                       C=reg_term,
                                       tol=1e-8,
                                       max_iter=num_iterations)
regr.fit(train_data[:, 1:14], train_data[:, 0])
w_regr = np.column_stack((regr.coef_, regr.intercept_)).reshape(-1)
print('w_regr = ', w_regr)
print('regr_loss = ', loss(w_regr, train_data, reg_term))
print('regr_accuracy = ', regr.score(test_data[:, 1:14], test_data[:, 0]))

if mode == 1:
    np.save('losses_custom.npy', loss1)
    np.save('accuracy_custom.npy', accuracy)
elif mode == 2:
    np.save('losses_random.npy', loss1)
    np.save('accuracy_random.npy', accuracy)
Example #8
preds = KNN_clf.predict(test)
print(preds)
print(accuracy_score(test_labels, preds))

# rss=((X-y)**2).sum()
# mse=np.mean((X-y)**2)
# print("Final rmse value is =",np.sqrt(np.mean((X-y)**2)))

AB_clf = AdaBoostClassifier(n_estimators=300, random_state=2)
model = AB_clf.fit(train, train_labels)
preds = AB_clf.predict(test)
print(preds)
print(accuracy_score(test_labels, preds))

LOG_clf = linear_model.LogisticRegression(multi_class="ovr",
                                          solver="sag",
                                          class_weight='balanced')
model = LOG_clf.fit(train, train_labels)
preds = LOG_clf.predict(test)
print(preds)
print(accuracy_score(test_labels, preds))

R_F = RandomForestClassifier(n_estimators=300, max_depth=3, random_state=2)
model = R_F.fit(train, train_labels)
preds = R_F.predict(test)
print(preds)
print(accuracy_score(test_labels, preds))
scores = []
num_features = len(train.columns)
for i in range(num_features):
    col = train.columns[i]
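    # Hypothetical completion of the truncated loop (not in the original):
    # score a single-feature logistic regression per column.
    clf = linear_model.LogisticRegression(solver='liblinear')
    clf.fit(train[[col]], train_labels)
    scores.append((col, clf.score(test[[col]], test_labels)))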
Example #9
def get_sparse_classifiers(ncv_set, cvp_set):
    return {
        'L1 Logistic Regression':
        ({
            'proc__C': [
                0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.8, 1, 1.5, 2, 3, 5, 10, 15,
                20, 25, 50
            ],
            'severity': [True, False]
        },
         BinClassifier(
             linear_model.LogisticRegression(penalty='l1',
                                             solver='liblinear',
                                             class_weight='balanced')),
         ncv_set, True, 'proc__C'),
        'L2 Logistic Regression': ({
            'proc__C': [
                0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.8, 1, 1.5, 2, 3, 5, 10, 15,
                20, 25, 50
            ],
            'severity': [True, False]
        },
                                   BinClassifier(
                                       linear_model.LogisticRegression(
                                           penalty='l2',
                                           class_weight='balanced')), ncv_set,
                                   True, 'proc__C'),
        'Lasso': ({
            'reg__alpha': [2, 1.5, 1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.01],
            'severity': [True, False],
            'thres': [0, 0.2, 0.75, 0.8, 0.85, 0.9, 0.95]
        }, RegClassifier(linear_model.Lasso()), cvp_set, True, 'reg__alpha'),
        'Linear Regression': ({
            'severity': [True, False],
            'thres': [0]
        }, RegClassifier(linear_model.LinearRegression()), cvp_set, True,
                              'thres'),
        'Ridge': ({
            'reg__alpha': [2, 1.5, 1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.01],
            'severity': [True, False],
            'thres': [0, 0.2, 0.75, 0.8, 0.85, 0.9, 0.95]
        }, RegClassifier(linear_model.Ridge()), cvp_set, True, 'reg__alpha'),
        'Relaxed Lasso': ({
            'reg__alpha': [0.005, 0.01, 0.1, 0.2, 0.5],
            'severity': [True, False],
            'thres': [0, 0.4, 0.6, 0.75],
            'first_reg__reg__alpha': [1, 0.5, 0.2, 0.1, 0.01],
            'first_reg__severity': [False],
            'first_reg__thres': [0, 0.2, 0.75]
        },
                          RelaxedLinear(first_reg=RegClassifier(
                              reg=linear_model.Lasso()),
                                        reg=linear_model.Lasso()), cvp_set,
                          True, 'first_reg__reg__alpha'),
        'Elastic Net': ({
            'reg__alpha': [2, 1.5, 1, 0.8, 0.7, 0.5, 0.3, 0.2, 0.1, 0.01],
            'severity': [True, False],
            'thres': [0, 0.75, 0.85, 0.9, 0.95],
            'reg__l1_ratio': [0.2, 0.4, 0.5, 0.6, 0.7, 0.8]
        }, RegClassifier(linear_model.ElasticNet()), cvp_set, True,
                        'reg__alpha'),
        'Shrunken Centroids OCV': ({
            'proc__shrink_threshold':
            [2, 1.5, 1.3, 1.2, 1.1, 1, 0.8, 0.5, 0.1, 0.01],
            'proc__metric': ['euclidean', 'manhattan', 'cosine'],
            'severity': [False]
        }, BinClassifier(ShrunkenCentroidClassifier()), ncv_set, True,
                                   'proc__shrink_threshold'),
        'Shrunken Centroids UCV': ({
            'proc__shrink_threshold':
            [2, 1.5, 1.3, 1.2, 1.1, 1, 0.8, 0.5, 0.1, 0.01],
            'proc__metric': ['euclidean', 'manhattan', 'cosine'],
            'severity': [False]
        }, BinClassifier(ShrunkenCentroidClassifier()), cvp_set, True,
                                   'proc__shrink_threshold'),
        'SVM enet': ({
            'proc__loss': ['modified_huber'],
            'proc__alpha': [2, 1, 0.5, 0.4, 0.2, 0.1, 0.05],
            'proc__l1_ratio': [0.1, 0.2, 0.3, 0.5]
        },
                     BinClassifier(
                         linear_model.SGDClassifier(penalty='elasticnet',
                                                    class_weight='balanced')),
                     ncv_set, True, 'proc__alpha'),
        'L1 Linear SVM': ({
            'proc__C':
            [0.0000001, 0.001, 0.01, 0.5, 0.7, 0.1, 0.2, 0.5, 1, 1.5, 2]
        },
                          BinClassifier(
                              svm.LinearSVC(penalty='l1',
                                            dual=False,
                                            class_weight='balanced')), ncv_set,
                          True, 'proc__C')
    }
Example #10
    def solver(self, solver):
        self.__solver = solver
        self.classifier = linear_model.LogisticRegression(
            solver=self.__solver, C=self.__C, multi_class=self.__multi_class)
Example #11
    def multi_class(self, multi_class):
        self.__multi_class = multi_class
        self.classifier = linear_model.LogisticRegression(
            solver=self.__solver, C=self.__C, multi_class=self.__multi_class)
Example #12
qeds.themes.mpl_style()

# Logistic Regression

data_url = "https://raw.githubusercontent.com/propublica/compas-analysis"
data_url += "/master/compas-scores-two-years.csv"

df = pd.read_csv(data_url)
df.head()

X = df[["decile_score"]]
y = df["two_year_recid"]
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25)

logistic_model = linear_model.LogisticRegression(solver="lbfgs")
logistic_model.fit(X_train, y_train)

beta_0 = logistic_model.intercept_[0]
beta_1 = logistic_model.coef_[0][0]

print(f"Fit model: p(recid) = L({beta_0:.4f} + {beta_1:.4f} decile_score)")

# Decision boundaries

X = df[["decile_score", "age"]]
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=42)

logistic_age_model = linear_model.LogisticRegression(solver="lbfgs")
logistic_age_model.fit(X_train, y_train)
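# Optional readout (not in the original): the two-feature model's parameters.
print(logistic_age_model.intercept_, logistic_age_model.coef_)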
Example #13
param_log = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 2, 3, 4, 5, 10],
    'solver': ['liblinear'],
    'max_iter': [100, 500],
    'tol': [0.0001, 0.00001, 0.000001],
    'class_weight': [None, 'balanced']
}
#survived = y[y == 1]
#isCabin_surv = data.iloc[survived.index]['Cabin'].map(lambda x: 0 if x != x else 1)
#n_survived = y[y == 0]
#isCabin_n_surv = data.iloc[n_survived.index]['Cabin'].map(lambda x: 0 if x != x else 1)
#colors = ['blue','green']
#plt.hist([isCabin_surv, isCabin_n_surv],  histtype='bar', color=colors, stacked=True, fill=True, label = ['survived', 'not survived'])
#plt.legend()

linear_model_titanic = linear_model.LogisticRegression(random_state=0)
linear_search_res = ms.GridSearchCV(linear_model_titanic, param_log)
linear_model_titanic.fit(whole_train_matrix, y)
linear_search_res.fit(whole_train_matrix, y)
#predictions = linear_search_res.best_estimator_.predict_proba(whole_test_matrix)
#predictions = [1 if prediction[1] > 0.5 else 0 for prediction in predictions]
score_logistic = ms.cross_val_score(linear_model_titanic,
                                    whole_train_matrix,
                                    y,
                                    cv=3).mean()
score_search_logistic = ms.cross_val_score(linear_search_res.best_estimator_,
                                           whole_train_matrix,
                                           y,
                                           cv=3).mean()
#answer = pd.DataFrame()
#answer['PassengerId'] = test_data['PassengerId']
Example #14
print(train.columns)

sns.countplot(x='target', data=train)

#filter unique value features
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)

X_train, X_eval, y_train, y_eval = model_selection.train_test_split(
    train1, y, test_size=0.1, random_state=1)

stages = [('imputer', impute.SimpleImputer()),  # preprocessing.Imputer was removed in sklearn 0.22; needs "from sklearn import impute"
          ('zv_filter', feature_selection.VarianceThreshold()),
          ('feature_selector',
           feature_selection.RFE(svm.LinearSVC(max_iter=10000))),
          ('classifier', linear_model.LogisticRegression())]
pipe = pipeline.Pipeline(stages)  # avoid shadowing the pipeline module
pipeline_grid = {
    'feature_selector__n_features_to_select': [10, 20],
    'classifier__C': [0.001, 0.01, 0.1, 0.2, 0.5],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear'],  # liblinear supports both penalties
    'classifier__class_weight': ['balanced', None]
}
pipeline_generated = cutils.grid_search_best_model(pipe,
                                                   pipeline_grid,
                                                   X_train,
                                                   y_train,
                                                   scoring="roc_auc")
final_estimator = pipeline_generated.named_steps['classifier']
print(pipeline_generated.score(X_eval, y_eval))
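# Optional readout (not in the original): the settings the grid search picked
# for the final logistic-regression step.
print(final_estimator.C, final_estimator.penalty, final_estimator.class_weight)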
Example #15
import numpy as np
from sklearn import linear_model
from sklearn.metrics import accuracy_score as acc
from utils.mnist_loader import load_mnist
np.random.seed(21)

## DATA
X_path = 'mnist/t10k-images-idx3-ubyte'
y_path = 'mnist/t10k-labels-idx1-ubyte'
X, y = load_mnist(X_path, y_path)

## FEATURE
id_shuffle = np.random.permutation(len(y))
id_trn = id_shuffle[:6000]
id_tst = id_shuffle[-4000:]

X = X.reshape(X.shape[0],-1)/127.5 - 1.
X_trn, y_trn = X[id_trn], y[id_trn]
X_tst, y_tst = X[id_tst], y[id_tst]

print('train: ', X_trn.shape, y_trn.shape)
print('test: ', X_tst.shape, y_tst.shape)

## MODEL
model = linear_model.LogisticRegression(C=1e5,
        solver='liblinear', multi_class='ovr')
model.fit(X_trn, y_trn)

y_prd = model.predict(X_tst)
accuracy = acc(y_tst, y_prd)
print('accuracy: ', accuracy)
Example #16
    def __init__(self):
        """Initializes the classifier."""
        self.clf = linear_model.LogisticRegression(class_weight='balanced')
Example #17
    def get_classifier(self):
        """Returns the classifier."""
        log_reg = linear_model.LogisticRegression(solver='lbfgs')
        ors = OneVsRestClassifier(log_reg)
        return ors
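# Hypothetical standalone usage (not in the original): one-vs-rest fits one
# binary logistic regression per class, e.g. on the iris data.
from sklearn import datasets
iris_X, iris_y = datasets.load_iris(return_X_y=True)
ovr = OneVsRestClassifier(linear_model.LogisticRegression(solver='lbfgs'))
print(ovr.fit(iris_X, iris_y).score(iris_X, iris_y))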
Example #18
def fitL1LogisticWithNFeat(**kwargs):
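    # 'saga' is chosen because it supports the L1 penalty (the default 'lbfgs' does not).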
    fitter = linear_model.LogisticRegression(penalty="l1", C=1, solver="saga")
    return fitModelWithNFeat(fitter=fitter, **kwargs)
Example #19
print(df)
my_tags = ['m', 'f']

print(df.gender.value_counts())
train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)
print(len(test_data), len(train_data))

print "\n---BAG OF WORDS---"
count_vectorizer = CountVectorizer(analyzer="word",
                                   tokenizer=nltk.word_tokenize,
                                   preprocessor=None,
                                   stop_words='english',
                                   max_features=4000)

train_data_features = count_vectorizer.fit_transform(train_data['text'])
logreg_model = linear_model.LogisticRegression(n_jobs=1, C=1e5)
logreg_model = logreg_model.fit(train_data_features, train_data['gender'])
print(word_embeddings.predict(count_vectorizer, logreg_model, test_data,
                              my_tags))

print "\n---N-GRAMS---"
n_gram_vectorizer = CountVectorizer(analyzer="char",
                                    ngram_range=(2, 5),
                                    tokenizer=None,
                                    preprocessor=None,
                                    max_features=4000)

charn_model = linear_model.LogisticRegression(n_jobs=1, C=1e5)

train_data_features = n_gram_vectorizer.fit_transform(train_data['text'])
Example #20
def fitLogisticWithNFeat(**kwargs):
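    # C=1e10 makes the L2 penalty negligible, i.e. an essentially unregularized fit.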
    fitter = linear_model.LogisticRegression(penalty="l2", C=1e10)
    return fitModelWithNFeat(fitter=fitter, **kwargs)
Example #21
def get_logistic_regr_score(X_train, Y_train, X_test, Y_test):
    log_regr = linear_model.LogisticRegression(solver='newton-cg',
                                               random_state=42)
    log_regr.fit(X_train, Y_train)
    return log_regr.score(X_test, Y_test)
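# Hypothetical usage (not in the original), e.g. on the iris data:
from sklearn import datasets
from sklearn.model_selection import train_test_split
X, y = datasets.load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)
print(get_logistic_regr_score(X_tr, y_tr, X_te, y_te))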
Example #22
# Loads pandas
import pandas
# Loads numpy
import numpy as np
# Imports needed by the snippet below (assumed; not shown in the original)
import sklearn.ensemble
import sklearn.linear_model as lm
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from xgboost import XGBClassifier
dataframe = pandas.read_csv("USDJPY,5multiclass.csv", header=None)
dataset = dataframe.values
# split into input (X) and output (Y) variables
#X = dataset[:,0:4050].astype(float)
X = dataset[:,0:59]
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
y = dataset[:,59]
clf0 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=200)
clf1 = lm.LogisticRegression(penalty="l1", C=9081, solver="liblinear")  # l1 needs liblinear/saga
clf2 = RandomForestClassifier(random_state=1, n_estimators=200)
clf3 = lm.LogisticRegression(penalty="l2", C=5000)
clf4 = MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
eclf = EnsembleVoteClassifier(clfs=[clf0, clf1, clf2, clf3, clf4], weights=[1,1,1,3,1])
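# Note: clf3 (the L2 logistic regression) gets triple weight in the majority vote.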

labels = ['GBC','Lasso', 'Random Forest', 'Ridge', 'MLP','Ensemble']
for clf, label in zip([clf0, clf1, clf2, clf3, clf4, eclf], labels):

    scores = model_selection.cross_val_score(clf, X, y, cv=5,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
Example #23
def prediction_step(background_train, background_test, job_training_data, challengeID_train):
	
	# We apply transform to both the training and test set
	#background_train_np = enc.transform(background_train_np)
	#background_test_np = enc.transform(background_test_np)

	# Convert the background training and testing to numpy arrays
	background_train_np = background_train.values  # as_matrix() was removed in pandas 1.0
	background_train_np = np.asmatrix(background_train_np)

	background_test_np = background_test.values
	background_test_np = np.asmatrix(background_test_np)

	# Convert the job_training data into matrix and then into a 1-D array
	job_training_data_np = job_training_data.values
	job_training_data_np = np.asmatrix(job_training_data_np)
	job_training_data_np = np.ravel(job_training_data_np)


	# Perform feature selection to reduce the number of
	# required features
	#background_train_np, background_test_np = select_feature(background_train_np, background_test_np, job_training_data_np)

	# Select k-best features
	#background_train_np, background_test_np = select_k_best(background_train_np, background_test_np, job_training_data_np)

	# Perform principal component analysis
	#background_train_np, background_test_np = perform_pca(background_train_np, background_test_np, job_training_data_np)

	# Perform principal random tree embedding
	# predict_job_training = perform_one_hotencoding(background_train_np, background_test_np, job_training_data_np)

	# Perform Cross Validation
	# Choose the method to perform the actual prediction using the best performing
	# scheme
	position = cross_validate_model(background_train_np, job_training_data_np)

	####################################################
	## Set up the same methods used in cross validation
	## Fitting twice gives an error hence this way
	####################################################
	# List the regression methods to use.
	clf_quaddis = discriminant_analysis.QuadraticDiscriminantAnalysis()
	clf_logreg = sklinear.LogisticRegression(penalty='l1', solver='liblinear')
	clf_random_forest = ensemble.RandomForestClassifier(n_estimators=50)
	clf_adaboost = ensemble.AdaBoostClassifier(n_estimators = 50)
	clf_mlpc = neural_network.MLPClassifier()
	clf_extra_tree = ensemble.ExtraTreesClassifier(n_estimators=50, bootstrap=True)

	# Add the above methods to an array,
	# more amenable to looping
	methods = [clf_quaddis, clf_logreg, clf_random_forest, clf_adaboost, clf_mlpc, clf_extra_tree]
	methods_label = ['clf_quaddis', 'clf_logreg', 'clf_random_forest', 'clf_adaboost', 'clf_mlpc', 'clf_extra_tree']

	method = methods[position]
	method_label = methods_label[position]

	print('The chosen method is: %s' % method_label)

	# Predict based on the chosen method
	method.fit(background_train_np, job_training_data_np)
	predict_job_training = method.predict_proba(background_test_np)
	filename = 'predict_job_training_'+method_label+'.csv'
	if os.path.isfile(filename):
		os.remove(filename)

	# Open the output file once instead of re-opening it on every iteration.
	with open(filename, "a+") as f:
		for i in range(len(predict_job_training)):
			f.write("%f \r\n" % (predict_job_training[i, 1]))
Example #24
    def run(self):
        data = OrderedDict()
        shapes = {}
        for r in self.requires():
            x = r.load().squeeze()
            data[r.task_id] = x
            shapes[r.task_id] = x.shape[1] if len(x.shape) == 2 else 1

        data = pandas.DataFrame(data)[list(data.keys())]

        data['is_duplicate'] = Dataset().load()[1].is_duplicate
        X = data.drop('is_duplicate', axis=1).values
        print(X.max(), X.min(), np.isnan(X).sum())
        y = data.is_duplicate.values
        np.savetxt('cache/Ry.csv',
                   data.is_duplicate,
                   header='is_duplicate',
                   delimiter=',')

        weights = core.weights[y]
        scores = []
        cls = linear_model.LogisticRegression(C=10)
        cls.fit(X, y)
        print(pandas.Series(cls.coef_[0],
                            data.drop('is_duplicate', axis=1).columns))

        polytransform = preprocessing.PolynomialFeatures(2)
        scaletransform = preprocessing.Normalizer()
        transform = pipeline.Pipeline([('scale', scaletransform),
                                       ('poly', polytransform)])

        for train_index, test_index in model_selection.KFold(
                n_splits=10).split(X, y):
            cls = linear_model.LogisticRegression(C=10)
            #cls = TorchLogit()
            X_train, X_test = X[train_index], X[test_index]
            X_train = transform.fit_transform(X_train)
            X_test = transform.transform(X_test)

            y_train, y_test = y[train_index], y[test_index]
            w_train, w_test = weights[train_index], weights[test_index]
            cls.fit(X_train.copy(), y_train.copy())  #, sample_weight=w_train)
            pred = cls.predict_proba(X_test)
            score = metrics.log_loss(y_test, pred, sample_weight=w_test)
            print(score)
            scores.append(score)
        print(colors.yellow | '!----++++++----!')
        print(colors.yellow | colors.bold | '|' + str(np.mean(scores)) + '|')
        print(colors.yellow | '¡----++++++----¡')

        X = transform.transform(X)
        cls.fit(X, y, sample_weight=weights)

        data = OrderedDict()
        for r in self.requires():
            x = r.load_test().squeeze()
            data[r.task_id] = x
            assert shapes[r.task_id] == (x.shape[1] if len(x.shape) == 2 else 1), \
                "Shape: {} did not match expected {}".format(x.shape, shapes[r.task_id])
            #print(r.__class__.__name__, '\t', x.shape, type(x))
        data = pandas.DataFrame.from_dict(data)

        X = data.values
        X = transform.transform(X)
        index = pandas.Index(np.arange(X.shape[0]), name='test_id')
        pred = pandas.Series(cls.predict_proba(X)[:, 1],
                             index=index,
                             name='is_duplicate').to_frame()
        print(colors.green | str(pred.head()))

        with gzip.open('cache/stacked_pred.csv.gz.tmp', 'wt') as f:
            pred.to_csv(f)
        os.rename('cache/stacked_pred.csv.gz.tmp', 'cache/stacked_pred.csv.gz')
Example #25
def ml(dfin):
    """ Runs KNN, logistic regression, and decision-tree classifiers through
        sklearn. For each cross-validation fold, the hyperparameter is chosen
        on a dev set carved out of the training split; the model is then
        retrained on the full training split with the best value.

        :dfin   -pandas df - must have a 'y' column that is bool; all other
                 columns will be cast to float and used as features.
        """

    df = pd.DataFrame(dfin, dtype=float)
    assert ('y' in df.columns)
    df = df.dropna()

    if df.shape[0] < 20:
        N_FOLDS = 2
    else:
        N_FOLDS = 10
    ACC_THRESH = 0.01  # dev set accuracy must be x% better to use new param

    models = ['KNN', 'LOGISTIC', 'TREE']
    for model in models:
        print('\nMODEL: ', model)

        if model == 'LOGISTIC':
            c_l = [
                0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000, 3000, 10000
            ]
        elif model == 'KNN':
            c_l = [50, 40, 35, 30, 25, 20, 18, 15]
        else:
            c_l = [3, 4, 5, 6, 7, 8]
        regularizer = 'l1'

        X_nd = df.drop('y', axis=1).values
        #X_nd = scale(X_nd) # magnitude has useful info?
        y_n = df['y'].values.astype(bool)
        skf = StratifiedKFold(shuffle=True, n_splits=N_FOLDS)

        acc_test_a = np.zeros(N_FOLDS)
        acc_train_a = np.zeros(N_FOLDS)
        for i, (train, test) in enumerate(skf.split(X_nd, y_n)):
            train_n = len(train)
            # empirically, a dev set of 1/4 of the training fold works well
            dev = train[:int(train_n / 4)]
            sub_train = train[int(train_n / 4):]  # temporary train set
            best_acc = 0
            best_c = None
            # in this loop we find best hyper parameter for this split
            for c in c_l:
                if model == 'LOGISTIC':
                    clf = linear_model.LogisticRegression(penalty=regularizer,
                                                          solver='liblinear',
                                                          C=c)
                elif model == 'KNN':
                    clf = KNeighborsClassifier(n_neighbors=c,
                                               metric='euclidean',
                                               weights='uniform')
                else:
                    clf = tree.DecisionTreeClassifier(max_leaf_nodes=c)
                clf.fit(X_nd[sub_train], y_n[sub_train])
                y_pred = clf.predict(X_nd[dev])
                acc = metrics.accuracy_score(y_pred, y_n[dev])
                if (acc > best_acc + ACC_THRESH):
                    best_acc = acc
                    best_c = c

            # retrain with all train data and best_c
            print('fold:',
                  i,
                  ' best c:',
                  best_c,
                  ' dev:%.2f' % best_acc,
                  ' dev_ones:%.2f' % (y_n[dev].sum() / len(dev)),
                  end='')
            if model == 'LOGISTIC':
                clf = linear_model.LogisticRegression(penalty=regularizer,
                                                      solver='liblinear',
                                                      C=best_c)
            elif model == 'KNN':
                clf = KNeighborsClassifier(n_neighbors=best_c,
                                           metric='euclidean',
                                           weights='uniform')
            else:
                clf = tree.DecisionTreeClassifier(max_leaf_nodes=best_c)
            clf.fit(X_nd[train], y_n[train])
            y_pred = clf.predict(X_nd)
            acc_test_a[i] = metrics.accuracy_score(y_pred[test], y_n[test])
            acc_train_a[i] = metrics.accuracy_score(y_pred[train], y_n[train])
            print(' test:%.2f' % acc_test_a[i], ' train:%.2f' % acc_train_a[i])
        print('Avg test acc:%.3f' % acc_test_a.mean(),
              'Avg train acc:%.3f' % acc_train_a.mean())
Example #26
#        with torch.no_grad():
#            loss, acc = validation(model, test_dataset, criterion)
#            print("Loss ", loss, "Accuracy", acc)
model = Network(1470, [400], 1)
batch_size = 60
epoch = 3
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.02)
df = pd.read_csv('Features.csv')
feat = df.drop(df.columns[[0, -1]], axis=1, inplace=False)
Xd = feat.values.astype(float)
Y = df['Class'].values
Yd = (Y == 'M').astype(float)
m1 = sksvm.SVC()
m2 = ske.RandomForestClassifier()
m3 = skl.LogisticRegression()

svmScore = 0
rfScore = 0
logScore = 0
nnscore = 0
for i in range(10):
    X_train, X_test, Y_train, Y_test = skm.train_test_split(Xd,
                                                            Yd,
                                                            test_size=0.3)
    trainset = data(X_train, Y_train, batch_size)
    model = Network(1470, [400], 1)
    batch_size = 60
    epoch = 3
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.02)
Example #27
# (4) Feature engineering - scaling
# Next we do a bit more preprocessing, e.g. scaling: squashing features with a
# large dynamic range into [-1, 1], which speeds up logistic regression's convergence.
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
# StandardScaler expects 2D input, so pass single-column frames;
# ravel() flattens the (n, 1) result back into a column.
df['Age_scaled'] = scaler.fit_transform(df[['Age']]).ravel()
df['Fare_scaled'] = scaler.fit_transform(df[['Fare']]).ravel()

# (5) Feature engineering - feature extraction
# Pull out the feature columns we need, convert them to a numpy array, and
# model with scikit-learn's LogisticRegression.
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
#train_df.to_csv("processed_titanic.csv" , encoding = "utf-8")
train_np = train_df.values

# y is the Survived outcome
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]

# (6) Build and train the model
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear',
                                      tol=1e-6)
clf.fit(X, y)

# (7) Plot the learning curve
plot_learning_curve(clf, "Learning Curve", X, y)
Example #28
import csv
from sklearn.neural_network import BernoulliRBM  # the module is neural_network, not neural_networks
from sklearn.model_selection import cross_val_score  # cross_validation was removed in sklearn 0.20
from sklearn import linear_model

#constants
TRAINING = 'train.csv'
TEST = 'test.csv'


def read_in_csv(filename):
    data = []
    # csv.reader needs text mode (with newline='') in Python 3, not 'rb'
    with open(filename, 'r', newline='') as input_data:
        filereader = csv.reader(input_data, delimiter=',')
        for row in filereader:
            data.append(row)

    return data


train_examples = read_in_csv(TRAINING)
train_examples.pop(0)  #get rid of column headers
train_labels = []
#extract labels
for row in train_examples:
    train_labels.append(row.pop(0))

#models we will use
linear_classifier = linear_model.LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)
Example #29
from sklearn import datasets, neighbors, linear_model

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

n_samples = len(X_digits)
split = int(0.9 * n_samples)  # float slice indices are a TypeError in Python 3

X_train = X_digits[:split]
y_train = y_digits[:split]
X_test = X_digits[split:]
y_test = y_digits[split:]

knn = neighbors.KNeighborsClassifier()
logistic = linear_model.LogisticRegression(max_iter=1000)  # the default 100 iterations may not converge on digits

print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))
print('LogisticRegression score: %f' %
      logistic.fit(X_train, y_train).score(X_test, y_test))
Example #30
def runIrisFlowersTool():

    print("\nIris Flowers with tool\n")

    inputs, outputs = readDataTool()
    inputTrain, outputTrain, inputTest, outputTest = splitData(inputs, outputs)

    # normalise the data
    scaler = StandardScaler()
    if not isinstance(inputTrain[0], list):
        inputTrain = [[d] for d in inputTrain]
        inputTest = [[d] for d in inputTest]

        scaler.fit(inputTrain)
        normalisedTrainInput = scaler.transform(inputTrain)
        normalisedTestInput = scaler.transform(inputTest)

        # decode from list
        normalisedTrainInput = [el[0] for el in normalisedTrainInput]
        normalisedTestInput = [el[0] for el in normalisedTestInput]

    else:
        scaler.fit(inputTrain)
        normalisedTrainInput = scaler.transform(inputTrain)
        normalisedTestInput = scaler.transform(inputTest)

    # normalised data: normalisedTrainInput, normalisedTestInput

    logisticRegressionTool = linear_model.LogisticRegression(max_iter=1000)
    logisticRegressionTool.fit(normalisedTrainInput, outputTrain)

    w0, w1, w2, w3, w4 = logisticRegressionTool.intercept_[0], logisticRegressionTool.coef_[0][0], \
                         logisticRegressionTool.coef_[0][1], logisticRegressionTool.coef_[0][2], \
                         logisticRegressionTool.coef_[0][3]

    print('Model SETOSA:  w0 = ', w0, ' w1 = ', w1, ' w2 = ', w2, ' w3 = ', w3,
          ' w4 = ', w4)

    w0, w1, w2, w3, w4 = logisticRegressionTool.intercept_[1], logisticRegressionTool.coef_[1][0], \
                         logisticRegressionTool.coef_[1][1], logisticRegressionTool.coef_[1][2], \
                         logisticRegressionTool.coef_[1][3]

    print('Model VERSICOLOR:  w0 = ', w0, ' w1 = ', w1, ' w2 = ', w2, ' w3 = ',
          w3, ' w4 = ', w4)
    w0, w1, w2, w3, w4 = logisticRegressionTool.intercept_[2], logisticRegressionTool.coef_[2][0], \
                         logisticRegressionTool.coef_[2][1], logisticRegressionTool.coef_[2][2], \
                         logisticRegressionTool.coef_[2][3]

    print('Model VIRGINICA:  w0 = ', w0, ' w1 = ', w1, ' w2 = ', w2, ' w3 = ',
          w3, ' w4 = ', w4)

    print()
    print('Prediction (tool): ',
          logisticRegressionTool.predict(normalisedTestInput))
    print(
        "Accuracy (tool): ",
        accuracy_score(outputTest,
                       logisticRegressionTool.predict(normalisedTestInput)))
    error = 1 - accuracy_score(
        outputTest, logisticRegressionTool.predict(normalisedTestInput))
    print("Classification Error (tool): ", error)