Example #1
skdnnBO = BayesianOptimization(skdnncv, {
    'h1': (10, 100),
    'h2': (10, 100),
    'learning_rate_init': (-5, -1)
})
skdnnBO.explore({
    'h1': [10, 100],
    'h2': [10, 100],
    'learning_rate_init': [-5, -1]
})
skdnnBO.maximize(init_points=10, n_iter=20)
print('SKDNN: %f' % skdnnBO.res['max']['max_val'])

#---- set classifiers to be combined for voting -------------

RF = RFC(n_estimators=int(rfBO.res['max']['max_params']['n_estimators']),
         max_features=int(rfBO.res['max']['max_params']['max_features']))

SVM = SVC(C=10**svcBO.res['max']['max_params']['C'],
          gamma=10**svcBO.res['max']['max_params']['gamma'],
          random_state=None,
          probability=True)

XGB = xgboost.XGBClassifier(
    learning_rate=10**xgbBO.res['max']['max_params']['learning_rate'],
    n_estimators=int(xgbBO.res['max']['max_params']['n_estimators']))

SKDNN = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    batch_size='auto',
    hidden_layer_sizes=(int(skdnnBO.res['max']['max_params']['h1']),
                        int(skdnnBO.res['max']['max_params']['h2'])),
    learning_rate_init=10**skdnnBO.res['max']['max_params']['learning_rate_init'])
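
# Hedged sketch (assumption; the original snippet is truncated at this point):
# per the comment above, the tuned models are meant to be combined for voting,
# which with scikit-learn would typically be a soft-voting ensemble.
from sklearn.ensemble import VotingClassifier

VOTE = VotingClassifier(
    estimators=[('rf', RF), ('svm', SVM), ('xgb', XGB), ('skdnn', SKDNN)],
    voting='soft')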
        for j in range(i + 1, len(data.keys())):
            if chronology[i] > chronology[j]:
                chronology[i], chronology[j] = chronology[j], chronology[i]

    date = [chronology[len(chronology) * i // 6 - 1] for i in range(1, 7)]

    del data, label
    gc.collect()

    clf_option = [
        Boosting(),
        LR(n_jobs = -1),
        NB(),
        LinearSVC(),
        Neighbors(),
        RFC()
    ]

    mre_pred = []

    for iter in tqdm(range(5)):
        if settings.DEBUG_MODE:
            print("Memulai pengambilan data")

        mre_total = []
        query = "Select * from berita WHERE Date <= "+str(date[iter])
        c.execute(query)
        train_data = c.fetchall()

        query = "Select * from berita WHERE Date <= "+str(date[iter+1])+" AND "+str(date[iter])
        c.execute(query)
Example #3
# generate random features
wx = rp(333,[0.2,2,20],1)
wy = rp(333,[0.2,2,20],1)
wz = rp(333,[0.2,2,20],2)

# generate training data
print 'generating training data...'
(x,y) = pairset(10000)
#(x_te,y_te,m_te) = tuebingen()

# load test data
(pairs,num_features,num_pairs) = boston_housing()

# train the classifier and predict the test data
print 'training the random forest classifier...'
reg = RFC(n_estimators=100,random_state=0,n_jobs=4).fit(x,y);
y_prob = reg.predict_proba(pairs)

# save the predictive probability of pairs
np.savetxt('housing_predict.txt',y_prob,fmt='%.5f')

# visualize the directed graph
node_labels = ["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT","MEDV"]

desp = """ 
1. CRIM      per capita crime rate by town
2. ZN        proportion of residential land zoned for lots over 
             25,000 sq.ft.
3. INDUS     proportion of non-retail business acres per town
4. CHAS      Charles River dummy variable (= 1 if tract bounds 
             river; 0 otherwise)
scaler = RobustScaler()  #second case using this scaler
x_scaled = scaler.fit_transform(x_train)

x_new = pd.DataFrame(x_scaled, columns=x.columns)
x_new.head()

skpca = PCA(n_components=55)
x_pca = skpca.fit_transform(x_new)
print('Variance sum : ', skpca.explained_variance_ratio_.cumsum()[-1])

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import classification_report, confusion_matrix

model = RFC(n_estimators=100,
            random_state=0,
            oob_score=True,
            max_depth=30,
            max_features='sqrt')
model.fit(x_pca, y_train)

x_test_scaled = scaler.transform(x_test)
x_test_new = pd.DataFrame(x_test_scaled, columns=x.columns)
x_test_pca = skpca.transform(x_test_new)

y_pred = model.predict(x_test_pca)
print(classification_report(y_test, y_pred))

sns.heatmap(confusion_matrix(y_test, y_pred),
            annot=True,
            fmt="d",
            cmap=plt.cm.Accent,
Example #5
    def get_base_learners(self):
        self.m_randomForest = RFC(n_estimators=100,
                                  criterion='entropy',
                                  random_state=3)

        self.m_randomForest.fit(self.m_sourceDataFeature, self.m_sourceLabel)
Example #6
def main():
    cols = [
        'd_age', 'samerace', 'attractive_partner', 'interests_correlate',
        'like', 'guess_prob_liked', 'match', 'attractive', 'attractive_partner'
    ]
    df = pd.read_csv('raw/data.csv', usecols=cols)

    df = df.replace('?', np.nan)
    df = df.dropna()

    df = df.sample(frac=1)
    nrows = df.shape[0]

    label = df['match']
    df = df.drop(['match'], axis=1)

    n = int(0.8 * nrows)
    trainX = df.iloc[:n]
    testX = df.iloc[n:]

    trainY = label.iloc[:n]
    testY = label.iloc[n:]

    rf = RFC(n_estimators=10)
    X = trainX.values
    y = trainY.values
    rf.fit(X, y)

    Xtest = testX.values
    Ytest = testY.values

    predicted = rf.predict(Xtest)
    out = Ytest == predicted
    out = np.where(out == True)

    acc = len(out[0]) / Ytest.shape[0]
    print(acc)

    # write match test
    pd.DataFrame(testY.index).to_csv('psl/match_test.txt',
                                     header=False,
                                     sep='\t',
                                     index=False)

    # write match obs
    pd.DataFrame(trainY).to_csv('preprocessed/match_train.txt',
                                header=False,
                                sep='\t')

    # write rf predictions
    outdf = pd.DataFrame(predicted, index=testX.index, columns=['predicted'])
    outdf.to_csv('preprocessed/rf.txt', header=False, sep='\t')

    # write similarities

    out = []
    t = testX['attractive'].astype(
        np.float64) - testX['attractive_partner'].astype(np.float64)
    for i, r in t.items():
        for j, rr in t.items():
            sim = abs(r - rr)
            out.append(f'{i}\t{j}\t{sim}')
    s = '\n'.join(out)
    with open('preprocessed/sim.txt', 'w') as f:
        f.write(s)
"""
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc = accuracy_score(pred, labels_test)
print "Gaussian  acc", acc

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import accuracy_score

clf = RFC()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc = accuracy_score(pred, labels_test)
print "defaule acc", acc

print "For different n_estimators:"
print "----------------------------"
max_acc, max_est = 0, 0
for i in [1, 2, 4, 8, 10, 16, 25, 50, 32, 64, 100,
          200]:  # try different values of n_estimators
    clf = RFC(n_estimators=i, random_state=42)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    acc = accuracy_score(pred, labels_test)
    if (acc > max_acc):
Example #8
lgr_parameters = {"penalty": ("l1", "l2"), "C": C_range}
sgd_parameters = {
    "loss": ("hinge", "log", "modified_huber", "squared_hinge", "perceptron",
             "squared_loss", "huber", "epsilon_insensitive",
             "squared_epsilon_insensitive"),
    "penalty": ("none", "l2", "l1", "elasticnet")
}
rfc_parameters = {"n_estimators": np.arange(50, 201, 10)}
efc_parameters = {}
# abc_parameters = {}
# gbc_parameters = {}

classifiers = [[LDA(), "LDA", lda_parameters], [SVC(), "SVC", svc_parameters],
               [LGR(), "LogReg", lgr_parameters],
               [SGD(), "StochGradDesc", sgd_parameters],
               [RFC(), "Random Forest", rfc_parameters],
               [EFC(), "Extra Tree", efc_parameters]]

# [KNN(), "KNearestNeighbor", knn_parameters],
# ,
#     [ABC(), "AdaBoost", abc_parameters],
#     [GBC(), "Gradient Boosting Classifier", gbc_parameters]

count = 0
clf_count = len(classifiers)
channels = data["X_train"].shape[1]

# T = Normalizer()

cv = ShuffleSplit(n_splits, test_size)
print()
print(df_wine.tail())
print()
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test =\
    T_T_S(X,y,test_size = .3, random_state = 0, stratify = y)
# stratify ensures same class proportions of training and test data sets
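
# Hedged sketch (illustrative; assumes numpy is imported as np): with stratify=y
# the per-class frequencies of the two splits match, which can be checked directly.
print('Train class ratios:', np.bincount(y_train) / len(y_train))
print('Test class ratios: ', np.bincount(y_test) / len(y_test))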
print('Training Data Size = ', len(X_train))
print('Test Data Size = ', len(X_test))
print()

pause()

feat_labels = df_wine.columns[1:]
forest = RFC(n_estimators=500, random_state=1)
forest.fit(X_train, y_train)

tic_fwd = time()
sfs_forward = SequentialFeatureSelector(forest,
                                        n_features_to_select=5,
                                        direction='forward').fit(
                                            X_train, y_train)
toc_fwd = time()

tic_bwd = time()
sfs_backward = SequentialFeatureSelector(forest,
                                         n_features_to_select=5,
                                         direction='backward').fit(
                                             X_train, y_train)
toc_bwd = time()
Example #10
print('Saving finished dataset')

finished_dataset = np.insert(instances, instances.shape[1], labels, axis=1)
with open('finished_dataset.csv', 'w') as output:
    writer = csv.writer(output, lineterminator='\n')
    for line in finished_dataset:
        writer.writerow(line)
"""
"""
The parameter class_weight can penalise mistakes on the minority class more
heavily, in order to mitigate the imbalance of the training dataset. Include
the argument probability=True when probability estimates are needed for SVM
algorithms (see the sketch below)."""
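
# Hedged sketch (not part of the original pipeline): an SVC configured with the
# options described above. class_weight='balanced' reweights classes inversely
# to their frequencies, and probability=True enables predict_proba.
weighted_svc_classifier = SVC(class_weight='balanced', probability=True)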
svc_classifier = SVC()
naiveb_classifier = GaussianNB()
randomfor_classifier = RFC(criterion='entropy', n_jobs=2, n_estimators=100)
#solver parameter works better with lbfgs instead of adam (tested beforehand)
ann_classifier = MLPClassifier(solver='lbfgs')
"""scikit-learn uses an optimised version of the CART algorithm; however, 
scikit-learn implementation does not support categorical variables for now.
CART (Classification and Regression Trees) is very similar to C4.5, but it 
differs in that it supports numerical target variables (regression) and does 
not compute rule sets. CART constructs binary trees using the feature and 
threshold that yield the largest information gain at each node."""
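
# Hedged sketch (illustrative, not used below): CART's split criterion can be
# chosen explicitly; either way the tree is grown with binary splits on the
# (feature, threshold) pair that best reduces impurity at each node.
entropy_tree_classifier = tree.DecisionTreeClassifier(criterion='entropy')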
tree_classifier = tree.DecisionTreeClassifier()
lr_classifier = LogisticRegression()

classify(svc_classifier, 'Support Vector Machines', smote=False)
classify(naiveb_classifier, 'Naive Bayes', smote=False)
classify(randomfor_classifier, 'Random Forest', smote=False)
classify(ann_classifier, 'Multi-layer Perceptron', smote=False)
Example #11
File: main.py  Project: furyjack/ML
                'AgeFill'] = median_ages[i,j]

df['AgeIsNull'] = pd.isnull(df.Age).astype(int)
df['FamilySize'] = df['SibSp'] + df['Parch']
df['Age*Class'] = df.AgeFill * df.Pclass
df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis=1)
df = df.dropna()

train_data = df.values
label = train_data[:, 1]

df = df.drop(['Survived'], axis=1)
train_data = df.values

X_train, X_test, Y_train, Y_test = skc.train_test_split(train_data, label)
clf = RFC(n_estimators=60)
clf.fit(X_train, Y_train)
print(clf.score(X_test, Y_test))

df1 = pd.read_csv('test.csv', header=0)
df1['Gender'] = df1['Sex'].map({'female': 0, 'male': 1}).astype(int)
#
df1['AgeFill'] = df1['Age']
median_ages = np.zeros((2, 3))
for i in range(0, 2):
    for j in range(0, 3):
        df1.loc[ (df1.Age.isnull()) & (df1.Gender == i) & (df1.Pclass == j+1),\
                'AgeFill'] = median_ages[i,j]

df1['AgeIsNull'] = pd.isnull(df1.Age).astype(int)
df1['FamilySize'] = df1['SibSp'] + df1['Parch']
Example #12
target = df[['survival']]
from sklearn import cross_validation as cv
splits = cv.train_test_split(feat, target, test_size=0.2)
xtrain, xtest, ytrain, ytest = splits
expected = [2, 2, 1, 1]

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report as CSR
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier as RFC

model_svc = SVC()
model_rfc = RFC()
model_LogRegr = LogisticRegression()
model_LinRegr = LinearRegression()

model_LinRegr.fit(xtrain, np.ravel(ytrain))
model_LogRegr.fit(xtrain, np.ravel(ytrain))
model_svc.fit(xtrain, np.ravel(ytrain))
model_rfc.fit(xtrain, np.ravel(ytrain))

pd1 = pd.read_csv('/home/khany1/allcode/datasets/t1.txt',
                  header=None,
                  names=['aa', 'bb', 'cc'])
pr_linregr = model_LinRegr.predict(pd1)
pr_logregr = model_LogRegr.predict(pd1)
pr_svc = model_svc.predict(pd1)
pr_rfc = model_rfc.predict(pd1)
Example #13
def MultiTrAdap(Xs, Ys, Xa, Ya, Xt, Yt, nIters=200):
    # s for source domain, a for auxiliary data, t for test data
    p = progressbar.ProgressBar()
    Xsa = np.concatenate((Xs, Xa))
    Ysa = np.concatenate((Ys, Ya))
    Ns = Ys.shape[0]
    Na = Ya.shape[0]

    Epss = []  # Epsilons in each iteration
    TestAcc = []  # Accuracy on test set in each iteration
    TestPrd = {}  # Predictions made in each iteration
    Weights = {}  # Sample weights in each iteration
    AdaAcc = []  # accuracy if only the weighted aux data is used (plain AdaBoost)

    Beta = 1 / (1 + np.sqrt(2 * np.log(Ns) / nIters))  # for updating the source samples
    Wsa = np.ones(Ns + Na) / (Ns + Na)  # Init the weights evenly
    p.start(nIters)
    for ni in range(nIters):
        Weights[ni] = Wsa
        #---- update P, train and predict ----
        Psa = Wsa / np.sum(Wsa)
        clf = RFC(n_estimators=5, criterion='entropy', max_depth=2)
        #    clf = LRC(solver = 'liblinear',multi_class='ovr')
        #    clf = LinearSVC(multi_class='ovr')
        #    clf = TreeC(splitter='best',max_depth=3)
        # update the W
        if 0:
            # Update the weights alternatively: train on A, prd on S => update S; then train on S, prd on A, => update A
            clf.fit(Xa, Ya, sample_weight=Psa[-Na:])
            YsPrd = clf.predict(Xs)
            RorWs = 1 * (YsPrd != Ys)
            clf.fit(Xs, Ys, sample_weight=Psa[:Ns])
            YaPrd = clf.predict(Xa)
            RorWa = 1 * (YaPrd != Ya)
            RorW = np.concatenate((RorWs, RorWa))
        else:
            # Normal TrAdaBoost, Train on S&A, prd on S&A
            clf.fit(Xsa, Ysa, sample_weight=Psa)
            YsaPrd = clf.predict(Xsa)
            # calculate the accuracy on XYa
            RorW = 1 * (YsaPrd != Ysa)

        Eps = np.sum((Wsa * RorW)[-Na:]) / np.sum(
            Wsa[-Na:])  # Epss are only from A data
        Epss.append(1 - Eps)
        # adjust Eps
        if Eps >= 0.4:
            Eps = 0.4
        elif Eps <= 0:
            Eps = 0.01

        # Weight update
        if 1:
            Alpha = np.sqrt(Eps / (1 - Eps))
            # Alpha = Eps/(1-Eps) # This is the original update from Dai's TrAdaBoost paper
            Coef = np.concatenate(
                (Beta * np.ones(Ns), (1 / Alpha) * np.ones(Na)))
            wUpdate = np.power(Coef, RorW)
        else:
            # Update with momentum
            Alpha = np.sqrt((1 - Eps) / (1 + Eps))
            Ct = 2.5 * (1 - Eps)
            Coef = np.concatenate(
                (Ct * Beta * np.ones(Ns), Alpha * np.ones(Na)))
            wUpdate = np.power(Coef, -25 * RorW / nIters)
        # Now update
        Wsa = Wsa * wUpdate
        # result & summary
        Yprd = clf.predict(Xt)
        TestPrd[ni] = Yprd
        TestAcc.append(Metrics.Accuracy(Yt, Yprd))

        clf.fit(Xa, Ya, sample_weight=Psa[-Na:])
        AdaAcc.append(Metrics.Accuracy(Yt, clf.predict(Xt)))
        p.update(ni + 1)  # progress bar
    #    print(np.mean(Target))
    p.finish()
    return Weights, Epss, TestPrd, TestAcc, AdaAcc
Example #14


#============================================================
# 3 data sets are tested: synthetic, UCI heart disease, Amazon+Webcam
idChanged = []
Xs, Ys, Xa, Ya, Xt, Yt, idChanged = Datasets.gen_noisy_classi_data()
#Xs,Ys,Xa,Ya,Xt,Yt = Datasets.load_heart()
#Xs,Ys,Xa,Ya,Xt,Yt = Datasets.load_pics()

nIters = 50
# Baseline, from A to T
clf0 = RFC(n_estimators=5, criterion='entropy', max_depth=2)
clf0.fit(Xa, Ya)
Acc0 = Metrics.Accuracy(Yt, clf0.predict(Xt))

SPweights, Acc_auxi, All_test_prd, Acc_test, Acc_AdaOnly = MultiTrAdap(
    Xs, Ys, Xa, Ya, Xt, Yt, nIters=nIters)

PrdDf = pd.DataFrame.from_dict(All_test_prd)
HalfDf = PrdDf.iloc[:, round(nIters / 2):]  # use the last half only
BoostPrd = HalfDf.mode(axis=1)  # Boosting: simply vote
AccB = Metrics.Accuracy(Yt, BoostPrd[0])

# ================ Plot===========================
import matplotlib.pyplot as plt
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
    def __init__(self):
        self.model = RFC(n_estimators=10, n_jobs=7)  # MLPClassifier([50, 10]) # BernoulliNB()
Example #16
orig_stdout = sys.stdout
o = open('trainingtime.txt', 'w')
sys.stdout = o

data = pd.concat([data_neg, data_pos])
data.index = range(len(data.index))

vectors_FI = data[data.columns[4:]]
labels_FI = data[data.columns[3]]


rfc1 = RFC(
    n_estimators=50,
    max_features=10,
    max_depth=30,
    min_samples_split=3,
    criterion="entropy",
    n_jobs=-1
)


start = time.time()

rfc1.fit(vectors_FI, labels_FI)

end = time.time()
print ('PROCESSING TIME = ', end - start)


FI = rfc1.feature_importances_
Example #17
            estimator_result = []
            for tree in self.trees:
                estimator_result.append(tree.predict(x.reshape(1, -1))[0])

            results.append(np.mean(estimator_result))
        return np.array(results)


if __name__ == "__main__":
    X, y = make_classification(n_samples=200,
                               n_features=8,
                               n_informative=4,
                               random_state=2)

    RF1 = RandomForestClassifier(n_estimators=10, max_depth=3)
    RF2 = RFC(n_estimators=10, max_depth=3)

    RF1.fit(X, y)
    res1 = RF1.predict(X)

    RF2.fit(X, y)
    res2 = RF2.predict(X)

    print('Fraction of identical results', (np.abs(res1 - res2) < 1e-5).mean())

    X, y = make_regression(n_samples=200, random_state=2)

    RF1 = RandomForestRegressor(n_estimators=10, max_depth=3)
    RF2 = RFR(n_estimators=10, max_depth=3)

    RF1.fit(X, y)
Example #18
for cluster_id in np.unique(best_model.labels_):
    # print("Cluster", cluster_id)
    in_cluster = best_model.labels_ == cluster_id
    faces = X_train[in_cluster].reshape(-1, 64, 64)
    labels = y_train[in_cluster]
    # plot_faces(faces, labels)

from sklearn.ensemble import RandomForestClassifier as RFC
# rfc = RFC(n_estimators = 150, random_state = 42)
# rfc.fit(X_train_pca, y_train)
# print(rfc.score(X_valid_pca, y_valid)) # 0.9

X_train_reduced = best_model.transform(X_train_pca)
X_valid_reduced = best_model.transform(X_valid_pca)
X_test_reduced = best_model.transform(X_test_pca)
rfc = RFC(n_estimators=150, random_state=42)
rfc.fit(X_train_reduced, y_train)
# print(rfc.score(X_valid_reduced, y_valid)) # 0.75

from sklearn.pipeline import Pipeline
for n_clusters in k_range:
    pipeline = Pipeline([("kmeans",
                          KMeans(n_clusters=n_clusters,
                                 random_state=n_clusters)),
                         ("forest_clf", RFC(n_estimators=150,
                                            random_state=42))])
    pipeline.fit(X_train_pca, y_train)
    # print(n_clusters, pipeline.score(X_valid_pca, y_valid))

X_train_extended = np.c_[X_train_pca, X_train_reduced]
X_valid_extended = np.c_[X_valid_pca, X_valid_reduced]
Example #19
################################################################################
"""

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import sys
from time import time
import math

myClassifier1 = RFC(n_estimators=1000,
                    criterion="entropy",
                    min_samples_split=80,
                    max_features=None,
                    n_jobs=-1)

startTrainingTime = time()
myClassifier1.fit(features_train, labels_train)
print "The training time is: ", round(time() - startTrainingTime, 3), "seconds"

startPredictionTime = time()
myPredictions1 = myClassifier1.predict(features_test)
print "The prediction time is: ", round(time() - startPredictionTime,
                                        3), "seconds"

accuracy1 = accuracy_score(labels_test, myPredictions1)
print "Accuracy is: ", accuracy1
Example #20
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score

train_data = pd.read_csv(
    r'G:\pycharm_pyproject\AiLearning\my_try\caicai_try\dataset\digit_data\digit_data_train.csv'
)
X_raw = train_data.iloc[:, 1:]
Y = train_data.iloc[:, 0]
X_raw = X_raw.values
Y = Y.values.reshape(-1, 1)
print(X_raw.shape)
print(Y.shape)
print('*' * 50)

rfc = RFC(n_estimators=10, random_state=0)
feature_to_select = 340
step = 50
selector = RFE(rfc, n_features_to_select=feature_to_select,
               step=50).fit(X_raw, Y.flatten())

# support_: boolean array indicating whether each feature was selected
# print(selector.support_)
# print('*'*50)
# print(selector.support_.sum())
# print('*'*50)
# ranking_: overall ranking of the features (see the scoring sketch below)
# print(selector.ranking_)
# print('*'*50)

# X_wrapper = selector.transform(X_raw)
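
# Hedged sketch (assumption, not in the original snippet): score the RFE-selected
# feature subset with the cross_val_score imported above.
X_wrapper = selector.transform(X_raw)
print(cross_val_score(rfc, X_wrapper, Y.flatten(), cv=5).mean())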
Example #21
    def RFCClassifier(cls):
        n_estimators_range = np.arange(10, 260, 30)
        param_grid = {'n_estimators': n_estimators_range}

        return cls(RFC(n_estimators=100), par_grid_dict=param_grid)
Default = data['Default']
data.drop(['Default'], axis=1, inplace=True)
plt.scatter(data['PAY_0'], data['PAY_2'])
#Removing multicollinear variables
data['PAY_5'] = (data['PAY_5'] + data['PAY_6']) / 2
data['PAY_2'] = (data['PAY_2'] + data['PAY_3']) / 2
multi = ['PAY_6', 'PAY_4', 'PAY_3']
data.drop(multi, axis=1, inplace=True)
data.drop(['PAY_2'], axis=1, inplace=True)

#Varying n_estimators in RFC and GBC to analyze the variance in accuracy
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBC

Accuracy = []
for i in range(1, 5):
    rfc_model = RFC(max_features=3, n_jobs=5, n_estimators=i * 10)
    rfc_model.fit(x_tr, y_tr)
    Accuracy.append(rfc_model.score(x_te, y_te))

accuracy = []
for i in range(1, 5):
    gbc_model = GBC(max_leaf_nodes=5, n_estimators=i * 10)
    gbc_model.fit(x_tr, y_tr)
    accuracy.append(gbc_model.score(x_te, y_te))

fig = plt.figure(figsize=(7, 7))
plt.plot(np.arange(1, 5), Accuracy, c='r')
plt.plot(np.arange(1, 5), accuracy, c='g')
plt.xlabel('variable')
plt.ylabel('accuracy')
Example #23
# print(type(allY_train[0]))
# print("\n\nSAX train:", SAX_train)
# print("\n\nSAX test:", SAX_train)
# print("\n\nSAY train:", SAY_train)
# print("\n\nSAY test:", SAY_train)

allY_test = allY_test.astype(float)
allY_train = allY_train.astype(float)
SAY_test = SAY_test.astype(float)
SAY_train = SAY_train.astype(float)

all_kNN = KNeighborsClassifier()
SA_kNN = KNeighborsClassifier()
all_Tree = tree.DecisionTreeClassifier()
SA_Tree = tree.DecisionTreeClassifier()
all_RandFor = RFC(n_estimators=25)
SA_RandFor = RFC(n_estimators=25)

all_kNN.fit(allX_train, allY_train)
SA_kNN.fit(SAX_train, SAY_train)
all_Tree.fit(allX_train, allY_train)
SA_Tree.fit(SAX_train, SAY_train)
all_RandFor.fit(allX_train, allY_train)
SA_RandFor.fit(SAX_train, SAY_train)

model_tuples = [(all_Tree, 'All Tree Classifier'),
                (SA_Tree, 'SA Tree Classifier'),
                (all_kNN, "All kNN Classfier"), (SA_kNN, 'SA kNN Classifier'),
                (all_RandFor, "All Random Forest Classifier"),
                (SA_RandFor, 'SA Random Forest Classifier')]
Example #24
    test_target = effects[test_split:]
    i = 0

    test_target = np.asarray(test_target)

    fp_train = []
    for mol in train_sm:
        fp_train.append(mol2imageT(mol, N=2048))
    fp_train = np.asarray(fp_train)

    fp_test = []
    for mol in test_sm:
        fp_test.append(mol2imageT(mol, N=2048))
    fp_test = np.asarray(fp_test)

    classifier = RFC(n_estimators=100, oob_score=True)
    '''
	print fp_train 
	print "------printing special fp-train-------"
	print fp_train[:,None]
	'''

    classifier.fit(fp_train, train_target.ravel())

    #using oob forest for the data set
    pred_ans = classifier.predict(fp_test)
    #pred_ans = classifier.oob_decision_function_

    #get rounded answers
    binary_pred_ans = []
    for p in pred_ans:
features_train, features_test, labels_train, labels_test = model_selection.train_test_split(
    training_data[features],
    training_data['target'],
    test_size=0.3,
    random_state=0)

# parameters
parameters = {
    'n_estimators': [20, 25],
    'random_state': [0],
    'max_features': [2],
    'min_samples_leaf': [150, 200, 250]
}

# implementing my classifier
model = RFC(n_jobs=-1)
grid = GS(estimator=model, param_grid=parameters)
grid.fit(features_train, labels_train)

# Calculate the logloss of the model
prob_predictions_class_test = grid.predict(features_test)
prob_predictions_test = grid.predict_proba(features_test)

logloss = log_loss(labels_test, prob_predictions_test)

accuracy = accuracy_score(labels_test,
                          prob_predictions_class_test,
                          normalize=True,
                          sample_weight=None)

# predict class probabilities for the tourney set
Example #26
	def run_CV(self):

		cvIter = 0
		
		totalInstanceNum = len(self.label)
		print("totalInstanceNum\t", totalInstanceNum)
		indexList = [i for i in range(totalInstanceNum)]

		print("featureNum", len(self.fn[0]))
		# print("non zero feature num", sum(self.fn[0]))

		totalTransferNumList = []
		np.random.seed(3)
		np.random.shuffle(indexList)

		foldNum = 10
		foldInstanceNum = int(totalInstanceNum*1.0/foldNum)
		foldInstanceList = []

		for foldIndex in range(foldNum-1):
			foldIndexInstanceList = indexList[foldIndex*foldInstanceNum:(foldIndex+1)*foldInstanceNum]
			foldInstanceList.append(foldIndexInstanceList)

		foldIndexInstanceList = indexList[foldInstanceNum*(foldNum-1):]
		foldInstanceList.append(foldIndexInstanceList)
		# kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
		cvIter = 0
		# random.seed(3)
		totalAccList = [0 for i in range(10)]

		posRatioList = []

		# self.PCAFeature(10)

		for foldIndex in range(foldNum):
			
			# self.m_clf = LinearSVC(random_state=3)
			# self.m_clf = SVC(random_state=3)
			# self.m_clf = LR(random_state=3)
			self.m_clf = RFC(random_state=3)

			train = []
			for preFoldIndex in range(foldIndex):
				train.extend(foldInstanceList[preFoldIndex])

			test = foldInstanceList[foldIndex]
			for postFoldIndex in range(foldIndex+1, foldNum):
				train.extend(foldInstanceList[postFoldIndex])

			# trainNum = int(totalInstanceNum*0.2)
			# print("trainNum", trainNum)
			
			fn_test = self.fn[test]
			label_test = self.label[test]

			fn_train = self.fn[train]
			label_train = self.label[train]

			testOneNum = np.sum(label_test)
			testNum = len(fn_test)

			posRatio = testOneNum*1.0/testNum
			posRatioList.append(posRatio)

			self.m_clf.fit(fn_train, label_train)

			label_preds = self.m_clf.predict(fn_test)
			acc = accuracy_score(label_test, label_preds)

			totalAccList[cvIter] = acc

			cvIter += 1      
		
		print("posRatioList", posRatioList, np.mean(posRatioList), np.sqrt(np.var(posRatioList)))

		print("totalAccList", totalAccList, np.mean(totalAccList), np.sqrt(np.var(totalAccList)))

		totalACCFile = modelVersion+".txt"
		f = open(totalACCFile, "w")
		for i in range(10):
			f.write(str(totalAccList[i]))
			# for j in range(totalAlNum):
			# 	f.write(str(totalAccList[i][j])+"\t")
			f.write("\n")
		f.close()
Example #27
def oneTrialWithCertainTrainSize(
        num_pos_sample=50,
        neg_pos_ratio=1,
        pos_training_dataset=None,
        pos_testing_dataset=None,
        neg_dataset=None,
        train_test_split=0,
        # obsolete feature, keep default parameter to bypass; functionality achieved by the "num_pos_sample" param
        test_stratify=True,
        # obsolete feature, keep default parameter to bypass; functionality achieved by the "num_pos_sample" param
        scoring="f1",
        plt_or_not=True):
    assert (type(pos_training_dataset) == list
            and type(neg_dataset) == list), "input datasets should be lists"

    num_neg_sample = int(num_pos_sample * neg_pos_ratio)

    # take sample of num_pos_sample number of positive examples
    (posPicked, posNotPicked) = takingSamples(pos_training_dataset,
                                              num=num_pos_sample)
    (negPicked, negNotPicked) = takingSamples(neg_dataset, num=num_neg_sample)

    # create train_X, train_y
    train_X = pd.DataFrame(posPicked + negPicked)
    train_y = np.array([1 for i in range(len(posPicked))] +
                       [0 for i in range(len(negPicked))])

    # create test_X and test_y
    if train_test_split != 0:
        testSize = int((num_pos_sample + num_neg_sample) / train_test_split *
                       (1 - train_test_split))  # size of test set
        if test_stratify:
            testPosSize = int(float(testSize) / (neg_pos_ratio + 1))
            testNegSize = testSize - testPosSize
            test_X = pd.DataFrame(
                takingSamples(posNotPicked, num=testPosSize)[0] +
                takingSamples(negNotPicked, num=testNegSize)[0])  #
            test_y = np.array([1 for i in range(testPosSize)] +
                              [0 for i in range(testNegSize)])
        else:
            for idx in range(len(posNotPicked)):
                posNotPicked[idx].append(1)
            for idx in range(len(negNotPicked)):
                negNotPicked[idx].append(0)
            test_X = pd.DataFrame(
                takingSamples(posNotPicked + negNotPicked, num=testSize)[0])

            # read the appended label flag from each row, then drop that column
            test_y = np.array([1 if row[-1] == 1 else 0 for row in test_X.values])
            test_X = test_X.iloc[:, :-1]

    else:
        if (pos_testing_dataset == None):
            test_X = pd.DataFrame(posNotPicked + negNotPicked)
            test_y = np.array([1 for i in range(len(posNotPicked))] +
                              [0 for i in range(len(negNotPicked))])
        else:
            test_X = pd.DataFrame(pos_testing_dataset + negNotPicked)
            test_y = np.array([1 for i in range(len(pos_testing_dataset))] +
                              [0 for i in range(len(negNotPicked))])

    # train and test the model
    reg = RFC(n_estimators=100)
    # reg = RFC(n_estimators=200, max_features='log2')
    # reg = LogisticRegressionCV(scoring=scoring)
    LogModel = reg.fit(train_X, train_y)
    y_predlog = LogModel.predict_proba(test_X)
    y_predlog_1 = y_predlog[:, 1]

    prec, rec, thresholds = precision_recall_curve(test_y, y_predlog_1)
    if plt_or_not:
        plt.plot(rec, prec)
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title("Rec-Prec Curve of Logistic Regression Trials")

    # pred_combine sorted
    pred_combine = []
    for i in range(len(test_y)):
        pred_combine.append((y_predlog_1[i], test_y[i]))

    pred_combine = sorted(pred_combine, key=operator.itemgetter(0))

    # create an array of 0.1:0.01:0.99
    thres_new = []
    initial = 0.1
    while initial <= 0.99:
        thres_new.append(initial)
        initial += 0.01
        initial = round(initial, 2)

    # generate "threshold, prec, rec, f1" list
    # test_y is truth, y_predlog_1 is prob of being 1
    result = []
    item_index = 0

    FN_accu = 0
    TN_accu = 0
    TP_accu = list(test_y).count(1)
    FP_accu = list(test_y).count(0)

    for i in thres_new:  # i is [0.1:0.01:0.99]
        if (item_index < len(pred_combine)):
            while pred_combine[item_index][0] < i:
                if pred_combine[item_index][
                        1] == 1:  # this item actually 1, predict as 0
                    FN_accu += 1
                    TP_accu -= 1
                else:  # this item is actually 0, predict as 0, pred_combine[item_index][1] == 0
                    TN_accu += 1
                    FP_accu -= 1
                item_index += 1
                if (item_index == len(pred_combine)): break

        # print "th: " + str(i) + ", TP: " + str(TP_accu) + ", FP: " + str(FP_accu) + ", FN: " + str(FN_accu) + ", TN: " + str(TN_accu)

        if (TP_accu == 0):
            preci = 0
        else:
            preci = float(TP_accu) / (TP_accu + FP_accu)

        if (TP_accu == 0):
            recal = 0
        else:
            recal = float(TP_accu) / (FN_accu + TP_accu)

        if (2 * preci * recal == 0):
            fone = 0
        else:
            fone = 2 * preci * recal / (preci + recal)

        result.append([i, preci, recal, fone])

    return result  # 90
Example #28
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer=text_process).fit(X)

print(len(vectorizer.vocabulary_))
X = vectorizer.transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import VotingClassifier as VC
mnb = MNB(alpha=10)
lr = LR(random_state=101)
rfc = RFC(n_estimators=80, criterion="entropy", random_state=42, n_jobs=-1)
clf = VC(estimators=[('mnb', mnb), ('lr', lr), ('rfc', rfc)], voting='hard')

clf.fit(X_train,y_train)

predict = clf.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, predict))
print('\n')
print(classification_report(y_test, predict))


def predictor(s):
    s = vectorizer.transform(s)
    pre = clf.predict(s)
Example #29
import pandas as pd
import numpy as np
df = pd.read_csv("/home/shaury/Downloads/nptel/Iris.csv", delimiter=",")
x, y = df[["SepalLengthCm", "SepalWidthCm", "PetalLengthCm",
           "PetalWidthCm"]], df["Species"]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=1,
                                                    test_size=0.15)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda
l1 = lda(n_components=1)
x_train = l1.fit_transform(x_train, y_train)
x_test = l1.transform(x_test)

from sklearn.ensemble import RandomForestClassifier as RFC
cl = RFC(max_depth=2, random_state=0)
cl.fit(x_train, y_train)
y_pred = cl.predict(x_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))
                if word_count == 0:  # handle the exceptional case where tweet is empty.
                    tweet_mean_vec = np.zeros((K, 1))
                else:
                    tweet_mean_vec = sum_vec / word_count

                if file_no == 0:  # assign positive labels
                    tweet_label = 1
                elif file_no == 1:  # assign negative labels
                    tweet_label = -1
                else:
                    raise "Out Of Range File Error"

                x_train[:, index] = tweet_mean_vec.flatten()
                y_train[index] = tweet_label

    with open('train_data.pkl', 'wb') as train_data_picklefile:
        pickle.dump((x_train, y_train), train_data_picklefile)

else:
    with open('train_data.pkl', 'rb') as train_data_picklefile:
        x_train, y_train = pickle.load(train_data_picklefile)

# Classification
rfc = RFC()
print(extract_features('I am happy', K))
# print(vector_dict['good'])
# print(x_train.shape, y_train.shape)
# rfc.fit(np.transpose(x_train), y_train.flatten())
# print(rfc.predict(np.array(vector_dict['good']).reshape(1, K)))