Example #1
import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier as rfc

# load data into pandas data frame
trdata, testdata = mg.loadData()

# get the id's for the test set
testid = np.array(testdata.UserID)

testdata = testdata.drop('UserID', axis=1)

# try several maximum tree depths and compare out-of-bag accuracy

depthlist = [3, 5, 10, 15, 20, 50, 100]

for i in depthlist:

    model = rfc(n_estimators=10,
                oob_score=True,
                max_features=None,
                max_depth=i)

    model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])

    accur = model.oob_score_

    print('max_depth = %d, out-of-bag accuracy: %f\n' % (i, accur))

# generate predictions with the last model fit in the loop above
# (max_depth=100); take the probability of the positive class
preds = np.array(model.predict_proba(testdata))[:, 1]
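
# Not in the original script: a minimal sketch of writing the predictions
# out alongside the saved test ids; the column names and output filename
# are assumptions.
import pandas as pd

submission = pd.DataFrame({'UserID': testid, 'prediction': preds})
submission.to_csv('predictions.csv', index=False)  # assumed output path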
Example #2
import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier as rfc

'''
Use a random forest classifier to predict Titanic survivors.
Trains on train.csv (in the data subfolder), predicts from test.csv,
and writes the predictions out to a .csv in the predictions subfolder.
As is, this gives ~77% accuracy on the test set;
it can hit ~79% with some tweaking (currently overfits).
'''

# load data into pandas data frame
trdata, testdata = mg.loadData()

# get the id's for the test set
testid = np.array(testdata.PassengerId)

# determine if each passenger has a known surviving family member
trdata, testdata = mg.addFamSurvivors(trdata, testdata)

# munge the data to generate one-hot labels for gender, titles, ticket departments
trdata = mg.mungeData(trdata)
testdata = mg.mungeData(testdata)


# initialize classifier; feature importances are available after fitting
# via model.feature_importances_
model = rfc(n_estimators=1000, oob_score=True)
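
# Not in the original script: a minimal sketch of the fit / predict /
# write-out steps the docstring describes, assuming (as in Example #1)
# that column 0 of the munged trdata is the label and that mungeData
# leaves only feature columns in testdata; the output path is assumed.
import pandas as pd

model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
print('Out-of-bag accuracy: %f' % model.oob_score_)

preds = model.predict(testdata).astype(int)
submission = pd.DataFrame({'PassengerId': testid, 'Survived': preds})
submission.to_csv('predictions/rf_predictions.csv', index=False)  # assumed path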

Example #3

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
# from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline
import mungetools as mg

n_estimator = 10
# X, y = make_classification(n_samples=10)

# print X
# print y

trdata = mg.loadData()

X_train, X_test = trdata.iloc[:500, 1:], trdata.iloc[500:, 1:]
y_train, y_test = trdata.iloc[:500, 0], trdata.iloc[500:, 0]
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr = X_train.iloc[:250, :], X_train.iloc[250:, :]
y_train, y_train_lr = y_train.iloc[:250], y_train.iloc[250:]

print(X_train)
print(y_train)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)  # n_estimators/random_state assumed
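
# The embedding is typically chained with a logistic regression on its
# one-hot leaf encoding; this sketch follows the standard scikit-learn
# "feature transformations with ensembles of trees" recipe and is an
# assumption about where the truncated script was headed.
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)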

Example #4

# the function header and loop setup below are reconstructed (assumed)
# around the surviving loop body of an SVM grid search
import numpy as np
import mungetools as mg
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def gridSearch(X, y, testc, testg):
    scores = np.zeros(len(testc) * len(testg))
    paramholder = np.zeros((len(testc) * len(testg), 2))
    bestscore, bestmodel, counter = -np.inf, None, 0
    for c in testc:
        for g in testg:
            model = SVC(C=c, gamma=g if g > 0 else 'auto')  # g=0 meant 'auto' in old scikit-learn
            scorei = np.mean(cross_val_score(model, X, y, cv=5))
            scores[counter] = np.mean(scorei)
            paramholder[counter, 0] = c
            paramholder[counter, 1] = g
            if scorei > bestscore:
                bestscore = scorei
                bestmodel = model
            counter += 1
            print('Score = %f with c: %f, g: %f' % (scorei, c, g))
    bestc = paramholder[scores.argmax(), 0]
    bestg = paramholder[scores.argmax(), 1]
    print('Best score of %f with c: %f, g: %f' % (bestscore, bestc, bestg))
    return bestmodel

trdata = mg.loadData()

# testid = np.array(testdata.UserID)

# trdata = trdata.drop(['coursecount'],axis=1)

# testdata = testdata.drop(['UserID'],axis=1)

# print trdata
# print testdata

# initialize classifier
# try several values of c (prediction error weight) and g (kernel width)
testc = [0.05, 0.1, 0.3, 0.6, 1, 3, 5, 10]
testg = [0, 0.01, 0.05, 0.1, 0.5, 1, 1.5]
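
# Not in the original fragment: a hedged usage sketch that feeds these
# grids to the reconstructed gridSearch above, assuming column 0 of
# trdata holds the label.
X, y = trdata.values[:, 1:], trdata.values[:, 0]
bestmodel = gridSearch(X, y, testc, testg)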

Example #5

import numpy as np

features_list = []

# parse attribute names from the header of an ARFF-style file; file_data
# was undefined in the original fragment, and the 'data/features.arff'
# filename is an assumption
with open('data/features.arff') as f:
    file_data = f.readlines()

for line in file_data:
    if '@' in line and '{' in line:
        feature = line.split()[1]
        features_list.append(feature)

# print(features_list)

features_list = np.asarray(features_list)

print(type(features_list))

Example #6

import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier

input_df = mg.loadData()
X = input_df.values[:, 1:]
y = input_df.values[:, 0]
# down-weight the majority class (non-survivors, s == 0) so the forest
# does not over-predict deaths
survived_weight = .75
y_weights = np.array([survived_weight if s == 0 else 1 for s in y])

print "Rough fitting a RandomForest to determine feature importance..."
forest = RandomForestClassifier(oob_score=True, n_estimators=10)
forest.fit(X, y, sample_weight=y_weights)
feature_importance = forest.feature_importances_
# scale so the most important feature scores 100
feature_importance = 100.0 * (feature_importance / feature_importance.max())

print(feature_importance)

# keep features scoring at least 30% of the top feature's importance
fi_threshold = 30
important_idx = np.where(feature_importance > fi_threshold)[0]
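
# Not in the original fragment: a hedged continuation that keeps only the
# important features and refits; the n_estimators value is arbitrary.
X_important = X[:, important_idx]
forest = RandomForestClassifier(oob_score=True, n_estimators=100)
forest.fit(X_important, y, sample_weight=y_weights)
print('Out-of-bag accuracy on important features: %f' % forest.oob_score_)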