# Example #1
# 0
class ExtraTreesClassifierImpl():
    """Thin wrapper that forwards hyperparameters and calls to ``SKLModel``.

    The constructor arguments are stored in ``self._hyperparams`` and passed
    unchanged, as keyword arguments, to ``SKLModel``; the object it returns is
    kept in ``self._wrapped_model`` and receives every ``fit`` / ``predict`` /
    ``predict_proba`` call.
    """

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight='balanced'):
        """Record all hyperparameters and build the wrapped model from them."""
        self._hyperparams = dict(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model on ``X`` (plus ``y`` when given); return ``self``."""
        # ``y`` is only forwarded when the caller actually supplied it.
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict(X)`` result."""
        model = self._wrapped_model
        return model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)`` result."""
        model = self._wrapped_model
        return model.predict_proba(X)
def classify(X, y, cv):
    """Cross-validate and fit an ``ExtraTreesClassifier``, printing binary metrics.

    The classifier is scored with ``cross_val_score`` using ``cv`` folds, then
    fitted on all of ``X`` / ``y`` and used to predict ``X`` again.  The
    predictions are compared with ``y`` to print confusion counts and the
    ratios derived from them.

    Counting only recognises the labels ``0`` (negative) and ``1`` (positive):
    a correct prediction with any other label, or a wrong prediction that is
    neither ``0`` nor ``1``, is not counted in any bucket.

    Args:
        X: feature matrix passed to ``cross_val_score``, ``fit`` and ``predict``.
        y: target labels, iterated in the same order as the predictions.
        cv: forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns:
        The classifier fitted on the full ``X`` / ``y``.
    """
    def _ratio(numerator, denominator):
        # An empty denominator (e.g. no positive predictions) used to raise
        # ZeroDivisionError and abort before the fitted model was returned;
        # report NaN for the undefined ratio instead.
        if denominator == 0:
            return float('nan')
        return float(numerator) / denominator

    # Alternatives left commented out by the original author:
    # DecisionTreeClassifier(), RandomForestClassifier(), AdaBoostClassifier()
    clf = ExtraTreesClassifier()
    score = cross_val_score(clf, X, y, cv=cv)
    # Single-argument print(...) produces the same output whether it is parsed
    # as a Python 2 print statement or as the Python 3 function.
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score) / score.shape[0]))
    clf = clf.fit(X, y)

    # Predict once and reuse the result for both the counter and the counts.
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(preds))

    tp = tn = fp = fn = 0
    # zip walks the values positionally, so y may be any iterable of labels
    # (including one whose index is not 0..n-1).
    for truth, pred in zip(y, preds):
        if truth == pred:
            if pred == 0:
                tn += 1
            elif pred == 1:
                tp += 1
        elif pred == 1:
            fp += 1
        elif pred == 0:
            fn += 1

    print('correct positives: %s' % tp)
    print('correct negatives: %s' % tn)
    print('false positives: %s' % fp)
    print('false negatives: %s' % fn)
    print('precision: %s' % _ratio(tp, tp + fp))
    print('recall (tp)/(tp+fn): %s' % _ratio(tp, tp + fn))
    print('false positive rate (fp)/(fp+tn): %s' % _ratio(fp, fp + tn))
    print('false positive rate2 (fp)/(fp+tp): %s' % _ratio(fp, fp + tp))
    print('prediction accuracy: %s%s\n' % (100 * _ratio(tp + tn, tp + tn + fp + fn), '%'))
    return clf
# Full data set (used for cross-validation) and a 60/40 train/test split of it
# (used for the confusion matrix).
x, y = dfeatures[features], dfeatures.author_num.values
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=123)

# CLASSIFIER
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

scores = cross_val_score(etclf, x, y)
# Single-argument print(...) gives the same output under Python 2 and 3.
print(scores.mean())

# Print Confusion Matrix
# confusion_matrix expects (y_true, y_pred); the arguments were previously
# swapped, which transposed the matrix (rows must be the true labels).
print(metrics.confusion_matrix(y_test, etclf.predict(x_test)))
# print authors
"""
# # PREVIOUS RESULT 0.671469386087

############# RESULT WITH ALL FEATURES ############
/Users/jhave/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:401: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3.
  % (min_labels, self.n_folds)), Warning)
0.148101533384
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 2 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
"""
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn import metrics
from sklearn import preprocessing

# Load the authorship data and encode each author name as an integer label.
authorship = read_csv("http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv")
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship["Author_num"] = le.transform(authorship["Author"])

# What are some of the stop words we're looking at?
# Every column except the author name and its numeric encoding is a feature.
features = list(authorship.columns)
features.remove("Author")
features.remove("Author_num")

# Create a random variable (random forests work best with a random variable)
# and create a test and training set
# The column length follows the frame instead of a hard-coded 841, so the
# assignment keeps working if the CSV gains or loses rows.
# NOTE(review): "random" is added after `features` was built, so it is not
# among the model inputs below -- confirm whether that is intended.
authorship["random"] = [random.random() for _ in range(len(authorship))]
x_train, x_test, y_train, y_test = train_test_split(
    authorship[features], authorship.Author_num.values, test_size=0.4, random_state=123
)


# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Print Confusion Matrix
# The matrix was previously computed and discarded; print it as the heading
# says.  confusion_matrix expects (y_true, y_pred), so the true labels go first.
print(metrics.confusion_matrix(y_test, etclf.predict(x_test)))
# Example #5
# 0
import random
from pandas import read_csv
from sklearn.cross_validation import train_test_split
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn import metrics
from sklearn import preprocessing
# Load the authorship data and encode each author name as an integer label.
authorship = read_csv('http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv')
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship['Author_num'] = le.transform(authorship['Author'])

# What are some of the stop words we're looking at?
# Every column except the author name and its numeric encoding is a feature.
features = list(authorship.columns)
features.remove('Author')
features.remove('Author_num')

# Create a random variable (random forests work best with a random variable)
# and create a test and training set
# The column length follows the frame instead of a hard-coded 841, so the
# assignment keeps working if the CSV gains or loses rows.
# NOTE(review): 'random' is added after `features` was built, so it is not
# among the model inputs below -- confirm whether that is intended.
authorship['random'] = [random.random() for _ in range(len(authorship))]
x_train, x_test, y_train, y_test = train_test_split(authorship[features], authorship.Author_num.values, test_size=0.4, random_state=123)


# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Predict once and reuse the result for both reports.
y_pred = etclf.predict(x_test)

# Print Confusion Matrix
# Both metrics expect (y_true, y_pred).  With the arguments swapped the matrix
# was transposed and the report's precision and recall columns were exchanged.
# The matrix was also computed without being printed.
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
# Example #6
# 0
# Also tested svm.SVC(kernel='linear', C=1.0) and GaussianNB():
# they don't improve the score and take long to run.

# Run the cross-validation score on all classifiers.
for clf in classifiers:
    score = cross_val_score(clf, X, y, cv=cv)
    # Single-argument print(...) gives the same output under Python 2 and 3.
    print("%s \n Accuracy: %0.2f (+/- %0.2f)\n" % (clf, score.mean(), score.std() / 2))

# Now the out-of-sample (OOS) test set.
# .copy() makes testX an independent frame, so filling Fare below does not
# write into a slice of `test` (chained assignment).
testX = test[['Sex01','Fare','SibSp','Parch','Pclass']].copy()
medianFare = testX.Fare.median()
testX['Fare'] = testX['Fare'].fillna(medianFare)

# Write one Kaggle submission CSV per classifier.
for clf, csv_name in (
        (ExtraTreesClassifier(), 'ETClf.csv'),
        (RandomForestClassifier(), 'RFClf.csv'),
        (DecisionTreeClassifier(), 'DTClf.csv')):
    clf.fit(X, y)
    # Assign the prediction array directly: wrapping it in pd.Series would
    # align on index labels and produce NaN if `test` does not have a
    # default 0..n-1 index.
    test['Survived'] = clf.predict(testX)
    test[['PassengerId','Survived']].to_csv(csv_name, index=False)

# Example #7
# 0
def main():
    """Train an ``ExtraTreesClassifier`` on the lemon data and print its stats.

    Reads ``lemon_training.csv`` and ``lemon_test.csv``, drops every column
    containing a missing value, and uses the columns reported by
    ``describe()`` (minus ``RefId`` and ``IsBadBuy``) as features.  The
    training file is split 70/30; confusion matrices, classification reports,
    accuracy and ROC AUC are printed for both parts.  Finally the model
    predicts the rows of the test file and the predictions are assembled into
    a ``submission`` frame.
    """
    ### Import data sets
    l_train = pd.read_csv('lemon_training.csv')
    l_test = pd.read_csv('lemon_test.csv')

    ### Clean/prepare data sets
    # NOTE(review): columns are dropped independently per file, so a feature
    # kept in l_train may be missing from l_test -- confirm the two files have
    # missing values in the same columns.
    l_train = l_train.dropna(axis=1)
    l_test = l_test.dropna(axis=1)

    features = list(l_train.describe().columns)
    features.remove('RefId')
    features.remove('IsBadBuy')

    ### Create test and training sets
    train_features = l_train[features].values
    train_class = l_train.IsBadBuy.values
    OSS_features = l_test[features].values

    # Seed PRNG
    np.random.seed(1234)
    X_train, X_test, y_train, y_test = \
        cross_validation.train_test_split(train_features, train_class, test_size=.3)

    ### Build model
    # model = naive_bayes.GaussianNB().fit(X_train, y_train)
    model = ExtraTreesClassifier(max_depth=8).fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # ROC curves need a continuous score per sample.  Feeding them the hard
    # class labels (as before) collapses the curve to a single threshold, so
    # use the predicted probability of the positive class (label 1) instead.
    positive_col = list(model.classes_).index(1)
    y_score_train = model.predict_proba(X_train)[:, positive_col]
    y_score_test = model.predict_proba(X_test)[:, positive_col]

    ### Stats
    # Single-argument print(...) gives the same output under Python 2 and 3.
    print('training:')
    print(metrics.confusion_matrix(y_train, y_pred_train))
    print(metrics.classification_report(y_train, y_pred_train))
    print('test:')
    print(metrics.confusion_matrix(y_test, y_pred_test))
    print(metrics.classification_report(y_test, y_pred_test))
    fpr_train, tpr_train, _ = metrics.roc_curve(y_train, y_score_train, pos_label=1)
    fpr_test, tpr_test, _ = metrics.roc_curve(y_test, y_score_test, pos_label=1)
    print('train MA:  %s' % model.score(X_train, y_train))
    print('test MA:  %s' % model.score(X_test, y_test))
    print('train AUC:  %s' % metrics.auc(fpr_train, tpr_train))
    print('test AUC:  %s' % metrics.auc(fpr_test, tpr_test))

    ### Do output predictions for OSS data
    y_pred_OSS = model.predict(OSS_features)
    # NOTE(review): `submission` is built but neither written nor returned
    # inside this function -- confirm whether a to_csv call is missing.
    submission = pd.DataFrame({ 'RefId' : l_test.RefId, 'prediction' : y_pred_OSS })
# Example #8
# 0
# Apply Some Featuring
poly_reg = PolynomialFeatures(degree=1)

# Transform into numpy object
x_train = poly_reg.fit_transform(X_train)
# Reuse the transformer fitted on the training data; the test split must only
# be transformed, never used for fitting.
X_test = poly_reg.transform(X_test)
# .iloc picks the first column by position; the .ix indexer used before was
# removed in pandas 1.0.
y_test = np.array(y_test.iloc[:, 0])
y_train = np.array(y_train.iloc[:, 0])

# Build model with good params
model = ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.6, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=4, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

# Fit the model
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Scoring: mean absolute error when `regression` is set, accuracy otherwise
if regression:
    print('Score on test set:', mean_absolute_error(y_test, y_pred))
else:
    print('Score on test set:', accuracy_score(y_test, y_pred))