import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier as rfc

# load data into pandas data frames
trdata, testdata = mg.loadData()

# get the ids for the test set
testid = np.array(testdata.UserID)
testdata = testdata.drop('UserID', axis=1)

# initialize classifier; sweep tree depth and report out-of-bag accuracy
depthlist = [3, 5, 10, 15, 20, 50, 100]
for i in depthlist:
    model = rfc(n_estimators=10, oob_score=True, max_features=None, max_depth=i)
    model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
    accur = model.oob_score_
    print('Out of Bag accuracy: %f \n' % accur)

# generate predictions (probability of the positive class, using the last model)
preds = np.array(model.predict_proba(testdata))[:, 1]
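# The depth sweep above prints each OOB score but keeps only the last model;
# a minimal sketch of tracking the best depth instead (this selection step is
# an assumption, not part of the original script):
best_acc = -np.inf
best_depth = None
for d in depthlist:
    m = rfc(n_estimators=10, oob_score=True, max_features=None, max_depth=d)
    m.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
    if m.oob_score_ > best_acc:
        best_acc, best_depth = m.oob_score_, d
print('Best depth %d with OOB accuracy %f' % (best_depth, best_acc))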
import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier as rfc

'''
Use random forest classifier to predict Titanic survivors.
Uses training data in train.csv (found in data subfolder),
predicts from test.csv, and writes out to .csv in the predictions subfolder.
As is, this gives ~77% accuracy on the test set;
it can hit ~79% with some tweaking (currently overfits).
'''

# load data into pandas data frames
trdata, testdata = mg.loadData()

# get the ids for the test set
testid = np.array(testdata.PassengerId)

# determine if each passenger has a known surviving family member
trdata, testdata = mg.addFamSurvivors(trdata, testdata)

# munge the data to generate one-hot labels for gender, titles, ticket departments
trdata = mg.mungeData(trdata)
testdata = mg.mungeData(testdata)

# initialize classifier
# (compute_importances was removed from sklearn; feature_importances_
# is always available after fitting)
model = rfc(n_estimators=1000, oob_score=True)
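# Continuation sketch: the docstring above says the script fits, predicts,
# and writes a csv to the predictions subfolder. The exact file name, column
# layout, and the label sitting in column 0 are assumptions here.
import pandas as pd

model = model.fit(trdata.iloc[:, 1:], trdata.iloc[:, 0])
print('Out of Bag accuracy: %f' % model.oob_score_)
preds = model.predict(testdata)
pd.DataFrame({'PassengerId': testid,
              'Survived': preds.astype(int)}).to_csv(
    'predictions/titanic_preds.csv', index=False)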
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
# from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline
import mungetools as mg

n_estimator = 10

# X, y = make_classification(n_samples=10)
# print(X)
# print(y)
trdata = mg.loadData()
X_train, X_test, y_train, y_test = (trdata.iloc[:500, 1:], trdata.iloc[500:, 1:],
                                    trdata.iloc[:500, 0], trdata.iloc[500:, 0])

# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = (X_train.iloc[:250], X_train.iloc[250:],
                                            y_train.iloc[:250], y_train.iloc[250:])
print(X_train)
print(y_train)

# Unsupervised transformation based on totally random trees
# (the fragment cut off mid-call; n_estimators filled in from the
# n_estimator variable defined above)
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator)
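# Continuation sketch of the tree-embedding -> logistic-regression pipeline
# the imports suggest; this mirrors scikit-learn's "feature transformations
# with ensembles of trees" example, and the exact steps are an assumption:
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)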
        meanscore = np.mean(scorei)
        scores[counter] = meanscore
        paramholder[counter, 0] = c
        paramholder[counter, 1] = g
        # compare mean CV score (scorei may be a per-fold array)
        if meanscore > bestscore:
            bestscore = meanscore
            bestmodel = model
        counter += 1
        print('Score = %f with c: %f, g: %f' % (meanscore, c, g))
    bestc = paramholder[scores.argmax(), 0]
    bestg = paramholder[scores.argmax(), 1]
    print('Best score of %f with c: %f, g: %f' % (bestscore, bestc, bestg))
    return bestmodel

trdata = mg.loadData()
# testid = np.array(testdata.UserID)
# trdata = trdata.drop(['coursecount'], axis=1)
# testdata = testdata.drop(['UserID'], axis=1)
# print(trdata)
# print(testdata)

# initialize classifier:
# try several values of c (prediction error weight) and g (kernel width)
testc = [0.05, 0.1, 0.3, 0.6, 1, 3, 5, 10]
testg = [0, 0.01, 0.05, 0.1, 0.5, 1, 1.5]
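# The head of the grid-search function is missing from this fragment; a
# minimal sketch of how it presumably iterates, assuming an RBF SVM and
# 5-fold cross-validation (both assumptions; note that gamma=0 meant
# 'auto' in older scikit-learn, while modern versions require gamma > 0):
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def gridsearch_sketch(trdata, testc, testg):
    bestscore, bestmodel = -np.inf, None
    for c in testc:
        for g in testg:
            model = SVC(C=c, gamma=g)
            scorei = cross_val_score(model, trdata.iloc[:, 1:],
                                     trdata.iloc[:, 0], cv=5)
            if np.mean(scorei) > bestscore:
                bestscore, bestmodel = np.mean(scorei), model
    return bestmodel

# usage: bestmodel = gridsearch_sketch(trdata, testc, testg)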
import numpy as np
import mungetools as mg
from sklearn.ensemble import RandomForestClassifier

# parse feature names out of the header lines of the feature file
# (file_data is the already-read file contents, opened earlier in the script)
features_list = []
for line in file_data:
    if '@' in line and '{' in line:
        feature = line.split()[1]
        features_list.append(feature)
# print(features_list)
features_list = np.asarray(features_list)
print(type(features_list))

input_df = mg.loadData()
X = input_df.values[:, 1:]
y = input_df.values[:, 0]

# weight one class (s == 0) at 0.75 relative to the other
survived_weight = .75
y_weights = np.array([survived_weight if s == 0 else 1 for s in y])

print("Rough fitting a RandomForest to determine feature importance...")
forest = RandomForestClassifier(oob_score=True, n_estimators=10)
forest.fit(X, y, sample_weight=y_weights)
feature_importance = forest.feature_importances_

# scale importances so the largest is 100
feature_importance = 100.0 * (feature_importance / feature_importance.max())
print(feature_importance)

# keep features whose scaled importance clears the threshold
fi_threshold = 30
important_idx = np.where(feature_importance > fi_threshold)[0]
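# A minimal sketch of the step that typically follows the thresholding above:
# keep only the important columns and refit. This next step is an assumption,
# not shown in the fragment.
X_important = X[:, important_idx]
forest = RandomForestClassifier(oob_score=True, n_estimators=100)
forest.fit(X_important, y, sample_weight=y_weights)
print('OOB score on %d important features: %f'
      % (len(important_idx), forest.oob_score_))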