"""Prepare the cleaned SF-crime training data and build the feature matrix X / label vector y.

Reads 'train_clean.csv' if it exists, otherwise cleans 'train.csv' via the
project-local munge() helper (defined elsewhere in this project).
"""
import os
import time

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelBinarizer
# NOTE(review): sklearn.externals.joblib is removed in modern scikit-learn;
# prefer `import joblib` directly once the environment allows it.
from sklearn.externals import joblib

# Candidate feature columns considered at some point (kept for reference):
# ['DayOfWeekNo', 'PdDistrict_ID', 'Year', 'Month', 'Day', 'Hour',
#  'X_quan', 'Y_quan', 'Street1_ID', 'Street2_ID']

print('preparing training data...')

# Use the cached cleaned CSV when available; otherwise clean the raw dump.
if not os.path.exists('train_clean.csv'):
    train_raw = pd.read_csv('train.csv')
    train_prepared = munge(train_raw, 'train_clean.csv')
else:
    train_prepared = pd.read_csv('train_clean.csv')

# Columns excluded from the feature matrix: raw text fields, label columns,
# and already-encoded duplicates.
neg_fea = [u'X_quan', u'Y_quan', u'X', u'Y', u'Category', u'Descript',
           u'DayOfWeek', u'PdDistrict', u'Resolution', u'Category_ID',
           u'Address', u'PdDistrict_ID', u'DayOfWeekNo', u'Day', u'Month',
           u'Year', u'Hour', u'Dates']

y = train_prepared['Category_ID']
X = train_prepared.drop(neg_fea, 1)
# Drop the unnamed index column written by a prior to_csv round-trip.
X = X.drop(X.columns[0], 1)

# Free the memory held by the full cleaned frame; only X and y are needed now.
del train_prepared
train_prepared = None

print('Fitting...')
# NOTE(review): the first statements of this chunk arrived without their
# enclosing scope -- they look like the interior of a loop over data.columns
# inside a munge()-style cleaner. Reconstructed here as a standalone helper;
# confirm against the original file.
def _drop_matching_columns(data, cols_to_drop):
    """Drop, in place, every column of *data* whose name matches a pattern in cols_to_drop.

    Patterns are matched with re.match (anchored at the start of the column
    name); each column is dropped at most once.
    """
    for col in list(data.columns):
        for dc in cols_to_drop:
            if re.match(dc, col):
                data.drop(col, axis=1, inplace=True)
                break


def singly_predict(clf, trainX, trainY, testX):
    """Fit *clf* on the training data, predict *testX*, and persist the predictions.

    Delegates output to the project-local write_results() helper (defined
    elsewhere in this project); returns None.
    """
    clf.fit(trainX, trainY)
    results = clf.predict(testX)
    write_results(results)


# --- Hazard-dataset script setup: load cached cleaned CSVs or rebuild them ---
if os.path.isfile('train_clean.csv'):
    train_data = pd.read_csv('train_clean.csv')
else:
    train_data = munge('train')

if os.path.isfile('test_clean.csv'):
    test_data = pd.read_csv('test_clean.csv')
else:
    test_data = munge('test')

# Split target from features; keep test Ids for the submission file.
y = train_data['Hazard']
X = train_data.drop(['Id', 'Hazard'], axis=1)
Id = test_data['Id']
test_data.drop('Id', axis=1, inplace=True)

# Columns to drop (low-signal features identified previously).
Dropcols = ['T2_V10', 'T2_V7', 'T1_V13', 'T1_V10']