# Example #1
import os
import re
import time

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelBinarizer

#features = ['DayOfWeekNo', 'PdDistrict_ID', 'Year', 'Month', 'Day', 'Hour', 'X_quan', 'Y_quan', 'Street1_ID', 'Street2_ID']

print('preparing training data...')

# Use the cached cleaned CSV when it exists; otherwise rebuild it from the
# raw data (munge presumably also writes 'train_clean.csv' for the next
# run — TODO confirm against munge's definition).
if not os.path.exists('train_clean.csv'):
    train_raw = pd.read_csv('train.csv')
    train_prepared = munge(train_raw, 'train_clean.csv')
else:
    train_prepared = pd.read_csv('train_clean.csv')

# Columns excluded from the feature matrix: the label and its encoded
# variant, plus raw/duplicated fields that already have engineered
# counterparts elsewhere in the frame.
neg_fea = [u'X_quan', u'Y_quan', u'X', u'Y', u'Category', u'Descript', u'DayOfWeek', u'PdDistrict', u'Resolution', u'Category_ID',
           u'Address', u'PdDistrict_ID', u'DayOfWeekNo', u'Day', u'Month', u'Year', u'Hour', u'Dates']

y = train_prepared['Category_ID']           # target labels
X = train_prepared.drop(neg_fea, axis=1)    # feature matrix
X = X.drop(X.columns[0], axis=1)            # drop the leading (unnamed index) column

# Release the full prepared frame before fitting to free memory; rebinding
# to None drops the last reference just as `del` + rebind did.
train_prepared = None

print('Fitting...')
# Example #2
        for dc in cols_to_drop:
            if re.match(dc, col):
                data.drop(col, axis=1, inplace=True)
                break


def singly_predict(clf, trainX, trainY, testX):
    """Fit a single classifier and persist its test-set predictions.

    Parameters
    ----------
    clf : estimator with ``fit``/``predict`` (e.g. an sklearn classifier)
    trainX : training feature matrix
    trainY : training labels
    testX : feature matrix to predict on

    Returns
    -------
    The prediction array (also written out via ``write_results``).
    Previously the predictions were discarded after writing; returning
    them is backward-compatible and lets callers reuse them.
    """
    clf.fit(trainX, trainY)
    results = clf.predict(testX)
    write_results(results)
    return results


def _load_clean(stem):
    """Return the cleaned DataFrame for *stem* ('train' or 'test').

    Reads the cached '<stem>_clean.csv' when it exists; otherwise
    rebuilds it from the raw data via ``munge(stem)``.
    """
    clean_path = stem + '_clean.csv'
    if os.path.isfile(clean_path):
        return pd.read_csv(clean_path)
    return munge(stem)


# The identical cache-or-munge logic was duplicated for train and test;
# both now go through the single helper above.
train_data = _load_clean('train')
test_data = _load_clean('test')


y = train_data['Hazard']                          # regression target
X = train_data.drop(['Id', 'Hazard'], axis=1)     # feature matrix

# Keep the test ids (presumably for the submission file — confirm against
# the writer downstream), then remove them from the test features.
Id = test_data['Id']
test_data.drop('Id', axis=1, inplace=True)

# Feature columns slated for removal by the drop logic elsewhere in the file.
Dropcols = ['T2_V10', 'T2_V7', 'T1_V13', 'T1_V10']