예제 #1
0
파일: xval_demo.py 프로젝트: squell/kdd2013
# Bail out when no pickle path was given on the command line.
if len(sys.argv) < 2:
    print("so what pkl do you want me to read, hm?")
    sys.exit()

# Forest without bootstrapping; this is the model evaluated further down.
randomForest = ensemble.RandomForestClassifier(
    verbose=False,
    n_estimators=80,
    min_samples_split=10,
    max_depth=14,
    bootstrap=False,
    n_jobs=16
)

# NOTE(review): despite the "Regress" in its name this is also built as a
# RandomForestClassifier -- confirm whether a regressor was intended.
randomForestRegress = ensemble.RandomForestClassifier(
    verbose=True,
    n_estimators=80,
    min_samples_split=10,
    max_depth=14,
    n_jobs=15
)

# Only referenced by the commented-out gradient boosting run at the end.
gradBoost = ensemble.GradientBoostingClassifier(
    verbose=True,
    n_estimators=100,
    min_samples_split=10,
    max_depth=7
)

# Load the two-part pickle named on the command line; only the first
# (training) part is used, the second is discarded.
# NOTE: unpickling can execute arbitrary code -- only feed this script pickle
# files from a trusted source.
# Binary mode is what pickle expects; text mode can corrupt the stream on
# platforms that translate line endings.
with open(sys.argv[1], "rb") as infile:
    train, _ = pickle.load(infile)

# A plain list as the first feature row means the data is still raw and has to
# go through kddutil.notrash / kddutil.bound (limits +-10000); anything else is
# taken to be already preprocessed.
if isinstance(train[1][0], list):
    ids, info, labels = kddutil.notrash(*train)
    info = kddutil.bound(info, max=10000, min=-10000)
else:
    ids, info, labels = train
    print("assuming compacted data; skip preprocessing")

print("Random Forest")
# Evaluate with fold=3. The original call had an extra argument,
# postprocess=kddutil.disambiguate, commented out; it stays disabled.
result = kddutil.evaluate_k(randomForest, ids, info, labels, fold=3)
print(result)

# Gradient boosting run, currently disabled:
# print "Gradient Boosting"
# print kddutil.evaluate(gradBoost, ids, info, labels)
예제 #2
0
   - also removes duplicates, and infinities
'''

import kddutil
import cPickle as pickle
import numpy as np
import sys

# Two arguments are needed: the pickle to read and the pickle to write.
if len(sys.argv) < 3:
    print("pickelino pickelouto")
    sys.exit()

print("reading")
# First command-line argument: pickle holding the (train, test) pair.
source_path = sys.argv[1]
with open(source_path, "rb") as infile:
    train, test = pickle.load(infile)

print("numpyifying")

# Both parts go through kddutil.notrash and kddutil.bound, then become compact
# numpy arrays. The +-10000 limits passed to bound fit inside float16's finite
# range (max 65504).

# Training part: ids, features and labels.
row_ids, features, row_labels = kddutil.notrash(*train)
features = kddutil.bound(features, min=-10000, max=+10000)
train = (
    np.array(row_ids, np.int32),
    np.array(features, np.float16),
    # Builtin bool instead of np.bool: the alias was deprecated in NumPy 1.20
    # and removed in 1.24; the resulting dtype is the same.
    np.array(row_labels, bool),
)

# Test part: ids and features only.
row_ids, features = kddutil.notrash(*test)
features = kddutil.bound(features, min=-10000, max=+10000)
test = (
    np.array(row_ids, np.int32),
    np.array(features, np.float16),
)

print("writing")
# Second command-line argument: where the converted pair is pickled to.
target_path = sys.argv[2]
with open(target_path, "wb") as outfile:
    pickle.dump((train, test), outfile)

예제 #3
0
파일: predict.py 프로젝트: squell/kdd2013
        , n_jobs=16
        )

# Alternative model; it is configured here but not the one selected below.
gradBoost = ensemble.GradientBoostingClassifier(
    verbose=True,
    n_estimators=100,
    min_samples_split=10,
    max_depth=5
)

# Model actually used for fitting and prediction.
classifier = randomForest

# Load the (train, test) pickle named on the command line.
# NOTE: unpickling can execute arbitrary code -- only feed this script pickle
# files from a trusted source.
# Binary mode is what pickle expects; text mode can corrupt the stream on
# platforms that translate line endings.
with open(sys.argv[1], "rb") as infile:
    train, test = pickle.load(infile)

# A plain list as the first training feature row means the data is still raw
# and has to go through kddutil.notrash / kddutil.bound (limits +-10000);
# anything else is taken to be already preprocessed.
if isinstance(train[1][0], list):
    train_ids, train_set, labels = kddutil.notrash(*train)
    test_ids, test_set = kddutil.notrash(*test)

    train_set = kddutil.bound(train_set, max=10000, min=-10000)
    test_set = kddutil.bound(test_set, max=10000, min=-10000)
else:
    train_ids, train_set, labels = train
    test_ids, test_set = test
    print("assuming compacted data; skip preprocessing")

# Train the selected model on the training set.
classifier.fit(train_set, labels)

# Score the test set and keep only column 1 of the probability matrix.
class_probabilities = classifier.predict_proba(test_set)
predictions = class_probabilities[:, 1]

print("writing to output.csv")
kddutil.write_csv(test_ids, predictions)