# Evaluation script: cross-validates a random forest on a pickled training set.
# Usage: python <script> <train.pkl>
if len(sys.argv) <= 1:
    print("so what pkl do you want me to read, hm?")
    sys.exit()

# Candidate models; only randomForest is actually evaluated below.
randomForest = ensemble.RandomForestClassifier(
    verbose=False,
    n_estimators=80,
    min_samples_split=10,
    max_depth=14,
    bootstrap=False,
    n_jobs=16,
)
# NOTE(review): named "Regress" but constructs a *Classifier* — presumably
# ensemble.RandomForestRegressor was intended.  Left unchanged because this
# model is never used in the visible code; confirm intent before fixing.
randomForestRegress = ensemble.RandomForestClassifier(
    verbose=True,
    n_estimators=80,
    min_samples_split=10,
    max_depth=14,
    n_jobs=15,
)
gradBoost = ensemble.GradientBoostingClassifier(
    verbose=True,
    n_estimators=100,
    min_samples_split=10,
    max_depth=7,
)

# FIX: read the pickle in binary mode — the companion compaction script
# writes it with "wb", and text mode corrupts the stream on some platforms.
with open(sys.argv[1], "rb") as infile:
    train, _ = pickle.load(infile)

# Raw (uncompacted) data arrives as nested Python lists and needs cleaning;
# compacted data is already numpy arrays and is used as-is.
if isinstance(train[1][0], list):  # FIX: isinstance over type(...) == list
    ids, info, labels = kddutil.notrash(*train)
    info = kddutil.bound(info, max=10000, min=-10000)
else:
    ids, info, labels = train
    print("assuming compacted data; skip preprocessing")

print("Random Forest")
print(kddutil.evaluate_k(randomForest, ids, info, labels, fold=3))
# , postprocess=kddutil.disambiguate)
# print "Gradient Boosting"
# print kddutil.evaluate(gradBoost, ids, info, labels)
'''(docstring head truncated in this chunk)
- also removes duplicates, and infinities '''
import kddutil
import cPickle as pickle
import numpy as np
import sys

# Compaction script: reads a raw (train, test) pickle, cleans it via
# kddutil.notrash/bound, converts to compact numpy arrays
#   train -> (int32 ids, float16 features, bool labels)
#   test  -> (int32 ids, float16 features)
# and writes the result to a second pickle.
# Usage: python <script> <raw.pkl> <compact.pkl>
if len(sys.argv) <= 2:
    print("pickelino pickelouto")
    sys.exit()

print("reading")
with open(sys.argv[1], "rb") as infile:
    train, test = pickle.load(infile)

print("numpyifying")
# FIX: renamed local `id` -> `ids`; the original shadowed the `id` builtin.
ids, feat, lab = kddutil.notrash(*train)
feat = kddutil.bound(feat, min=-10000, max=+10000)
# float16 halves the feature memory footprint; labels fit in booleans.
# FIX: np.bool_ instead of the deprecated np.bool alias (identical dtype).
train = np.array(ids, np.int32), np.array(feat, np.float16), np.array(lab, np.bool_)

ids, feat = kddutil.notrash(*test)
feat = kddutil.bound(feat, min=-10000, max=+10000)
test = np.array(ids, np.int32), np.array(feat, np.float16)

print("writing")
with open(sys.argv[2], "wb") as outfile:
    pickle.dump((train, test), outfile)
, n_jobs=16 ) gradBoost = ensemble.GradientBoostingClassifier(verbose=True , n_estimators=100 , min_samples_split=10 , max_depth=5 ) classifier = randomForest with open(sys.argv[1]) as infile: train, test = pickle.load(infile) if type(train[1][0]) == list: train_ids, train_set, labels = kddutil.notrash(*train) test_ids, test_set = kddutil.notrash(*test) train_set = kddutil.bound(train_set, max=10000, min=-10000) test_set = kddutil.bound(test_set, max=10000, min=-10000) else: train_ids, train_set, labels = train test_ids, test_set = test print "assuming compacted data; skip preprocessing" classifier.fit(train_set, labels) predictions = classifier.predict_proba(test_set)[:,1] print "writing to output.csv" kddutil.write_csv(test_ids, predictions)