from pprint import pprint
import os
import csv
import cPickle as pickle

import numpy as np
import scipy as sp

from feature_label_builder import FeatureLabelBuilder, get_feature_keys
from feature_label_builder import data_path, full_path
from util_eval import multiclass_log_loss, multiclass_accuracy
import xgboost as xgb

# Load the cached feature matrix and label vector.
# Use a context manager so the file handle is closed (the original leaked it).
with open(full_path("train_xs_ys_np.p"), "rb") as f:
    (all_xs, all_ys) = pickle.load(f)
# Labels are stored 1-based on disk; shift to 0-based for the classifier.
all_ys = all_ys - 1.0

# Hold out 10% of the data for evaluation.
test_percentage = 0.1
all_num = len(all_ys)
train_num = int(round((1.0 - test_percentage) * all_num))
test_num = all_num - train_num

# Deterministic shuffle (fixed seed) so the split is reproducible.
# np.arange replaces the slower np.array(range(...)).
random_idx = np.arange(all_num)
np.random.seed(0)
np.random.shuffle(random_idx)

# Training slice of the shuffled data (test slice presumably follows
# beyond this chunk — TODO confirm).
train_xs = all_xs[random_idx][:train_num]
from dateutil import parser as dateparser
import datetime, time
# Fix: pickle is used below but was never imported in this script.
import cPickle as pickle

import numpy as np
import scipy as sp

from feature_label_builder import FeatureLabelBuilder, get_feature_keys
from feature_label_builder import data_path, full_path

# One-off conversion of PostCreationDate strings to unix timestamps;
# kept for reference, the result is already cached in post_times_37.p.
# reader = csv.DictReader(open(os.path.join(data_path, 'train.csv')))
# post_times = [int(time.mktime(dateparser.parse(datum['PostCreationDate']).timetuple()))
#               for datum in reader]
# post_times = np.array(post_times)
# pickle.dump(post_times, open(full_path("post_times_37.p"), "wb"),
#             protocol=pickle.HIGHEST_PROTOCOL)

# Load cached post timestamps; context manager closes the file handle
# (the original leaked it).
with open(full_path("post_times_37.p"), "rb") as f:
    post_times = np.array(pickle.load(f))
# Permutation that orders the samples chronologically.
sort_arg = np.argsort(post_times)


def is_sorted(l):
    """Return True if sequence l is in non-decreasing order."""
    return all(l[i] <= l[i + 1] for i in xrange(len(l) - 1))


# Sanity check: raw order vs. argsort-applied order.
print(is_sorted(post_times))
print(is_sorted(post_times[sort_arg]))

with open(full_path("train_37_xs_ys_np.p"), "rb") as f:
    (train_xs, train_ys) = pickle.load(f)
# Reorder features/labels chronologically and cache the sorted dataset.
train_xs = train_xs[sort_arg]
train_ys = train_ys[sort_arg]
with open(full_path("train_37_xs_ys_np_sorted.p"), "wb") as f:
    pickle.dump((train_xs, train_ys), f,
                protocol=pickle.HIGHEST_PROTOCOL)
import os
import csv
import cPickle as pickle

import numpy as np
import scipy as sp

from feature_label_builder import FeatureLabelBuilder, get_feature_keys
from feature_label_builder import data_path, full_path
from util_eval import multiclass_log_loss, multiclass_accuracy
import xgboost as xgb
import itertools

# Load the chronologically sorted dataset; context manager closes the
# file handle (the original leaked it).
with open(full_path("train_37_xs_ys_np_sorted.p"), "rb") as f:
    (all_xs, all_ys) = pickle.load(f)
# Labels are stored 1-based on disk; shift to 0-based for the classifier.
all_ys = all_ys - 1.0

# Hold out the most recent 10% as the test set.  Because the data is
# time-sorted, slicing (rather than shuffling) gives a temporal split:
# train on the past, test on the future.
test_percentage = 0.1
all_num = len(all_ys)
train_num = int(round((1. - test_percentage) * all_num))
test_num = all_num - train_num

train_xs = all_xs[:train_num]
train_ys = all_ys[:train_num]
test_xs = all_xs[train_num:]
test_ys = all_ys[train_num:]

# setup param grid
from __future__ import print_function from pprint import pprint import os import csv import cPickle as pickle import numpy as np import scipy as sp import sklearn from sklearn.ensemble import GradientBoostingClassifier as GBC from feature_label_builder import FeatureLabelBuilder, get_feature_keys from feature_label_builder import data_path, full_path (train_xs, train_ys) = pickle.load(open(full_path("train_xs_ys_np.p"), "rb")) classifier = GBC(n_estimators=500, learning_rate=0.15, subsample=0.001, max_features='auto', min_samples_leaf=9, verbose=1) classifier.fit(train_xs, train_ys) pickle.dump(classifier, open(full_path("classifier_500_0.1_0.001_9.p"), "wb")) # keys = get_feature_keys() # for key, val in zip(keys, classifier.feature_importances_): # print(key, val)