def get_data():
    tickets_file = csv.reader(open('2012-10-09.close.csv'))
    time_format = '%Y-%m-%d %H:%M:%S'
    tickets = []
    times = []
    reporters = []
    subjects = []
    for number, created, changetime, closetime, reporter, summary, status, \
            owner, tkt_type, component, description in tickets_file:
        row = []
        created = dt.datetime.strptime(created, time_format)
        closetime = dt.datetime.strptime(closetime, time_format)
        changetime = dt.datetime.strptime(changetime, time_format)
        time_to_fix = closetime - created
        row.append(float(number))
        row.append(float(time.mktime(created.timetuple())))
        tickets.append(row)
        times.append(total_seconds(time_to_fix))
        reporters.append(reporter)
        subjects.append(summary)
    scaler = preprocessing.Scaler().fit(np.array(tickets))
    tickets = sp.csr_matrix(scaler.transform(tickets))
    tickets = sp.hstack((tickets, TfidfTransformer().fit_transform(
        CountVectorizer().fit_transform(reporters))))
    tickets = sp.hstack((tickets, TfidfTransformer().fit_transform(
        CountVectorizer(ngram_range=(1, 3)).fit_transform(subjects))))
    scaler = preprocessing.Scaler(with_mean=False).fit(tickets)
    tickets = scaler.transform(tickets)
    return tickets, times
def train_ensemble_adjective_classifier(train_feature_objects, adjective,
                                        classifiers, scalers,
                                        feature_name_list):
    '''
    Given the built classifiers for the 5 motions, train the single joined
    classifier.

    Returns a single classifier with its associated scaler.

    IMPORTANT: the feature vector of probabilities needs to be created in a
    specific order: 'tap', 'squeeze', 'thermal_hold', 'slide', 'slide_fast'
    '''
    # Pull out the features
    probability_vector, probability_labels, object_ids, weights = build_ensemble_feature_vector(
        train_feature_objects, adjective, classifiers, scalers,
        feature_name_list)

    # Create scaler for the features
    scaler = preprocessing.Scaler().fit(probability_vector)

    # Train a single SVM
    svm = train_svm(probability_vector, probability_labels, object_ids)

    return (svm, scaler, weights)
def get_data():
    tickets_file = csv.reader(open('2012-10-09.close.csv'))
    tickets = []
    times = []
    time_format = '%Y-%m-%d %H:%M:%S'
    for number, created, changetime, closetime, reporter, summary, status, \
            owner, tkt_type, component, description in tickets_file:
        row = []
        created = dt.datetime.strptime(created, time_format)
        closetime = dt.datetime.strptime(closetime, time_format)
        changetime = dt.datetime.strptime(changetime, time_format)
        time_to_fix = closetime - created
        row.append(float(number))
        row.append(float(time.mktime(created.timetuple())))
        tickets.append(row)
        times.append(total_seconds(time_to_fix))
    scaler = preprocessing.Scaler().fit(np.array(tickets))
    tickets = scaler.transform(tickets)
    return tickets, times
def full_ensemble_train(train_feature_vector_dict, train_adjective_dict,
                        test_feature_vector_dict, test_adjective_dict):
    """
    Train one ensemble SVM per adjective and write its classification
    report to a text file. Returns a dict of trained classifiers keyed
    by adjective.
    """
    # Open text file for storing classification reports
    ensemble_report_file = open("Full_Ensemble_Report.txt", "w")

    all_ensemble_classifiers = dict()

    # For all adjectives
    for adj in train_adjective_dict:

        # Create ensemble scaler
        scaler = preprocessing.Scaler().fit(train_feature_vector_dict[adj])

        # Run SVM
        ensemble_svm, ensemble_proba, ensemble_score, ensemble_report = train_svm(
            train_feature_vector_dict[adj], train_adjective_dict[adj],
            test_feature_vector_dict[adj], test_adjective_dict[adj], scaler,
            cv_flag=False)

        all_ensemble_classifiers[adj] = ensemble_svm

        # Write classification reports into text file
        ensemble_report_file.write('Adjective: ' + adj + '\n')
        ensemble_report_file.write(ensemble_report)
        ensemble_report_file.write('\n\n')

    return all_ensemble_classifiers
def create_and_save_scaler(data):
    """Create a scaler for the given data and save it to the disk."""
    scaler = preprocessing.Scaler().fit(data)
    create_classifier_dir()
    joblib.dump(
        scaler,
        os.path.join(classifiers_dir, scaler.__class__.__name__ + '.pkl'))
    return scaler
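# Hedged usage sketch (not part of the original snippet above): assuming the
# same `classifiers_dir` module variable and the joblib import used there,
# the pickled scaler could later be restored and reused. The filename
# 'Scaler.pkl' matches scaler.__class__.__name__ + '.pkl' from the save step.
def load_saved_scaler():
    return joblib.load(os.path.join(classifiers_dir, 'Scaler.pkl'))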
def create_scalers(train_feature_vector_dict):
    """
    Takes in the training feature vector dictionary, generates a scaler
    for each motion, and then returns the scalers
    """
    scaler_dict = dict()
    for motion_name in train_feature_vector_dict:
        scaler_dict[motion_name] = preprocessing.Scaler().fit(
            train_feature_vector_dict[motion_name][0])

    return scaler_dict
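# Hedged companion sketch (assumed, not from the original file): the
# per-motion scalers returned by create_scalers would typically be applied to
# a feature-vector dictionary with the same layout before classification.
def scale_feature_vectors(feature_vector_dict, scaler_dict):
    scaled = dict()
    for motion_name in feature_vector_dict:
        # index [0] mirrors how create_scalers reads the feature matrix
        scaled[motion_name] = scaler_dict[motion_name].transform(
            feature_vector_dict[motion_name][0])
    return scaled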
def init_classifier(self, filename):
    """Unpickle svm training data, train classifier"""
    with open(filename, 'rb') as f:
        svm_data = pickle.load(f)
    labels = svm_data['labels']
    data = svm_data['data']
    scaler = pps.Scaler().fit(data)
    data_scaled = scaler.transform(data)
    classifier = svm.SVC()
    classifier.fit(data_scaled, labels)
    return (scaler, classifier)
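# Hedged usage sketch (assumed, not from the original class): the
# (scaler, classifier) pair returned by init_classifier is meant to be used
# together, scaling a new sample with the training statistics before predicting.
def classify_sample(scaler, classifier, sample):
    # sample: a 1-D feature vector with the same layout as the training rows
    return classifier.predict(scaler.transform([sample]))[0]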
def est_gradient_decsent(self):
    iris = datasets.load_iris()
    X = iris.data
    scaler = pre.Scaler()
    X = scaler.fit_transform(X)
    y = self.all_to_sparse(iris.target, max(iris.target) + 1)
    X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(
        np.array(X), np.array(y))
    thetas, costs, val_costs = neur.gradient_decent(
        np.array(X), np.array(y), np.array(X_val), np.array(y_val))
def train_weak_classifier_adjective(train_feature_objects, adjective,
                                    feature_dictionary):
    '''
    takes in a dictionary of all features

    returns a dictionary of weak classifiers for each feature
    '''
    # specify feature to be extracted
    feature_name_list = [
        "pdc_rise_count", "pdc_area", "pdc_max", "pac_energy", "pac_sc",
        "pac_sv", "pac_ss", "pac_sk", "tac_area", "tdc_exp_fit",
        "gripper_min", "gripper_mean", "transform_distance",
        "electrode_polyfit"
    ]

    # store weak svms
    svm_motion_store = dict()
    # store scalers
    scaler_motion_store = dict()
    # store classifiers
    classifiers = dict()

    # for each motion (slide, squeeze, etc.)
    for motion in train_feature_objects:
        motion_train_set = train_feature_objects[motion]

        # pull out the features specified as a vector
        train_feature_vector, train_label_dict = utilities.feature_obj_2_feature_vector(
            motion_train_set, feature_name_list)

        # create scaler
        scaler_motion_store[motion] = preprocessing.Scaler().fit(
            train_feature_vector)
        train_feature_vector_scaled = scaler_motion_store[motion].transform(
            train_feature_vector)

        params = {
            'n_estimators': 1000,
            'max_depth': 4,
            'min_samples_split': 1,
            'learn_rate': 0.01,
            'loss': 'deviance'
        }
        #params = {'n_estimators': 1, 'max_depth': 1, 'min_samples_split': 1,'learn_rate': 0.1, 'loss': 'deviance'}

        clf = ensemble.GradientBoostingClassifier(**params)
        clf.fit(train_feature_vector_scaled, train_label_dict[1][adjective])
        #clf = train_gradient_boost(train_feature_vector_scaled, train_label_dict[1][adjective], train_label_dict[0])

        classifiers[motion] = clf

    return (classifiers, scaler_motion_store)
def __init__(self, product_cluster_center, category_cluster_center,
             product_cluster_50pc_dist, product_cluster_80pc_dist,
             category_cluster_50pc_dist, category_cluster_80pc_dist,
             scaler_mean, scaler_std):
    '''
    Constructor
    '''
    self.clusters_properties = {
        "prod": [product_cluster_center, product_cluster_50pc_dist,
                 product_cluster_80pc_dist],
        "cat": [category_cluster_center, category_cluster_50pc_dist,
                category_cluster_80pc_dist]
    }
    scaler = preprocessing.Scaler()
    scaler.mean_ = scaler_mean
    scaler.std_ = scaler_std
    self.scaler = scaler
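# Hedged sketch (not part of the original class): scaler_mean and scaler_std
# above correspond to the mean_ and std_ attributes of a Scaler fitted
# elsewhere, so the constructor can rebuild an equivalent scaler without
# keeping the training data around. They could be produced like this:
def extract_scaler_params(training_matrix):
    fitted = preprocessing.Scaler().fit(training_matrix)
    return fitted.mean_, fitted.std_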
def mode_pca():
    rows, head = load_rows()
    X, Y, x_head, train, test = rows_to_predictor_response(rows, head)
    pca = decomposition.PCA(n_components=13)
    re_pipeline = pipeline.Pipeline([
        ('scaler', preprocessing.Scaler()),
        ('pca', pca)
    ])
    pc = re_pipeline.fit_transform(X[train])
    churn = Y[train, 0] > 0.5
    for i in range(1, 13):
        plt.title('PCA scores')
        plt.xlabel('pc[0]')
        plt.ylabel('pc[' + str(i) + ']')
        plt.plot(pc[:, 0], pc[:, i], 'go')
        plt.plot(pc[churn, 0], pc[churn, i], 'ro')
        plt.savefig("out/churn_scores_0_" + str(i) + ".png")
        plt.cla()
    loadings = pca.components_
    for i in range(1, 13):
        plt.title('PCA loadings')
        plt.xlabel('pc[0]')
        plt.ylabel('pc[' + str(i) + ']')
        plt.plot(loadings[0], loadings[i], 'go')
        for j, l in enumerate(loadings[[0, i]].T):
            plt.annotate(x_head[j], l)  # -(j%3)*0.02
            #print x_head[i],l
        plt.savefig("out/churn_loadings_0_" + str(i) + ".png")
        plt.cla()
    y_scaler = preprocessing.Scaler(with_std=False)
    linre = linear_model.LinearRegression()
    linre.fit(X=pc, y=y_scaler.fit_transform(Y[train, 0]))
    y_pred = y_scaler.inverse_transform(
        linre.predict(re_pipeline.transform(X[test])))
    plot_rocs('pca', 'b-', Y[test, 0], y_pred)
def __prepare(self, features_lists):
    new_features_lists = []
    for fl in features_lists:
        nfl = [float(fl[0]), float(fl[1]), float(fl[2]), float(fl[3]),
               float(fl[8]), float(fl[9]), float(fl[10]), float(fl[11])]
        new_features_lists.append(nfl)
    scaler = preprocessing.Scaler().fit(new_features_lists)
    print "mean", scaler.mean_
    print "std", scaler.std_
    self.scaler = scaler
    return scaler.transform(new_features_lists), scaler.mean_, scaler.std_
def train(docs, query2docs, label_map):
    scaler = preprocessing.Scaler().fit([extractFeatures(doc) for doc in docs])
    # scaler.transform will standardize the data
    X_train = []
    y_train = []
    for query in query2docs:
        qdocs = query2docs[query]
        features = scaler.transform([extractFeatures(doc) for doc in qdocs])
        for i, j in itertools.permutations(range(len(qdocs)), 2):
            doc_i = qdocs[i]
            doc_j = qdocs[j]
            label = cmp(label_map[doc_i.query][doc_i.url],
                        label_map[doc_j.query][doc_j.url])
            if label != 0:
                X_train.append(vec_difference(features[i], features[j]))
                y_train.append(label)
    model = svm.SVC(kernel='linear', C=3.0).fit(X_train, y_train)
    return scaler, model
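# Hedged usage sketch (assumed, not from the original file): with a linear
# kernel, the SVM trained on pairwise feature differences exposes a weight
# vector model.coef_[0]; candidate documents for a query can then be ranked
# by the dot product of that vector with their scaled feature vectors
# (higher score = predicted more relevant, given the cmp-based labels above).
import numpy as np

def rank_docs(scaler, model, qdocs):
    features = scaler.transform([extractFeatures(doc) for doc in qdocs])
    scores = np.dot(features, model.coef_[0])
    order = sorted(range(len(qdocs)), key=lambda k: scores[k], reverse=True)
    return [qdocs[k] for k in order]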
def basic_gradient_descent():
    digits = datasets.load_digits()
    # iris = datasets.load_iris()
    X = digits.images.reshape((digits.images.shape[0], -1))
    scaler = pre.Scaler()
    X = scaler.fit_transform(X)
    y = ut.all_to_sparse(digits.target, max(digits.target) + 1)
    X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(
        np.array(X), np.array(y), "basic_grad_descent_digits")
    X_val = np.vstack([X_val, X_test])
    y_val = np.vstack([y_val, y_test])
    thetas, costs, val_costs = neur.gradient_decent_gen(
        izip(neur.mini_batch_generator(X, 10),
             neur.mini_batch_generator(y, 10)),
        #hidden_layer_sz = 11,
        hidden_layer_sz=100,
        iter=1000,
        wd_coef=0.0,
        learning_rate=0.1,
        momentum_multiplier=0.9,
        rand_init_epsilon=0.012,
        do_early_stopping=True,
        #do_dropout = True,
        #dropout_percentage = 0.8,
        #do_learning_adapt = True,
        X_val=np.array(X_val),
        y_val=np.array(y_val))
    h_x, a = neur.forward_prop(X_test, thetas)
    binary_result = ut.map_to_max_binary_result(h_x)
    print "percentage correct predictions: ", ut.percent_equal(
        binary_result, y_test)
    print "training error:", costs[-1:][0]
    print "validation error:", val_costs[-1:][0]
    print "lowest validation error:", min(val_costs)
    plt.plot(costs, label='cost')
    plt.plot(val_costs, label='val cost')
    plt.legend()
    plt.ylabel('error rate')
    plt.show()
def basic_gradient_descent():
    digits = datasets.load_digits()
    # iris = datasets.load_iris()
    X = digits.images.reshape((digits.images.shape[0], -1))
    scaler = pre.Scaler()
    X = scaler.fit_transform(X)
    y = ut.all_to_sparse(digits.target, max(digits.target) + 1)
    X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(
        gpu.as_garray(X), gpu.as_garray(y), "digits")
    X_val = gpu.concatenate([X_val, X_test])
    y_val = gpu.concatenate([y_val, y_test])
    thetas, costs, val_costs = neur.gradient_decent(
        gpu.as_garray(X), gpu.as_garray(y),
        #hidden_layer_sz = 11,
        hidden_layer_sz=45,
        iter=500,
        wd_coef=0.0,
        learning_rate=0.25,
        momentum_multiplier=0.9,
        rand_init_epsilon=0.012,
        do_early_stopping=True,
        #do_dropout = True,
        dropout_percentage=0.7,
        #do_learning_adapt = True,
        X_val=gpu.as_garray(X_val),
        y_val=gpu.as_garray(y_val))
    h_x, a = neur.forward_prop(X_test, thetas)
    h_x = map(lambda x: x.as_numpy_array(), h_x)
    print "percentage correct predictions: ", ut.percent_equal(
        ut.map_to_max_binary_result(h_x), y_test.as_numpy_array())
    print "training error:", costs[-1:][0]
    print "validation error:", val_costs[-1:][0]
    print "lowest validation error:", min(val_costs)
    plt.plot(costs, label='cost')
    plt.plot(val_costs, label='val cost')
    plt.legend()
    plt.ylabel('error rate')
def main():
    dat = pd.read_table('data/train_v2.csv', sep=',')
    print "reading done, train"
    loss = np.asarray(dat.loss)
    dat = dat.drop(['loss', 'id'], 1)
    dat['new1'] = dat['f528'] - dat['f527']  # golden feature 1
    dat['new2'] = dat['f528'] - dat['f274']  # golden feature 2
    dat = np.asarray(dat.values, dtype=float)
    col_med = stats.nanmedian(dat, axis=0)
    print "calculated medians, train"
    inds = np.where(np.isnan(dat))
    dat[inds] = np.take(col_med, inds[1])
    print "median imputation done, train"
    scaler = preprocessing.Scaler().fit(dat)
    dat = scaler.transform(dat)
    print "scaling done, train"
    labels = (loss > 0).astype(int)
    np.save('data/x_train.npy', dat)
    np.save('data/y_train.npy', labels)
    np.save('data/loss.npy', loss)
    print "trainset done"

    dat = pd.read_table('data/test_v2.csv', sep=',')
    print "reading done, test"
    ids = np.asarray(dat.id)
    dat = dat.drop(['id'], 1)
    dat['new1'] = dat['f528'] - dat['f527']  # golden feature 1
    dat['new2'] = dat['f528'] - dat['f274']  # golden feature 2
    dat = np.asarray(dat.values, dtype=float)
    col_med = stats.nanmedian(dat, axis=0)
    print "calculated medians, test"
    inds = np.where(np.isnan(dat))
    dat[inds] = np.take(col_med, inds[1])
    print "imputation done, test"
    dat = scaler.transform(dat)
    print "scaling done, test"
    np.save('data/x_test.npy', dat)
    np.save('data/ids.npy', ids)
    print "testset done"
def basic_iris():
    iris = datasets.load_iris()
    scaler = pre.Scaler()
    X = scaler.fit_transform(iris.data)
    y = ut.all_to_sparse(iris.target, max(iris.target) + 1)
    X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(
        np.array(X), np.array(y), "iris")
    X_val = np.vstack([X_val, X_test])
    y_val = np.vstack([y_val, y_test])
    thetas, costs, val_costs = neur.gradient_decent(
        np.array(X), np.array(y),
        #hidden_layer_sz = 11,
        hidden_layer_sz=20,
        iter=8000,
        wd_coef=0.0,
        learning_rate=0.07,
        momentum_multiplier=0.3,
        rand_init_epsilon=0.12,
        do_early_stopping=True,
        #do_dropout = True,
        dropout_percentage=0.9,
        do_learning_adapt=True,
        X_val=np.array(X_val),
        y_val=np.array(y_val))
    h_x, a = neur.forward_prop(X_test, thetas)
    print "percentage correct predictions: ", ut.percent_equal(
        ut.map_to_max_binary_result(h_x), y_test)
    print "training error:", costs[-1:][0]
    print "validation error:", val_costs[-1:][0]
    print "lowest validation error:", min(val_costs)
    plt.plot(costs, label='cost')
    plt.plot(val_costs, label='val cost')
    plt.legend()
    plt.ylabel('error rate')
    plt.show()
def train_weak_classifier_motion(motion_train_set, adjective,
                                 feature_dictionary):
    '''
    Takes the feature_object_train_set and trains the specified feature.

    Will return a single trained SVM
    '''
    # Store SVM for each feature
    svm_store = dict()
    scaler_store = dict()

    import pdb
    pdb.set_trace()

    # For each feature set (pdc, pac, etc.)
    for feature in feature_dictionary:

        # Pull out the list of features
        feature_list = feature_dictionary[feature]

        # Pull out the features specified as a vector
        train_feature_vector, train_label_dict = utilities.feature_obj_2_feature_vector(
            motion_train_set, feature_list)

        # Create scaler
        scaler_store[feature] = preprocessing.Scaler().fit(
            train_feature_vector)
        train_feature_vector_scaled = scaler_store[feature].transform(
            train_feature_vector)

        # Train the SVM
        svm_store[feature] = train_svm(train_feature_vector_scaled,
                                       train_label_dict[1][adjective],
                                       train_label_dict[0])

    return (svm_store, scaler_store)
def train_scaler(self, data):
    '''Work out the mean and variance of the samples'''
    from sklearn import preprocessing
    scaler = preprocessing.Scaler().fit(data.images)
    self.transform = scaler.transform
def basic_gradient_descent():
    data = np.genfromtxt('./stack_data_wide_val.csv', delimiter=',')
    X = data[:, :-1]
    y = data[:, -1:]
    scaler = pre.Scaler()
    X_val = scaler.fit_transform(X)
    y_val = np.array(map(lambda x: [0, 1] if x == 0 else [1, 0], y.flatten()))
    #X, y, X_val, y_val, X_test, y_test = neur.cross_validation_sets(np.array(X), np.array(y), "basic_kaggle_data", True)
    #X_val = np.vstack([X_val, X_test])
    #y_val = np.vstack([y_val, y_test])

    hid_layer = 300

    mg = neur.split_xy(
        neur.mini_batch_gen_from_file('stack_data_wide_train.csv', 40),
        -1,
        apply_x=lambda x: scaler.transform(x.astype(float)),
        apply_y=lambda y: np.array(
            map(lambda x: [0, 1] if x == 0 else [1, 0], y.flatten())))

    #bm = rbm.RBM(13408, hid_layer)
    #costs = bm.optimize(neur.just_x(mg), 1000, 0.0007, val_set = X_val)
    #first_layer_weights = np.hstack([np.zeros((hid_layer,1)), bm.weights])
    #thetas = neur.create_initial_thetas([64, hid_layer, 2], 0.12)
    #thetas[0] = first_layer_weights

    # best so far: minibatch size 40, hidden layer 100, learning rate 0.01
    thetas, costs, val_costs = neur.gradient_decent_gen(
        mg,
        #hidden_layer_sz = 11,
        hidden_layer_sz=hid_layer,
        iter=20000,
        wd_coef=0.0,
        learning_rate=0.01,
        #thetas = thetas,
        momentum_multiplier=0.9,
        rand_init_epsilon=0.0012,
        do_early_stopping=True,
        #do_dropout = True,
        #dropout_percentage = 0.5,
        #do_learning_adapt = True,
        X_val=np.array(X_val),
        y_val=np.array(y_val))
    h_x, a = neur.forward_prop(X_val, thetas)
    binary_result = ut.map_to_max_binary_result(h_x)
    print "percentage correct predictions: ", ut.percent_equal(
        binary_result, y_val)
    print "training error:", costs[-1:][0]
    print "validation error:", val_costs[-1:][0]
    print "lowest validation error:", min(val_costs)
    plt.plot(costs, label='cost')
    plt.plot(val_costs, label='val cost')
    plt.legend()
    plt.ylabel('error rate')
    plt.show()
def run_stack(SEED): model = "Long-Lat KNN5 - 50 Features" print "Running GB, RF, ET stack." trainBase = csv_io.read_data("PreProcessData/training_PreProcess4_50.csv", skipFirstLine = False, split = "\t") test = csv_io.read_data("PreProcessData/test_PreProcess4_50.csv", skipFirstLine = False, split = "\t") weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine = False) #random.seed(SEED) #random.shuffle(trainBase) avg = 0 NumFolds = 5 # 5 is good, but 10 yeilds a better mean since outliers are less significant. (note, predictions are less reliable when using 10). predicted_list = [] bootstrapLists = [] # use this for quick runs. # note RF with 150 crashes on 30 features # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # RandomForestRegressor(n_estimators=100, n_jobs=1), #RandomForestRegressor(n_estimators=75, n_jobs=1), # clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1), # SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False) # ] #knn 5 at 3.45 #knn 15 at 3.31 #knn 25 at 3.30 #knn 40 at 3.31 # KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # LinearRegression at 3.77 # Ridge at 3.77 # SGD 4.23 #Gauss at 13 # LinearRegression(fit_intercept=True, normalize=False, copy_X=True), # Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001), # SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False), # GaussianNB() # clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2) # ] # GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ****************** # clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166) # ] # about 1 hour run time, and 3.10 score. 
#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166) # about 2 hours run time at 3.05 #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166) # about 2 hours run time at 3.06 #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166) # about 4 hours run time at 3.06 #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166) clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166) ] # use this for quick runs. # clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, 
criterion='entropy', bootstrap=True, random_state=5)] # use this for quick runs. reduced estimators to 50 # clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, # tol=0.001, verbose=False) # ] #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), #ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1) # clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6), # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', 
bootstrap=True, random_state=7)] # full algorithm stack. # clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3), # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6), # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7), # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8), # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200), # GradientBoostingClassifier(lesarn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3), # 
RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6), # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7), # RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)] print "Data size: ", len(trainBase), len(test) dataset_blend_train = np.zeros((len(trainBase), len(clfs))) dataset_blend_test = np.zeros((len(test), len(clfs))) trainNew = [] trainTestNew = [] testNew = [] trainNewSelect = [] trainTestNewSelect = [] testNewSelect = [] print "Scaling" targetPre = [x[0] for x in trainBase] trainPre = [x[1:] for x in trainBase] testPre = [x[0:] for x in test] #print trainPre[0] scaler = preprocessing.Scaler().fit(trainPre) trainScaled = scaler.transform(trainPre) testScaled = scaler.transform(testPre) #print scaler.mean_ #print scaler.std_ print "Begin Training" for ExecutionIndex, clf in enumerate(clfs): print str(clf) avg = 0 predicted_list = [] dataset_blend_test_set = np.zeros((len(test), NumFolds)) foldCount = 0 #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))] Folds = cross_validation.KFold(len(trainBase), k=NumFolds, indices=True) for train_index, test_index in Folds: #trainBaseTemp = [trainBase[i] for i in train_index] #target = [x[0] for x in trainBaseTemp] #train = [x[1:] for x in trainBaseTemp] #testBaseTemp = [trainBase[i] for i in test_index] #targetTest = [x[0] for x in testBaseTemp] #trainTest = [x[1:] for x in testBaseTemp] #test = [x[0:] for x in test] target = [targetPre[i] for i in train_index] train = [trainScaled[i] for i in train_index] targetTest = [targetPre[i] for i in test_index] trainTest = [trainScaled[i] for i in test_index] print print "Iteration: ", foldCount print "LEN: ", len(train), len(target) clf.fit(train, target) prob = clf.predict(trainTest) dataset_blend_train[test_index, ExecutionIndex] = prob probSum = 0 weightSum = 0 # totalOffByHalf = 0 # totalPositive = 0 # totalPositiveOffByHalf = 0 # totalPositivePredictions = 0 for i in range(0, len(prob)): probX = prob[i] probSum += weights[test_index[i]][0] * math.fabs(targetTest[i] - probX) weightSum += weights[test_index[i]][0] #print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX # log loss cal #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX) # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ): # totalOffByHalf = totalOffByHalf + 1 # if ( int(targetTest[i]) == 1 ): # totalPositive = totalPositive + 1 # if ( int(targetTest[i]) == 1 and probX < 0.5): # totalPositiveOffByHalf = totalPositiveOffByHalf + 1 # if (probX > 0.5): # totalPositivePredictions = totalPositivePredictions + 1 # print # print "Stats:" # print "Total Off By > 0.5 ", totalOffByHalf # print "Total Positive ", totalPositive # print "Total Positive Off By Half ", totalPositiveOffByHalf # print "Total Positive Predictions ", totalPositivePredictions #print -probSum/len(prob) print "Score: ", probSum/weightSum avg += (probSum/weightSum)/NumFolds predicted_probs = clf.predict(testScaled) #predicted_list.append([x[1] for x in predicted_probs]) 
dataset_blend_test_set[:, foldCount] = predicted_probs #[0] foldCount = foldCount + 1 dataset_blend_test[:,ExecutionIndex] = dataset_blend_test_set.mean(1) #print "Saving NP" #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set) #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) ) #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test) #print "Done Saving NP" now = datetime.datetime.now() #print dataset_blend_test_set.mean(1) csv_io.write_delimited_file_single("../predictions_50/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1)) csv_io.write_delimited_file_single("../predictions_50/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] ) csv_io.write_delimited_file("../predictions_40/RunLog.csv", [now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), str(NumFolds), model, "", ""], filemode="a",delimiter=",") print now print "------------------------Average: ", avg #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train) return dataset_blend_train, dataset_blend_test
import pickle

import numpy as np
from scipy import interp
import pylab as pl
from sklearn import preprocessing as pps, svm
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold, LeaveOneOut

with open('../data/svm_data.pkl', 'rb') as f:
    svm_data = pickle.load(f)
labels = svm_data['labels']
data = svm_data['data']

scaler = pps.Scaler().fit(data)
print "Mean: ", scaler.mean_
print "Std: ", scaler.std_
data_scaled = scaler.transform(data)

classifier = svm.SVC(probability=True)
classifier.fit(data_scaled, labels)

#print "Support Vectors: \r\n", classifier.support_vectors_
print "SV's per class: \r\n", classifier.n_support_

###############################################################################
## Code below modified from http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html#example-plot-roc-crossval-py

X, y = data_scaled, np.array(labels)
n_samples, n_features = X.shape
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--spec', help='training spec yaml file')
    parser.add_argument('--viz', action='store_true', help='just visualize')
    parser.add_argument('--no_normalize', action='store_false',
                        help='do not normalize data by stdev')
    args = parser.parse_args()

    # load spec and feature value matrix
    spec = None
    with open(args.spec, 'r') as specf:
        spec = yaml.load(specf)
    feature_data = {}
    for feature_name in spec['features']:
        filename = osp.join(osp.dirname(args.spec),
                            spec['features'][feature_name]['data'])
        with open(filename, 'r') as feature_data_file:
            feature_data[feature_name] = np.load(feature_data_file)['mat']
    feature_names = sorted(feature_data.keys())

    # create data points with labels
    LABEL_SAME = 1
    LABEL_DIFFERENT = -1
    # data has rows [feature_0 feature_1 label]
    npts = feature_data.values()[0].size
    data = np.zeros((npts, len(feature_names) + 1))
    for fnum, feature_name in enumerate(feature_data):
        data[:, fnum] = feature_data[feature_name].reshape((-1, 1))[:, 0]
    labels = np.empty_like(feature_data.values()[0])
    labels[:, :] = LABEL_DIFFERENT
    start = 0
    for i, cls in enumerate(sorted(spec['classes'].keys())):
        num_in_cls = len(spec['classes'][cls]['examples'])
        labels[start:start + num_in_cls, start:start + num_in_cls] = LABEL_SAME
        start += num_in_cls
    data[:, -1] = labels.reshape((-1, 1))[:, 0]

    if args.viz:
        import pylab as pl
        pl.scatter(data[:, 0], data[:, 1], s=30, c=data[:, 2],
                   cmap=pl.cm.Paired)
        pl.xlabel(feature_data.keys()[0])
        pl.ylabel(feature_data.keys()[1])
        pl.show()

    from sklearn import neighbors, datasets, linear_model, svm, pipeline, preprocessing
    classifiers = dict(
        knn=neighbors.KNeighborsClassifier(),
        logistic=linear_model.LogisticRegression(C=1e5),
        svm=svm.SVC(C=1e5, kernel='linear'),
    )

    X = data[:, 0:2]
    Y = data[:, 2]
    trained = {}
    for name, clf in classifiers.iteritems():
        pclf = pipeline.Pipeline([('scaler', preprocessing.Scaler()),
                                  ('classifier', clf)])
        pclf.fit(X, Y)
        trained[name] = pclf

    filename = osp.join(osp.dirname(args.spec), 'classifiers.pkl')
    print 'Writing classifiers to', filename
    with open(filename, 'wb') as f:
        import cPickle
        cPickle.dump(trained, f)

    if args.viz:
        h = .01  # step size in the mesh
        import pylab as pl
        fignum = 1
        for name, pclf in trained.iteritems():
            # Plot the decision boundary. For that, we will assign a color to
            # each point in the mesh [x_min, x_max]x[y_min, y_max].
            x_min, x_max = X[:, 0].min(), X[:, 0].max()
            y_min, y_max = X[:, 1].min(), X[:, 1].max()
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                                 np.arange(y_min, y_max, h))
            Z = pclf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            #pl.figure(fignum, figsize=(6, 6))
            pl.figure(fignum)
            pl.pcolormesh(xx, yy, Z, cmap=pl.cm.Paired)

            # Plot also the training points
            pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)
            pl.title(name)
            pl.xlabel(feature_data.keys()[0])
            pl.ylabel(feature_data.keys()[1])
            fignum += 1
        pl.show()
# Get data
data = fetch_sdss_sspp(cleaned=True)
X = np.vstack([data['FeH'], data['alphFe']]).T

# truncate dataset for speed
X = X[::5]

#------------------------------------------------------------
# Compute a 2D histogram of the input
H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 50)

#------------------------------------------------------------
# Compute the KMeans clustering
n_clusters = 4

scaler = preprocessing.Scaler()
clf = KMeans(n_clusters)
clf.fit(scaler.fit_transform(X))

#------------------------------------------------------------
# Visualize the results
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot()

# plot density
ax = plt.axes()
ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto',
          extent=[FeH_bins[0], FeH_bins[-1],
                  alphFe_bins[0], alphFe_bins[-1]],
def PreProcess4(N_Features):
    trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv",
                                 skipFirstLine=False, split="\t")
    test = csv_io.read_data("PreProcessData/test_PreProcess3.csv",
                            skipFirstLine=False, split="\t")

    shutil.copy2("PreProcessData/DataClassList3.csv",
                 "PreProcessData/DataClassList4.csv")

    target = [x[0] for x in trainBase]
    train = [x[1:] for x in trainBase]

    DataClassList = csv_io.read_data("PreProcessData/DataClassList4.csv",
                                     False)

    print "Data len: ", len(train[0])
    print "DataClassList len: ", len(DataClassList)
    #return

    # this seems about optimal, but has not been tuned on latest improvements.
    NumFeatures = N_Features
    # NOTE going from 30 to 20 features on KNN5 set has almost no effect.
    # Down to 15 is significant loss.
    # for GBM at 6 and 400 30 is 3.01 and 30 3.05.

    print "Scaling"
    term = 5000  # scaler has memory errors between 5000 and 10000
    #term = len(trainBase)
    targetPre = [x[0] for x in trainBase][0:term]
    trainPre = [x[1:] for x in trainBase][0:term]
    #testPre = [x[0:] for x in test][0:term]
    targetPre = target[0:term]
    #print trainPre[term - 1]
    scaler = preprocessing.Scaler().fit(trainPre)
    trainScaled = scaler.transform(trainPre)
    #testScaled = scaler.transform(testPre)

    #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True)
    clf = GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5,
                                    max_depth=6, n_estimators=400,
                                    random_state=166, min_samples_leaf=30)

    print "Training"
    clf.fit(trainScaled, targetPre)

    trainNew = []
    testNew = []

    print "Computing Importances"
    importances = clf.feature_importances_

    DataClassListNew = []
    for DataIndex, DataClass in enumerate(DataClassList):
        print DataClass[0], importances[DataIndex]
        DataClassListNew.append([DataClass[0], importances[DataIndex]])

    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_" + str(NumFeatures) +
        ".csv", DataClassListNew)

    DataClassListNew_temp = sorted(DataClassListNew,
                                   key=operator.itemgetter(1),
                                   reverse=True)
    csv_io.write_delimited_file(
        "PreProcessData/DataClassList_Importances_sorted_" +
        str(NumFeatures) + ".csv", DataClassListNew_temp)

    importancesTemp = sorted(importances, reverse=True)
    print len(importancesTemp), "importances"

    if (len(importancesTemp) > NumFeatures):
        threshold = importancesTemp[NumFeatures]
        print "Importance threshold: ", threshold

        rowIndex = 0
        for row in train:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (impIndex == 0):
                    newRow.append(target[rowIndex])
                if (importance > threshold):
                    newRow.append(row[impIndex])
            trainNew.append(newRow)
            rowIndex += 1

        for row in test:
            newRow = []
            for impIndex, importance in enumerate(importances):
                if (importance > threshold):
                    newRow.append(row[impIndex])
            testNew.append(newRow)

    csv_io.write_delimited_file("PreProcessData/training_PreProcess4_" +
                                str(NumFeatures) + ".csv", trainNew,
                                delimiter="\t")
    csv_io.write_delimited_file("PreProcessData/test_PreProcess4_" +
                                str(NumFeatures) + ".csv", testNew,
                                delimiter="\t")
def PreProcess4(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess2.csv", skipFirstLine=False, split="\t") test = csv_io.read_data("PreProcessData/test_PreProcess2.csv", skipFirstLine=False, split="\t") shutil.copy2("PreProcessData/DataClassList2.csv", "PreProcessData/DataClassList4Base.csv") target = [x[0] for x in trainBase] train = [x[1:] for x in trainBase] DataClassList = csv_io.read_data("PreProcessData/DataClassList4Base.csv", False) print "Data len: ", len(train[0]) print "DataClassList len: ", len(DataClassList) #return # this seems about optimal, but has not been tuned on latest improvements. NumFeatures = 40 # NOTE going from 30 to 20 features on KNN5 set has almost no effect. Down to 15 is significant loss. # for GBM at 6 and 400 30 is 3.01 and 30 3.05. print "Scaling" targetPre = [x[0] for x in trainBase] trainPre = [x[1:] for x in trainBase] testPre = [x[0:] for x in test] #print trainPre[0] scaler = preprocessing.Scaler().fit(trainPre) trainScaled = scaler.transform(trainPre) #testScaled = scaler.transform(testPre) #clf = RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True) clf = RandomForestRegressor(n_estimators=25, n_jobs=1, compute_importances=True) #clf = ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True,compute_importances=True) print "Training" # producing memory errors, probably too much data. # recommend to use linear lasso. #est = LinearRegression(fit_intercept=True, normalize=False, copy_X=True) #selector = RFE(est, 20, step=10) #selector = selector.fit(trainScaled, target) #print selector.support_ #print selector.ranking_ #return #trainPost = selector.transform(trainPre) #testPost = selector.transform(testPre) clf.fit(trainScaled, target) trainNew = [] testNew = [] print "Computing Importances" importances = clf.feature_importances_ DataClassListNew = [] for DataIndex, DataClass in enumerate(DataClassList): print DataClass[0], importances[DataIndex] DataClassListNew.append([DataClass[0], importances[DataIndex]]) csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_Base.csv", DataClassListNew) DataClassListNew_temp = sorted(DataClassListNew, key=operator.itemgetter(1), reverse=True) csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_sorted_Base.csv", DataClassListNew_temp) importancesTemp = sorted(importances, reverse=True) print len(importancesTemp), "importances" if (len(importancesTemp) > NumFeatures): threshold = importancesTemp[NumFeatures] print "Importance threshold: ", threshold rowIndex = 0 for row in train: newRow = [] for impIndex, importance in enumerate(importances): if (impIndex == 0): newRow.append(target[rowIndex]) if (importance > threshold): newRow.append(row[impIndex]) trainNew.append(newRow) rowIndex += 1 for row in test: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): newRow.append(row[impIndex]) testNew.append(newRow) csv_io.write_delimited_file("PreProcessData/training_PreProcess4_Base.csv", trainNew, delimiter="\t") csv_io.write_delimited_file("PreProcessData/test_PreProcess4_Base.csv", testNew, delimiter="\t")
def PreProcess4(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv", skipFirstLine=False, split="\t") test = csv_io.read_data("PreProcessData/test_PreProcess3.csv", skipFirstLine=False, split="\t") shutil.copy2("PreProcessData/DataClassList3.csv", "PreProcessData/DataClassList5_PCA.csv") target = [x[0] for x in trainBase] train = [x[1:] for x in trainBase] DataClassList = csv_io.read_data("PreProcessData/DataClassList5_PCA.csv", False) print "Data len: ", len(train[0]) print "DataClassList len: ", len(DataClassList) NumFeatures = 40 print "Scaling" targetPre = [x[0] for x in trainBase] trainPre = [x[1:] for x in trainBase] testPre = [x[0:] for x in test] #print trainPre[0] scaler = preprocessing.Scaler().fit(trainPre) trainScaled = scaler.transform(trainPre) testScaled = scaler.transform(testPre) #clf = RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini',compute_importances=True) #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True) #clf = ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True,compute_importances=True) clf = PCA(n_components=NumFeatures) print "Training" # producing memory errors, probably too much data. # recommend to use linear lasso. #est = LinearRegression(fit_intercept=True, normalize=False, copy_X=True) #selector = RFE(est, 20, step=10) #selector = selector.fit(trainScaled, target) #print selector.support_ #print selector.ranking_ #return #trainPost = selector.transform(trainPre) #testPost = selector.transform(testPre) clf.fit(trainScaled, target) trainNew = [] testNew = [] print "Computing Importances" importances = clf.explained_variance_ratio_ #DataClassListNew = [] #for DataIndex, DataClass in enumerate(DataClassList): # print DataClass[0], importances[DataIndex]; # DataClassListNew.append([DataClass[0], importances[DataIndex]]) csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_PCA.csv", importances) DataClassListNew_temp = sorted(importances, key=operator.itemgetter(1), reverse=True) csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_sorted_PCA.csv", DataClassListNew_temp) importancesTemp = sorted(importances, reverse=True) print len(importancesTemp), "importances" trainNew = clf.transform(trainScaled) testNew = clf.transform(testScaled) #if ( len(importancesTemp) > NumFeatures): # threshold = importancesTemp[NumFeatures] # print "Importance threshold: ", threshold # rowIndex = 0 # for row in train: # newRow = [] # for impIndex, importance in enumerate(importances): # if ( impIndex == 0): # newRow.append(target[rowIndex]) # if ( importance > threshold ): # newRow.append(row[impIndex]) # trainNew.append(newRow) # rowIndex += 1 # for row in test: # newRow = [] # for impIndex, importance in enumerate(importances): # if ( importance > threshold ) : # newRow.append(row[impIndex]) # testNew.append(newRow) csv_io.write_delimited_file("PreProcessData/training_PreProcess5_PCA.csv", trainNew, delimiter="\t") csv_io.write_delimited_file("PreProcessData/test_PreProcess5_PCA.csv", testNew, delimiter="\t")
pp = pprint.PrettyPrinter(indent=4)

y_labels = open("y.txt").read().split("\n")[0:-1]
x_labels = open("x_real.txt").read().split("\n")[0:-1]

data_x = open("DATA_X", "r")
X = cPickle.load(data_x)
data_x.close()

data_y = open("DATA_Y", "r")
Y = cPickle.load(data_y)
data_y.close()

print "Scaling"
scaler = preprocessing.Scaler().fit(X)
X = scaler.transform(X)

print "Normalising"
normaliser = preprocessing.Normalizer().fit(X)
X = normaliser.transform(X)

print "Training"
DELTAS = []
imax = COL_LIMIT if COL_LIMIT < Y.shape[1] else Y.shape[1]
for i in range(imax):
    i = 53
def PreProcess5(): #note, 275 represents too much data, and the scaler fails with an exception. trainBase = csv_io.read_data("PreProcessData/training_PreProcess5_250.csv", skipFirstLine=False, split="\t") test = csv_io.read_data("PreProcessData/test_PreProcess5_250.csv", skipFirstLine=False, split="\t") #shutil.copy2("PreProcessData/DataClassList5.csv", "PreProcessData/DataClassList6.csv") target = [x[0] for x in trainBase] train = [x[1:] for x in trainBase] DataClassList = csv_io.read_data( "PreProcessData/DataClassList_Importances_250.csv", False) print "Data len: ", len(train[0]) print "DataClassList len: ", len(DataClassList) #return # this seems about optimal, but has not been tuned on latest improvements. NumFeatures = 40 # NOTE going from 30 to 20 features on KNN5 set has almost no effect. Down to 15 is significant loss. # for GBM at 6 and 400 30 is 3.01 and 30 3.05. print "Scaling" targetPre = [x[0] for x in trainBase][0:10000] print "Scaling1" trainPre = [x[1:] for x in trainBase][0:10000] #testPre = [x[0:] for x in test] print "Scaling2" scaler = preprocessing.Scaler().fit(trainPre) print "Scaling3" trainScaled = scaler.transform(trainPre) #testScaled = scaler.transform(testPre) #clf = RandomForestRegressor(n_estimators=25, n_jobs=1,compute_importances=True) #gc.collect() print "Prep Classes" # prep for usage below... DataClassListTemp = [] for DataIndex, DataClass in enumerate(DataClassList): DataClassListTemp.append([DataClass[0], 0]) DataClassList = DataClassListTemp reduceBy = 5 totalFeatures = len(trainPre[0]) trainNew = [] testNew = [] print "Processing" while (totalFeatures > NumFeatures): if (totalFeatures - NumFeatures < 40): reduceBy = 3 if (totalFeatures - NumFeatures < 20): reduceBy = 2 if (totalFeatures - NumFeatures < 10): reduceBy = 1 if (totalFeatures - NumFeatures < reduceBy): reduceBy = totalFeatures - NumFeatures print "Reduce Features: ", reduceBy print "Training" clf = GradientBoostingRegressor(loss='ls', learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166, min_samples_leaf=30) clf.fit(trainScaled, targetPre) print "Computing Importances" importances = clf.feature_importances_ #print importances importancesSorted = sorted(importances, reverse=True) #print importancesSorted threshold = importancesSorted[len(importancesSorted) - reduceBy] print threshold #trainScaled = clf.transform(trainScaled, threshold) # only exists in RF trainScaledNew = [] for row in trainScaled: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): newRow.append(row[impIndex]) trainScaledNew.append(newRow) trainScaled = trainScaledNew print "Cols:", len(trainScaled) print "Rows:", len(trainScaled[0]) totalFeatures = totalFeatures - reduceBy print "Total Features:", totalFeatures trainNew = [] testNew = [] for row in train: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): newRow.append(row[impIndex]) trainNew.append(newRow) train = trainNew for row in test: newRow = [] for impIndex, importance in enumerate(importances): if (importance > threshold): newRow.append(row[impIndex]) testNew.append(newRow) test = testNew print "Train Cols:", len(train) print "Train Rows:", len(train[0]) print "Test Cols:", len(test) print "Test Rows:", len(test[0]) DataClassListNew = [] for Index, importance in enumerate(importances): if (importance > threshold): print DataClassList[Index][0], importance DataClassListNew.append([DataClassList[Index][0], importance]) DataClassList = DataClassListNew 
print "Data Transform Complete" # final steps, save data classes in new set csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_RFE2_" + str(NumFeatures) + ".csv", DataClassListNew) DataClassListNew_temp = sorted(DataClassListNew, key=operator.itemgetter(1), reverse=True) csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_RFE2_sorted_" + str(NumFeatures) + ".csv", DataClassListNew_temp) # prepend the target on each row. trainFinal = [] rowIndex = 0 for row in train: newRow = [] for Index, val in enumerate(row): if (Index == 0): newRow.append(target[rowIndex]) newRow.append(val) trainFinal.append(newRow) rowIndex += 1 csv_io.write_delimited_file("PreProcessData/training_PreProcess6_RFE2_" + str(NumFeatures) + ".csv", trainFinal, delimiter="\t") csv_io.write_delimited_file("PreProcessData/test_PreProcess6_RFE2_" + str(NumFeatures) + ".csv", testNew, delimiter="\t")
def PreProcess4(): trainBase = csv_io.read_data("PreProcessData/training_PreProcess3.csv", skipFirstLine=False, split="\t") test = csv_io.read_data("PreProcessData/test_PreProcess3.csv", skipFirstLine=False, split="\t") shutil.copy2("PreProcessData/DataClassList3.csv", "PreProcessData/DataClassList4.csv") target = [x[0] for x in trainBase] train = [x[1:] for x in trainBase] DataClassList = csv_io.read_data("PreProcessData/DataClassList4.csv", False) print "Data len: ", len(train[0]) print "DataClassList len: ", len(DataClassList) #return # this seems about optimal, but has not been tuned on latest improvements. NumFeatures = 40 # NOTE going from 30 to 20 features on KNN5 set has almost no effect. Down to 15 is significant loss. # for GBM at 6 and 400 30 is 3.01 and 30 3.05. print "Scaling" targetPre = [x[0] for x in trainBase] trainPre = [x[1:] for x in trainBase] testPre = [x[0:] for x in test] #print trainPre[0] scaler = preprocessing.Scaler().fit(trainPre) trainScaled = scaler.transform(trainPre) #testScaled = scaler.transform(testPre) clf = RandomForestRegressor(n_estimators=25, n_jobs=1, compute_importances=True) reduceBy = 5 clf.fit(trainScaled, target) print "Computing Importances" importances = clf.feature_importances_ print importances importancesSorted = sorted(importances, reverse=True) print importancesSorted threshold = importancesSorted[len(importancesSorted) - reduceBy] print threshold trainScaled = clf.transform(trainScaled, threshold) return trainNew = [] testNew = [] DataClassListNew = [] for DataIndex, DataClass in enumerate(DataClassList): print DataClass[0], selector.ranking_[DataIndex] DataClassListNew.append([DataClass[0], selector.ranking_[DataIndex]]) csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_RFE_" + str(NumFeatures) + ".csv", DataClassListNew) DataClassListNew_temp = sorted( DataClassListNew, key=operator.itemgetter(1)) # , reverse=True csv_io.write_delimited_file( "PreProcessData/DataClassList_Importances_RFE_sorted_" + str(NumFeatures) + ".csv", DataClassListNew_temp) #importancesTemp = sorted(importances, reverse=True) #print len(importancesTemp), "importances" if (len(selector.ranking_) > NumFeatures): #threshold = importancesTemp[NumFeatures] threshold = NumFeatures print "Importance threshold: ", threshold rowIndex = 0 for row in train: newRow = [] for impIndex, importance in enumerate(selector.ranking_): if (impIndex == 0): newRow.append(target[rowIndex]) if (importance < threshold): newRow.append(row[impIndex]) trainNew.append(newRow) rowIndex += 1 for row in test: newRow = [] for impIndex, importance in enumerate(selector.ranking_): if (importance < threshold): newRow.append(row[impIndex]) testNew.append(newRow) csv_io.write_delimited_file("PreProcessData/training_PreProcess4_RFE_" + str(NumFeatures) + ".csv", trainNew, delimiter="\t") csv_io.write_delimited_file("PreProcessData/test_PreProcess4_RFE_" + str(NumFeatures) + ".csv", testNew, delimiter="\t")