def test_cross_val_generator_mask_indices_same():
    """Check that mask mode and index mode describe the same splits.

    Every generator is built twice, once with ``indices=False`` and once
    with ``indices=True``; the positions selected by each boolean mask must
    equal the corresponding index array, fold by fold.
    """
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])

    # One factory per generator; only the ``indices`` flag varies.
    factories = [
        lambda indices: cval.LeaveOneOut(5, indices=indices),
        lambda indices: cval.LeavePOut(10, 2, indices=indices),
        lambda indices: cval.KFold(10, 5, indices=indices, shuffle=True,
                                   random_state=1),
        lambda indices: cval.StratifiedKFold(y, 3, indices=indices),
        lambda indices: cval.LeaveOneLabelOut(labels, indices=indices),
        lambda indices: cval.LeavePLabelOut(labels, 2, indices=indices),
    ]

    # Build every (mask, index) pair up front, mask variant first.
    cv_pairs = [(make(False), make(True)) for make in factories]

    for masked_cv, indexed_cv in cv_pairs:
        for masked_fold, indexed_fold in zip(masked_cv, indexed_cv):
            fit_mask, eval_mask = masked_fold
            fit_idx, eval_idx = indexed_fold
            assert_array_equal(np.where(fit_mask)[0], fit_idx)
            assert_array_equal(np.where(eval_mask)[0], eval_idx)
def test_leave_label_out_changing_labels():
    """Check label-based generators ignore later changes to their labels.

    A generator built from an array that is overwritten before iteration
    must yield the same folds as one built from an untouched copy.
    """
    labels = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    overwritten = labels.copy()

    cv_pairs = [
        (cval.LeaveOneLabelOut(labels),
         cval.LeaveOneLabelOut(overwritten)),
        (cval.LeavePLabelOut(labels, p=2),
         cval.LeavePLabelOut(overwritten, p=2)),
    ]

    # Clobber the labels only after the generators have been constructed.
    overwritten[:] = 0

    for reference_cv, changed_cv in cv_pairs:
        for reference_fold, changed_fold in zip(reference_cv, changed_cv):
            ref_train, ref_test = reference_fold
            chg_train, chg_test = changed_fold
            assert_array_equal(ref_train, chg_train)
            assert_array_equal(ref_test, chg_test)
def train_and_test_model(data, response, labels, model_type, split_by, c, impute=True, varname=""):
    """Cross-validate a model on ``response``, plot it, then refit on all data.

    The estimator, a one-letter kind flag and a display name are looked up
    in the module-level ``models`` table under ``model_type``. Kind ``'c'``
    uses a single stratified shuffle split holding out 20% of the samples;
    any other kind uses leave-one-label-out over ``labels``. Held-out
    predictions are written into an array that starts as zeros, so samples
    never placed in a test fold keep the value 0.

    ``split_by``, ``c`` and ``impute`` are accepted but not used in this
    function body.

    Returns the estimator after a final fit on the full ``data``/``response``.
    """
    estimator, kind, display_name = models[model_type]

    splitter = (
        cross_validation.StratifiedShuffleSplit(response, 1, 0.2)
        if kind == 'c'
        else cross_validation.LeaveOneLabelOut(labels)
    )

    held_out = np.zeros(response.shape)
    for fit_idx, eval_idx in splitter:
        estimator.fit(data[fit_idx], response[fit_idx])
        held_out[eval_idx] = estimator.predict(data[eval_idx])

    title = "%s Model Performance" % display_name
    plot_obs_pred(held_out, response, title, varname)

    # Final fit on everything so the returned model has seen all samples.
    estimator.fit(data, response)
    return estimator
def test_cross_val_generator_with_mask():
    """Smoke-test that mask-mode folds can index plain arrays.

    Each generator is created with ``indices=False``; every yielded
    train/test pair is used to index ``X`` and ``y``, and the test passes
    as long as no indexing operation raises.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])

    mask_generators = [
        cval.LeaveOneOut(4, indices=False),
        cval.LeavePOut(4, 2, indices=False),
        cval.KFold(4, 2, indices=False),
        cval.StratifiedKFold(y, 2, indices=False),
        cval.LeaveOneLabelOut(labels, indices=False),
        cval.LeavePLabelOut(labels, 2, indices=False),
        cval.ShuffleSplit(4, indices=False),
    ]

    for generator in mask_generators:
        for train, test in generator:
            # Only the indexing itself matters; the results are discarded.
            for array in (X, y):
                array[train], array[test]
def test_cross_indices_exception():
    """Check ``check_cv`` raises ValueError for mask-mode CV on a COO matrix.

    Each generator is built with ``indices=False`` and passed to
    ``cval.check_cv`` together with a ``coo_matrix`` ``X``.
    """
    X = coo_matrix(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])

    mask_generators = [
        cval.LeaveOneOut(4, indices=False),
        cval.LeavePOut(4, 2, indices=False),
        cval.KFold(4, 2, indices=False),
        cval.StratifiedKFold(y, 2, indices=False),
        cval.LeaveOneLabelOut(labels, indices=False),
        cval.LeavePLabelOut(labels, 2, indices=False),
    ]

    for generator in mask_generators:
        assert_raises(ValueError, cval.check_cv, generator, X, y)
def classify(data, labels, args):
    """Classify every label group with a pickled model and aggregate the result.

    The samples are grouped with leave-one-label-out over ``labels``; only
    the test indices of each fold are used. The per-sample predictions of
    a group are reduced to one result according to ``args.aggregation``.

    Parameters
    ----------
    data : indexable by an index array
        Samples to classify.
    labels : sequence
        Group label per sample; one result is produced per distinct label.
    args : object
        Needs ``model`` (path of the pickled model) and ``aggregation``
        (one of ``'mode'``, ``'sum'``, ``'far'``, ``'mean'``, ``'median'``).

    Returns
    -------
    dict
        Maps each label to its aggregated result.

    Raises
    ------
    ValueError
        If ``args.aggregation`` is not a supported method. Previously this
        surfaced as an ``UnboundLocalError`` on ``res`` after the model had
        already been loaded and run.
    """
    aggregation = args.aggregation
    supported = ('mode', 'sum', 'far', 'mean', 'median')
    if aggregation not in supported:
        raise ValueError(
            "Unknown aggregation method %r; expected one of %s"
            % (aggregation, ", ".join(supported)))

    classification = {}

    # Read model
    # NOTE: unpickling executes arbitrary code from the file; args.model
    # must only ever point at a trusted model file.
    with open(args.model, "rb") as f:
        model = pickle.load(f)
    print_verbose(
        "Model [%0.2f%%]: %s" % (model.best_score_ * 100,
                                 str(model.best_estimator_)), 4)

    # Classify each label
    lolo = cross_validation.LeaveOneLabelOut(labels)
    print_verbose("LeaveOneOut: %s" % str(lolo), 5)
    for train_index, test_index in lolo:
        # Every sample of a fold's test set shares one label.
        label = labels[test_index[0]]
        print_verbose("Test index: %s" % str(test_index), 5)
        print_verbose("Classifying label: %s" % str(label), 4)

        # Classify: 'mode' votes on hard predictions, every other method
        # works on decision-function values.
        if aggregation == 'mode':
            pred = model.predict(data[test_index])
        else:
            pred = model.decision_function(data[test_index])
        print_verbose("Patch prediction: %s" % str(pred), 4)

        # Aggregate
        if aggregation == 'mode':
            res = agg_pred_mode(pred)
        elif aggregation == 'sum':
            res = agg_pred_dist_sumall(pred, model.best_estimator_.classes_)
        elif aggregation == 'far':
            res = agg_pred_dist_far(pred, model.best_estimator_.classes_)
        elif aggregation == 'mean':
            res = agg_pred_dist_meangroup(pred, model.best_estimator_.classes_)
        else:  # 'median' -- the only value left after the check above
            res = agg_pred_dist_mediangroup(pred, model.best_estimator_.classes_)
        print_verbose("Aggregate result: %s" % str(res), 4)

        # Append to final result
        classification[label] = res

    print_verbose("Classification: %s" % str(classification), 5)
    return classification
def test_cross_val_generator_with_default_indices():
    """Check that generators built with default arguments yield index arrays.

    For every fold, neither the train nor the test selector may have a
    boolean dtype, and both must be usable to index ``X`` and ``y``.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            # The second assertion used to repeat the check on ``train``,
            # leaving ``test`` unverified.
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # Indexing must work; the results are intentionally discarded.
            X[train], X[test]
            y[train], y[test]
def test_cross_val_generator_with_default_indices():
    """Check that generators built with default arguments yield index arrays.

    For every fold, neither the train nor the test selector may have a
    boolean dtype, and both must be usable to index ``X`` and ``y``.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            # The second assertion used to repeat the check on ``train``,
            # leaving ``test`` unverified.
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # Indexing must work; the sliced arrays themselves are unused.
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
def forward_selection_lodo(model, features, df, scoring_metric, ref_column, days_tr, n_feat, factor, cutoff): #initialize the best_features list with the base features to force their inclusion best_features = [] score_cv = [] RMSE = [] while len(features) > 0 and len(best_features) < n_feat: next_feature, score_cv_feat = forward_selection_step(model, best_features, features, df, ref_column, scoring_metric, days_tr, factor, cutoff) #add the next feature to the list best_features += [next_feature] MSE_feat = -np.mean(cross_val_score(model, df[best_features].values, df[ref_column].values, cv = cross_validation.LeaveOneLabelOut(days_tr), scoring = 'mean_squared_error')) RMSE_features = round(np.sqrt(MSE_feat), 1) score_cv.append((score_cv_feat)) RMSE.append(RMSE_features) print 'Next best Feature: ', next_feature, ',', 'Score: ', score_cv_feat, 'RMSE: ', RMSE_features, "#:", len(best_features) #remove the added feature from the list features.remove(next_feature) print "Best Features: ", best_features return best_features, score_cv, RMSE
def show_cross_val(method):
    """Print the train/test splits produced by one cross-validation scheme.

    ``method`` selects the generator: ``"lolo"`` (leave-one-label-out) and
    ``"lplo"`` (leave-two-labels-out) split a fixed five-element season
    label array; ``"loo"`` and ``"lpo"`` (p=3) split ``len(y)`` samples.
    The module-level ``X`` and ``y`` are indexed with every split and the
    resulting pieces are printed.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four names above. Previously this
        surfaced as an ``UnboundLocalError`` on ``cv``.
    """
    if method == "lolo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeaveOneLabelOut(labels)
    elif method == "lplo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeavePLabelOut(labels, p=2)
    elif method == "loo":
        cv = cross_validation.LeaveOneOut(n=len(y))
    elif method == "lpo":
        cv = cross_validation.LeavePOut(n=len(y), p=3)
    else:
        raise ValueError(
            "Unknown cross-validation method %r; expected one of "
            "'lolo', 'lplo', 'loo', 'lpo'" % (method,))

    for train_index, test_index in cv:
        # Left as a multi-argument call: without print_function, Python 2
        # prints this as a tuple, exactly as before.
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Single-argument form: same text as the former print statements,
        # and valid syntax on both Python 2 and Python 3.
        print("X_train: %s" % (X_train,))
        print("y_train: %s" % (y_train,))
        print("X_test: %s" % (X_test,))
        print("y_test: %s" % (y_test,))
def multi_regr_lol(self, delete):
    """Fit and report one regression per leave-one-label-out fold.

    For fold ``k`` the columns named in ``delete[k]`` are dropped from a
    copy of ``self.data``; column 0 is the response and the remaining
    columns are the predictors. ``self.model(...).fit()`` is trained on the
    fold's training rows, its test-row predictions are appended to
    ``self.y_test_pred``, and adjusted R^2 / RMSE are printed for both the
    test and the training rows. The three largest t-values are shown as a
    bar plot saved to ``<response>_<year>.pdf``, where the year starts at
    2011 and advances by two per fold.

    The grouping variable ``label`` is read from the enclosing scope; it is
    not defined in this method.
    """
    first_year = 2011
    folds = cv.LeaveOneLabelOut(label)

    for fold_no, (train_index, test_index) in enumerate(folds):
        fold_year = first_year + fold_no * 2

        # Work on a copy so the column deletions stay local to this fold.
        data = self.data[:]
        for column in delete[fold_no]:
            del data[column]

        y_train = data.ix[train_index, 0]
        model = self.model(y_train, data.ix[train_index, 1:]).fit()

        test_pred = pd.DataFrame(model.predict(data.ix[test_index, 1:]))
        self.y_test_pred = pd.concat([self.y_test_pred, test_pred],
                                     ignore_index=True)

        n_predictors = len(data.columns[1:])

        # Fit quality on the training rows (in-sample predictions).
        predata_r2_adj = r2_adj_score(
            y_train, pd.DataFrame(model.predict()), n_predictors)
        predata_rmse = root_mean_squared_error(
            y_train, pd.DataFrame(model.predict()))

        # Fit quality on the held-out rows.
        y_test = data.ix[test_index, 0]
        rmse = root_mean_squared_error(
            y_test, self.y_test_pred.ix[test_index])
        r2_adj = r2_adj_score(
            y_test, self.y_test_pred.ix[test_index], n_predictors)

        print(model.summary())
        print(fold_year, "年:", r2_adj, rmse)
        print(fold_year, "年の訓練データへのあてはまり:",
              predata_r2_adj, predata_rmse, end="\n\n\n")

        # Plot the three variables with the largest t-values.
        ranked = pd.DataFrame(model.tvalues, columns=['t_value'])
        ranked = ranked.sort_values(by='t_value', ascending=False)
        tvalue = ranked[:3]
        print(tvalue)

        sns.set(font='IPAPGothic')
        sns.barplot(x=tvalue.index, y='t_value', data=tvalue,
                    palette='autumn')
        sns.plt.ylabel("t 値", fontsize=20)
        sns.plt.xlabel("")
        sns.plt.tick_params(labelsize=18)
        sns.plt.savefig(data.columns[0] + '_' + str(fold_year) + '.pdf')
        sns.plt.show()
def main(input_file, adjective_file, train_feature_pkl, test_feature_pkl, ensemble_test_feature_pkl, all_classifiers_pkl, scaler_pkl, bolt_feature_obj_pkl):
    """Run the feature-extraction and ensemble-training pipeline.

    Stage 1 -- data. When ``bolt_feature_obj_pkl`` is None the raw data is
    loaded from ``input_file`` (via ``loadDataFromH5File`` together with
    ``adjective_file`` when the name ends in ``.h5``, otherwise via
    ``utilities.loadBoltObjFile``), split leave-one-object-out on
    ``object_id``, and per-object PCA and feature-object pickles are
    written under ``pca_pkls/``, ``train_pkls/``, ``test_pkls/`` and
    ``ensemble_pkls/``. Otherwise the feature-object dictionary is
    unpickled from ``bolt_feature_obj_pkl``.

    Stage 2 -- classifiers. Two fixed object-id lists are passed to
    ``PickOutObjects``. When ``all_classifiers_pkl`` or ``scaler_pkl`` is
    None, feature vectors and scalers are created and ``scaler.pkl`` is
    written; otherwise classifiers, scalers and the test feature objects
    (``test_feature_pkl``) are unpickled.

    Stage 3 -- ensemble. Ensemble feature vectors are extracted, eleven
    adjectives are deleted from the label dictionaries, the ensemble is
    trained and written to ``all_ensemble_classifiers.pkl``.

    ``train_feature_pkl`` and ``ensemble_test_feature_pkl`` are referenced
    only in commented-out code within this function.

    NOTE(review): the function drops into ``pdb`` at several points, and
    several names it reads are never assigned (see the inline notes). The
    block nesting shown here was reconstructed from a whitespace-collapsed
    copy of the source -- confirm it against the original file.
    """
    # Load data into the pipeline. First check
    # for feature object pkl files
    print "Loading data from file\n"
    # if train_feature_pkl == None or test_feature_pkl == None or ensemble_test_feature_pkl == None:
    if bolt_feature_obj_pkl == None:
        # If no features, load data from either an
        # h5 and adjective file or directly from
        # a saved pkl file
        if input_file.endswith(".h5"):
            all_data = loadDataFromH5File(input_file, adjective_file)
        else:
            all_data = utilities.loadBoltObjFile(input_file)
        print "Loaded data\n"
        # The string literal below is disabled code kept by the author; it
        # is evaluated and discarded at runtime.
        """ # Remove the duplicated MDF_320, and save a new all_data.pkl all_data_new = dict() toremove = [290, 291, 292, 293, 294, 295, 296, 297, 298, 299] for motion_name in all_data: all_data_new[motion_name] = np.delete(all_data[motion_name], toremove) cPickle.dump(all_data_new, open("all_data.pkl", "w"), cPickle.HIGHEST_PROTOCOL) import pdb; pdb.set_trace() pass """
        # Split the data by leaving one object out as ensemble_test_data for each time and cycle through all objects
        # Generate the stratifications (labels) for picking out an object:
        # one object_id per recording of the 'tap' motion.
        obj_id_vector = []
        for num in np.arange(len(all_data['tap'])):
            obj_id_vector.append(all_data['tap'][num].object_id)
        lol = cross_validation.LeaveOneLabelOut(np.array(obj_id_vector))
        # NOTE(review): obj_id_list is not read again in this function.
        obj_id_list = np.unique(obj_id_vector).tolist()
        # We may pickle this cross validation generator "lol" later
        train_set = dict()
        ensemble_test_set = dict()
        for train_index, test_index in lol:
            print "TRAIN_INDEX: %s TEST_INDEX: %s" % (train_index, test_index)
            train_data = dict()
            ensemble_test_data = dict()
            for motion_name in all_data:
                train_data_array = np.array(all_data[motion_name])[train_index]
                ensemble_test_data_array = np.array(
                    all_data[motion_name])[test_index]
                # The held-out object's id is taken from the first test
                # sample and used as the key for this fold.
                obj_id = ensemble_test_data_array[0].object_id
                train_data[motion_name] = train_data_array.tolist()
                ensemble_test_data[
                    motion_name] = ensemble_test_data_array.tolist()
            train_set[obj_id] = train_data
            ensemble_test_set[obj_id] = ensemble_test_data
            #cPickle.dump(train_data, open("train_data_"+str(obj_id)+".pkl", "w"), cPickle.HIGHEST_PROTOCOL)
            #cPickle.dump(ensemble_test_data,open("ensemble_test_data_"+%(obj_id)+".pkl","w"), cPickle.HIGHEST_PROTOCOL)
        #cPickle.dump(train_set, open("train_set.pkl", "w"), cPickle.HIGHEST_PROTOCOL))
        #cPickle.dump(ensemble_test_set, open("ensemble_test_set.pkl"), "w", cPickle.HIGHEST_PROTOCOL))
        # Split the data into train and final test
        # train_data, ensemble_test_data = utilities.split_data(all_data, 0.9)
        for obj_id in train_set:
            # Split the train data again into train and test
            train_data, test_data = utilities.split_data(
                train_set[obj_id], 0.7)
            # Fit PCA for electrodes on training data
            print "Fitting PCA for electrode data\n"
            electrode_pca_dict = fit_electrodes_pca(train_data)
            # Store off PCA pkl
            # NOTE(review): here and below, files are opened in text mode
            # ("w"/"r") for a binary pickle protocol and never explicitly
            # closed -- confirm this is acceptable on the target platform.
            cPickle.dump(electrode_pca_dict,
                         open("pca_pkls/pca_" + str(obj_id) + ".pkl", "w"),
                         cPickle.HIGHEST_PROTOCOL)
            print "PCA transforms stored as 'pca.pkl'\n"
            # Convert motion objects into feature objects
            print "Generating feature object dictionaries\n"
            train_all_features_obj_dict = BoltMotionObjToFeatureObj(
                train_data, electrode_pca_dict)
            test_all_features_obj_dict = BoltMotionObjToFeatureObj(
                test_data, electrode_pca_dict)
            # NOTE(review): ensemble_test_data is whatever the last
            # iteration of the leave-one-object-out loop left behind, not
            # ensemble_test_set[obj_id] -- looks unintended, confirm.
            ensemble_test_all_features_obj_dict = BoltMotionObjToFeatureObj(
                ensemble_test_data, electrode_pca_dict)
            # Store off feature object pkls
            cPickle.dump(
                train_all_features_obj_dict,
                open("train_pkls/train_feature_objs_" + str(obj_id) + ".pkl",
                     "w"), cPickle.HIGHEST_PROTOCOL)
            print "Feature object dictionary stored as 'train_feature_objs.pkl'\n"
            cPickle.dump(
                test_all_features_obj_dict,
                open("test_pkls/test_feature_objs_" + str(obj_id) + ".pkl",
                     "w"), cPickle.HIGHEST_PROTOCOL)
            print "Feature object dictionary stored as 'test_feature_objs.pkl'\n"
            cPickle.dump(
                ensemble_test_all_features_obj_dict,
                open(
                    "ensemble_pkls/ensemble_test_feature_objs_" + str(obj_id) +
                    ".pkl", "w"), cPickle.HIGHEST_PROTOCOL)
            print "Feature object dictionary stored as 'ensemble_test_feature_objs.pkl'\n"
        # Debugger breakpoint left in by the author.
        import pdb
        pdb.set_trace()
        pass
    else:
        # Load pkl'd feature object dictionaries
        all_feature_obj_dict = cPickle.load(open(bolt_feature_obj_pkl, "r"))
        # Disabled code kept as a string literal (evaluated and discarded).
        ''' train_all_features_obj_dict = cPickle.load(open(train_feature_pkl,"r")) test_all_features_obj_dict = cPickle.load(open(test_feature_pkl,"r")) ensemble_test_all_features_obj_dict = cPickle.load(open(ensemble_test_feature_pkl,"r")) '''
        print "Loaded data\n"
    # 1st split: pick out 5 objects for final testing
    # NOTE(review): all_feature_obj_dict is only assigned in the else
    # branch above; after the raw-data branch this line would raise
    # NameError -- confirm the intended flow.
    obj_leave_out = [101, 316, 702, 508, 601]
    five_test_feature_obj, all_train_feature_obj = PickOutObjects(
        all_feature_obj_dict, obj_leave_out)
    # 2nd split: pick 6 objects out for testing kNN/SVM classifiers and creating proba
    test_obj_leave_out = [315, 602, 115, 216, 213, 309]
    ensemble_train_feature_obj, train_feature_obj = PickOutObjects(
        all_train_feature_obj, test_obj_leave_out)
    # Specify feature to be extracted
    feature_name_list = [
        "pdc_rise_count", "pdc_area", "pdc_max", "pac_energy", "pac_sc",
        "pac_sv", "pac_ss", "pac_sk", "tac_area", "tdc_exp_fit", "gripper_min",
        "gripper_mean", "transform_distance", "electrode_polyfit"
    ]
    if all_classifiers_pkl == None or scaler_pkl == None:
        # Pull desired features from feature objects
        train_feature_vector_dict, train_adjective_dict = bolt_obj_2_feature_vector(
            train_feature_obj, feature_name_list)
        test_feature_vector_dict, test_adjective_dict = bolt_obj_2_feature_vector(
            ensemble_train_feature_obj, feature_name_list)
        print("Created feature vector containing %s\n" % feature_name_list)
        # Create Scalers
        scaler_dict = create_scalers(train_feature_vector_dict)
        # Store off scaler dictionary
        cPickle.dump(scaler_dict, open("scaler.pkl", "w"),
                     cPickle.HIGHEST_PROTOCOL)
        print "Feature vector scalers stored as 'scaler.pkl'\n"
        # Run full train
        #all_knn_classifiers, all_svm_classifiers = full_train(train_feature_vector_dict, train_adjective_dict, test_feature_vector_dict, test_adjective_dict, scaler_dict)
        # Debugger breakpoint left in by the author.
        import pdb
        pdb.set_trace()
        pass
        # Select which algorithm to use in the ensemble phase
        # NOTE(review): all_svm_classifiers is only produced by the
        # full_train call that is commented out above, so this assignment
        # would raise NameError as written.
        all_classifiers_dict = all_svm_classifiers
    else:
        # Load pkl'd classifiers, probabilities and scores
        all_classifiers_dict = cPickle.load(open(all_classifiers_pkl, "r"))
        # Load pkl'd scaler dictionary
        scaler_dict = cPickle.load(open(scaler_pkl, "r"))
        # Get test labels, to be used as ensemble train labels
        test_all_features_obj_dict = cPickle.load(open(test_feature_pkl, "r"))
        test_feature_vector_dict, test_adjective_dict = bolt_obj_2_feature_vector(
            test_all_features_obj_dict, feature_name_list)
    # Pull desired bolt features from ensemble test data
    # NOTE(review): five_test_feature_obj_dict is never assigned in this
    # function; the first split above produced five_test_feature_obj --
    # presumably that is the intended argument, confirm.
    ensemble_test_feature_vector_dict, ensemble_test_adjective_dict = bolt_obj_2_feature_vector(
        five_test_feature_obj_dict, feature_name_list)
    # Create ensemble feature vectors out of probabilities
    ensemble_train_feature_vector_dict, ensemble_test_feature_vector_dict = extract_ensemble_features(
        all_classifiers_dict, ensemble_test_feature_vector_dict,
        ensemble_test_adjective_dict, scaler_dict)
    # Ensemble train labels are previous test labels
    ensemble_train_adjective_dict = test_adjective_dict
    # Debugger breakpoint left in by the author.
    import pdb
    pdb.set_trace()
    # Print the sum of each adjective's label array.
    for adj in ensemble_train_adjective_dict:
        count = np.sum(ensemble_train_adjective_dict[adj])
        import pdb
        pdb.set_trace()
        print adj + ": %d " % count
    # Remove eleven adjectives (not only 'warm' and 'sparse') from both
    # label dictionaries. A missing key raises KeyError.
    del ensemble_train_adjective_dict['springy']
    del ensemble_test_adjective_dict['springy']
    del ensemble_train_adjective_dict['elastic']
    del ensemble_test_adjective_dict['elastic']
    del ensemble_train_adjective_dict['meshy']
    del ensemble_test_adjective_dict['meshy']
    del ensemble_train_adjective_dict['gritty']
    del ensemble_test_adjective_dict['gritty']
    del ensemble_train_adjective_dict['textured']
    del ensemble_test_adjective_dict['textured']
    del ensemble_train_adjective_dict['absorbant']
    del ensemble_test_adjective_dict['absorbant']
    del ensemble_train_adjective_dict['crinkly']
    del ensemble_test_adjective_dict['crinkly']
    del ensemble_train_adjective_dict['porous']
    del ensemble_test_adjective_dict['porous']
    del ensemble_train_adjective_dict['grainy']
    del ensemble_test_adjective_dict['grainy']
    del ensemble_train_adjective_dict['warm']
    del ensemble_test_adjective_dict['warm']
    del ensemble_train_adjective_dict['sparse']
    del ensemble_test_adjective_dict['sparse']
    # Combine motion-specific classifiers for each adjective
    all_ensemble_classifiers = full_ensemble_train(
        ensemble_train_feature_vector_dict, ensemble_train_adjective_dict,
        ensemble_test_feature_vector_dict, ensemble_test_adjective_dict)
    # Store off combined classifiers
    cPickle.dump(all_ensemble_classifiers,
                 open("all_ensemble_classifiers.pkl", "w"),
                 cPickle.HIGHEST_PROTOCOL)
# Demo script: print the folds produced by several scikit-learn
# cross-validation iterators on small toy inputs.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut

# --- K-fold -----------------------------------------------------------
# KFold was used here without ever being imported.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)
kf.get_n_splits(X)
print(kf)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("X_train:", X_train, "X_test:", X_test,
          "y_train:", y_train, "y_test:", y_test, "\n")

# --- Leave-one-out ----------------------------------------------------
X = [1, 2, 3, 4]
loo = LeaveOneOut()
for train, test in loo.split(X):
    print("%s%s" % (train, test))

# --- Leave-p-out ------------------------------------------------------
X = np.ones(4)
lpo = LeavePOut(p=1)
for train, test in lpo.split(X):
    print("%s%s" % (train, test))

# --- Leave-one-label(group)-out ---------------------------------------
# The original referenced the never-imported ``cross_validation`` module.
# LeaveOneGroupOut is the ``model_selection`` counterpart of
# ``cross_validation.LeaveOneLabelOut``; the groups are passed to
# ``split`` instead of the constructor.
labels = [1, 1, 2, 2, 3, 3]
lolo = LeaveOneGroupOut()
# split() needs an X with one entry per label; its values are irrelevant.
for train, test in lolo.split(np.zeros(len(labels)), groups=labels):
    print("%s%s" % (train, test))
def create_custom_cv(df):
    """Build a leave-one-label-out splitter from the ``chunk`` column of *df*.

    The distinct values of ``df['chunk']`` are passed to
    ``cross_validation.LeaveOneLabelOut``.

    NOTE(review): the splitter is only bound to a local name, so this
    function returns None as written; it also receives the unique chunk
    values rather than one label per row -- confirm both are intended.
    """
    chunk_values = df['chunk'].unique()
    splitter = cross_validation.LeaveOneLabelOut(chunk_values)
test_data[f] = 0.0 test_as_records = test_data[FEATURES].to_dict('records') Xt = dv.transform(test_as_records) Xt = Xt.todense() for key in VECTOR_FEATURES: Xt = np.concatenate([Xt, np.array(list(test_data[key]))], axis=1) Xt[np.isnan(Xt)] = 0.0 scaler = sklearn.preprocessing.MinMaxScaler() X = scaler.fit_transform(X) Xt = scaler.transform(Xt) predict_test("%s/%s" % (ARGS.output, basename(TEST_FILE)), X, Y, Xt, original_test, test_data) # time for actual machine learning # need to make sure we don't repeat sick id's across training/test sickIDs = np.array(list(set(data["pairIndex"]))) # randomize np.random.seed(RANDOM_SEED) np.random.shuffle(sickIDs) # count 'em off like assigning groups in class splitLookup = dict(izip(sickIDs, cycle(xrange(NUM_CROSS_VAL)))) # look up each group number to determine the fold skf = cross_validation.LeaveOneLabelOut( [splitLookup[sickID] for sickID in data['pairIndex']]) run_crossval("%s/%s" % (ARGS.output, basename(DATA_FILE)), X, Y, skf, original_data, data)
data_co = pd.read_csv("data_co.csv", delimiter=',') label = data_au['year'] random_forest_au = RFCV(data_au) random_forest_co = RFCV(data_co) """ if sys.argv[1] == "LOO": kf = cv.KFold(n = len(data_au), n_folds = len(data_au)) pred_au = random_forest_au.random_forest_cv(kf, 1000, 5) pred_co = random_forest_co.random_forest_cv(kf, 1000, 5) """ if sys.argv[1] == "LOL": lol = cv.LeaveOneLabelOut(label) print("=" * 10, "audience", "=" * 10) pred_au = random_forest_au.random_forest_cv(lol, 600, 6) print("=" * 10, "congestion rate", "=" * 10) pred_co = random_forest_co.random_forest_cv(lol, 500, 4) """ print("=" * 10 + "audience を回帰" + "=" * 10) print("RMSE : ", random_forest_au.root_mean_squared_error()) print("adj.r2 : ", random_forest_au.r2_adj_score()) print("=" * 10 + "congestion rate を回帰" + "=" * 10) print("RMSE : ", random_forest_co.root_mean_squared_error()) print("adj.r2 : ", random_forest_co.r2_adj_score()) #残差プロット