Example #1
def test_cross_val_generator_mask_indices_same():
    # Test that the cross validation generators return the same results when
    # indices=True and when indices=False
    y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2])
    labels = np.array([1, 1, 2, 3, 3, 3, 4])

    loo_mask = cval.LeaveOneOut(5, indices=False)
    loo_ind = cval.LeaveOneOut(5, indices=True)
    lpo_mask = cval.LeavePOut(10, 2, indices=False)
    lpo_ind = cval.LeavePOut(10, 2, indices=True)
    kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1)
    kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1)
    skf_mask = cval.StratifiedKFold(y, 3, indices=False)
    skf_ind = cval.StratifiedKFold(y, 3, indices=True)
    lolo_mask = cval.LeaveOneLabelOut(labels, indices=False)
    lolo_ind = cval.LeaveOneLabelOut(labels, indices=True)
    lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False)
    lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True)

    for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind),
                            (kf_mask, kf_ind), (skf_mask, skf_ind),
                            (lolo_mask, lolo_ind), (lopo_mask, lopo_ind)]:
        for (train_mask, test_mask), (train_ind, test_ind) in \
                zip(cv_mask, cv_ind):
            assert_array_equal(np.where(train_mask)[0], train_ind)
            assert_array_equal(np.where(test_mask)[0], test_ind)
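A minimal sketch (plain NumPy, nothing scikit-learn specific) of the mask/indices equivalence the test above asserts: np.where(mask)[0] converts a boolean mask into the integer index array that selects the same rows.

import numpy as np

# Both forms select rows 0 and 2.
mask = np.array([True, False, True, False])
indices = np.where(mask)[0]  # array([0, 2])
X = np.arange(8).reshape(4, 2)
assert np.array_equal(X[mask], X[indices])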
Example #2
def test_leave_label_out_changing_labels():
    # Check that LeaveOneLabelOut and LeavePLabelOut work normally if
    # the labels variable is changed before calling __iter__
    labels = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    labels_changing = np.array(labels, copy=True)
    lolo = cval.LeaveOneLabelOut(labels)
    lolo_changing = cval.LeaveOneLabelOut(labels_changing)
    lplo = cval.LeavePLabelOut(labels, p=2)
    lplo_changing = cval.LeavePLabelOut(labels_changing, p=2)
    labels_changing[:] = 0
    for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]:
        for (train, test), (train_chan, test_chan) in zip(llo, llo_changing):
            assert_array_equal(train, train_chan)
            assert_array_equal(test, test_chan)
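For comparison, a sketch of the replacement API in scikit-learn 0.18 and later: LeaveOneGroupOut receives the group array at split() time rather than storing it at construction, so the mutation question this test guards against does not arise.

import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

groups = np.array([0, 1, 2, 1, 1, 2, 0, 0])
X = np.zeros((len(groups), 1))  # placeholder features
for train, test in LeaveOneGroupOut().split(X, groups=groups):
    print(train, test)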
Example #3
def train_and_test_model(data,
                         response,
                         labels,
                         model_type,
                         split_by,
                         c,
                         impute=True,
                         varname=""):
    """ train and test model of users based on given response variable """
    model, type, model_string = models[model_type]
    if type == 'c':
        split = cross_validation.StratifiedShuffleSplit(response, 1, 0.2)
    else:
        #split = cross_validation.KFold(len(response), 5)
        #split = cross_validation.LeavePLabelOut(labels, 3)
        split = cross_validation.LeaveOneLabelOut(labels)
    predict = np.zeros(response.shape)
    for train, test in split:
        model.fit(data[train], response[train])
        predict[test] = model.predict(data[test])
        #print np.corrcoef(np.vstack((response[test], predict[test])))[0,1]
    plot_obs_pred(predict, response, "%s Model Performance" % model_string,
                  varname)
    model.fit(data, response)
    return model
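The fold loop above, which fills predict[test] split by split, is what cross_val_predict automates in the post-0.18 API; a hedged sketch of the equivalent call, assuming labels holds one group id per row of data:

from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict

# One out-of-fold prediction per sample, folds defined by the group labels.
predict = cross_val_predict(model, data, response,
                            groups=labels, cv=LeaveOneGroupOut())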
Example #4
def test_cross_val_generator_with_mask():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=False)
    lpo = cval.LeavePOut(4, 2, indices=False)
    kf = cval.KFold(4, 2, indices=False)
    skf = cval.StratifiedKFold(y, 2, indices=False)
    lolo = cval.LeaveOneLabelOut(labels, indices=False)
    lopo = cval.LeavePLabelOut(labels, 2, indices=False)
    ss = cval.ShuffleSplit(4, indices=False)
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss]:
        for train, test in cv:
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
Example #5
def test_cross_indices_exception():
    # check_cv must raise ValueError for mask-based (indices=False)
    # generators when X is sparse
    X = coo_matrix(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]))
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=False)
    lpo = cval.LeavePOut(4, 2, indices=False)
    kf = cval.KFold(4, 2, indices=False)
    skf = cval.StratifiedKFold(y, 2, indices=False)
    lolo = cval.LeaveOneLabelOut(labels, indices=False)
    lopo = cval.LeavePLabelOut(labels, 2, indices=False)

    assert_raises(ValueError, cval.check_cv, loo, X, y)
    assert_raises(ValueError, cval.check_cv, lpo, X, y)
    assert_raises(ValueError, cval.check_cv, kf, X, y)
    assert_raises(ValueError, cval.check_cv, skf, X, y)
    assert_raises(ValueError, cval.check_cv, lolo, X, y)
    assert_raises(ValueError, cval.check_cv, lopo, X, y)
Example #6
File: classify.py  Project: shrubaG/vangogh
def classify(data, labels, args):

    classification = {}

    # Read model
    with open(args.model, "rb") as f:
        model = pickle.load(f)
    print_verbose(
        "Model [%0.2f%%]: %s" %
        (model.best_score_ * 100, str(model.best_estimator_)), 4)

    # Classify each label
    lolo = cross_validation.LeaveOneLabelOut(labels)
    print_verbose("LeaveOneOut: %s" % str(lolo), 5)

    for train_index, test_index in lolo:
        print_verbose("Test index: %s" % str(test_index), 5)
        print_verbose("Classifying label: %s" % str(labels[test_index[0]]), 4)

        # Classify
        if args.aggregation == 'mode':
            pred = model.predict(data[test_index])
        else:
            pred = model.decision_function(data[test_index])
        print_verbose("Patch prediction: %s" % str(pred), 4)

        # Aggregate
        if args.aggregation == 'mode':
            res = agg_pred_mode(pred)
        elif args.aggregation == 'sum':
            res = agg_pred_dist_sumall(pred, model.best_estimator_.classes_)
        elif args.aggregation == 'far':
            res = agg_pred_dist_far(pred, model.best_estimator_.classes_)
        elif args.aggregation == 'mean':
            res = agg_pred_dist_meangroup(pred, model.best_estimator_.classes_)
        elif args.aggregation == 'median':
            res = agg_pred_dist_mediangroup(pred,
                                            model.best_estimator_.classes_)
        print_verbose("Aggregate result: %s" % str(res), 4)

        # Append to final result
        classification[labels[test_index[0]]] = res
        print_verbose("Classification: %s" % str(classification), 5)

    return classification
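agg_pred_mode and the other agg_pred_* helpers are project code not shown here; purely for illustration, a hypothetical majority-vote aggregation over patch predictions might look like this (not the project's implementation):

import numpy as np

def majority_vote(pred):
    # Most frequent patch-level prediction wins.
    values, counts = np.unique(np.asarray(pred), return_counts=True)
    return values[np.argmax(counts)]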
Example #7
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    ss = cval.ShuffleSplit(2)
    ps = cval.PredefinedSplit([1, 1, 2, 2])
    for cv in [loo, lpo, kf, skf, lolo, lopo, ss, ps]:
        for train, test in cv:
            # The default representation must be integer indices, not boolean masks
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            # indexing with the yielded arrays must work
            X[train], X[test]
            y[train], y[test]
Example #8
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
Example #9
def forward_selection_lodo(model, features, df, scoring_metric, ref_column, days_tr, n_feat, factor, cutoff):
    #initialize the list of selected features (starts empty here; seed it
    #with base features to force their inclusion)
    best_features = []
    score_cv = []
    RMSE = []
    while len(features) > 0 and len(best_features) < n_feat:
        next_feature, score_cv_feat = forward_selection_step(model, best_features, features, df, ref_column, scoring_metric, days_tr, factor, cutoff)
        #add the next feature to the list
        best_features += [next_feature]
        MSE_feat = -np.mean(cross_val_score(model, df[best_features].values, df[ref_column].values,
            cv = cross_validation.LeaveOneLabelOut(days_tr), scoring = 'mean_squared_error'))
        RMSE_features = round(np.sqrt(MSE_feat), 1)
        score_cv.append(score_cv_feat)
        RMSE.append(RMSE_features)
        print 'Next best Feature: ', next_feature, ',', 'Score: ', score_cv_feat, 'RMSE: ', RMSE_features, "#:", len(best_features)
        #remove the added feature from the list
        features.remove(next_feature)
    print "Best Features: ", best_features
    return best_features, score_cv, RMSE
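A sketch of the same cross_val_score call against the post-0.18 API, assuming days_tr holds one day label per row; note the scoring string also changed, to 'neg_mean_squared_error':

import numpy as np
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

mse_feat = -np.mean(cross_val_score(
    model, df[best_features].values, df[ref_column].values,
    groups=days_tr, cv=LeaveOneGroupOut(),
    scoring='neg_mean_squared_error'))
rmse_features = round(np.sqrt(mse_feat), 1)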
Example #10
def show_cross_val(method):
    # X and y are assumed to be defined at module scope
    if method == "lolo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeaveOneLabelOut(labels)
    elif method == "lplo":
        labels = np.array(["summer", "winter", "summer", "winter", "spring"])
        cv = cross_validation.LeavePLabelOut(labels, p=2)
    elif method == "loo":
        cv = cross_validation.LeaveOneOut(n=len(y))
    elif method == "lpo":
        cv = cross_validation.LeavePOut(n=len(y), p=3)
    for train_index, test_index in cv:
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print "X_train:", X_train
        print "y_train:", y_train
        print "X_test:", X_test
        print "y_test:", y_test
Example #11
	def multi_regr_lol(self, delete):
		# 'label' is assumed to be defined in the enclosing scope
		lol = cv.LeaveOneLabelOut(label)

		year = 2011
		counter = 0
		for train_index, test_index in lol:
			data = self.data[:]
			for remove_element in delete[counter]:
				del data[remove_element]

			model = self.model(data.ix[train_index, 0], data.ix[train_index, 1:]).fit()
			self.y_test_pred = pd.concat([self.y_test_pred, \
				pd.DataFrame(model.predict(data.ix[test_index, 1:]))], ignore_index = True)

			predata_r2_adj = r2_adj_score(data.ix[train_index,0], \
				pd.DataFrame(model.predict()), len(data.columns[1:]))
			predata_rmse = root_mean_squared_error(data.ix[train_index, 0], \
				pd.DataFrame(model.predict()))
			rmse = root_mean_squared_error(data.ix[test_index, 0], self.y_test_pred.ix[test_index])
			r2_adj = r2_adj_score(data.ix[test_index, 0], self.y_test_pred.ix[test_index], \
				len(data.columns[1:]))
			print(model.summary())
			print(year+counter*2,"年:", r2_adj, rmse, )#end = "\n\n\n")
			print(year+counter*2,"年の訓練データへのあてはまり:", predata_r2_adj, predata_rmse, end = "\n\n\n")

			#plot the three variables with the largest t-values
			#print(model.tvalues)
			tvalues = pd.DataFrame(model.tvalues, columns=['t_value'])
			tvalues = tvalues.sort_values(by='t_value', ascending=False)
			#print(tvalues, type(tvalues))
			tvalue = tvalues[:3]
			print(tvalue)
			sns.set(font='IPAPGothic')
			sns.barplot(x=tvalue.index, y='t_value', data=tvalue, palette='autumn')
			sns.plt.ylabel("t 値", fontsize=20)
			sns.plt.xlabel("")
			sns.plt.tick_params(labelsize=18)
			sns.plt.savefig(data.columns[0]+'_'+str(year+counter*2)+'.pdf')
			sns.plt.show()

			counter += 1
Example #12
def main(input_file, adjective_file, train_feature_pkl, test_feature_pkl,
         ensemble_test_feature_pkl, all_classifiers_pkl, scaler_pkl,
         bolt_feature_obj_pkl):

    # Load data into the pipeline. First check
    # for feature object pkl files
    print "Loading data from file\n"
    # if train_feature_pkl == None or test_feature_pkl == None or ensemble_test_feature_pkl == None:
    if bolt_feature_obj_pkl is None:
        # If no features, load data from either an
        # h5 and adjective file or directly from
        # a saved pkl file
        if input_file.endswith(".h5"):
            all_data = loadDataFromH5File(input_file, adjective_file)
        else:
            all_data = utilities.loadBoltObjFile(input_file)

        print "Loaded data\n"
        """ 
        # Remove the duplicated MDF_320, and save a new all_data.pkl
        all_data_new = dict()
        toremove = [290, 291, 292, 293, 294, 295, 296, 297, 298, 299]
        for motion_name in all_data:
            all_data_new[motion_name] = np.delete(all_data[motion_name], toremove)

        cPickle.dump(all_data_new, open("all_data.pkl", "w"), cPickle.HIGHEST_PROTOCOL)

        import pdb; pdb.set_trace()
        pass
        """

        # Split the data by leaving one object out as ensemble_test_data for each time and cycle through all objects

        # Generate the stratification labels for picking out an object
        obj_id_vector = []
        for num in np.arange(len(all_data['tap'])):
            obj_id_vector.append(all_data['tap'][num].object_id)

        lol = cross_validation.LeaveOneLabelOut(np.array(obj_id_vector))
        obj_id_list = np.unique(obj_id_vector).tolist()
        # We may pickle this cross validation generator "lol" later

        train_set = dict()
        ensemble_test_set = dict()
        for train_index, test_index in lol:
            print "TRAIN_INDEX: %s  TEST_INDEX: %s" % (train_index, test_index)
            train_data = dict()
            ensemble_test_data = dict()
            for motion_name in all_data:
                train_data_array = np.array(all_data[motion_name])[train_index]
                ensemble_test_data_array = np.array(
                    all_data[motion_name])[test_index]
                obj_id = ensemble_test_data_array[0].object_id

                train_data[motion_name] = train_data_array.tolist()
                ensemble_test_data[
                    motion_name] = ensemble_test_data_array.tolist()

            train_set[obj_id] = train_data
            ensemble_test_set[obj_id] = ensemble_test_data

            #cPickle.dump(train_data, open("train_data_"+str(obj_id)+".pkl", "w"), cPickle.HIGHEST_PROTOCOL)
            #cPickle.dump(ensemble_test_data,open("ensemble_test_data_"+%(obj_id)+".pkl","w"), cPickle.HIGHEST_PROTOCOL)

        #cPickle.dump(train_set, open("train_set.pkl", "w"), cPickle.HIGHEST_PROTOCOL))
        #cPickle.dump(ensemble_test_set, open("ensemble_test_set.pkl"), "w", cPickle.HIGHEST_PROTOCOL))

        # Split the data into train and final test
        # train_data, ensemble_test_data = utilities.split_data(all_data, 0.9)

        for obj_id in train_set:
            # Split the train data again into train and test
            train_data, test_data = utilities.split_data(
                train_set[obj_id], 0.7)

            # Fit PCA for electrodes on training data
            print "Fitting PCA for electrode data\n"
            electrode_pca_dict = fit_electrodes_pca(train_data)

            # Store off PCA pkl
            cPickle.dump(electrode_pca_dict,
                         open("pca_pkls/pca_" + str(obj_id) + ".pkl", "w"),
                         cPickle.HIGHEST_PROTOCOL)
            print "PCA transforms stored as 'pca.pkl'\n"

            # Convert motion objects into feature objects
            print "Generating feature object dictionaries\n"
            train_all_features_obj_dict = BoltMotionObjToFeatureObj(
                train_data, electrode_pca_dict)
            test_all_features_obj_dict = BoltMotionObjToFeatureObj(
                test_data, electrode_pca_dict)
            ensemble_test_all_features_obj_dict = BoltMotionObjToFeatureObj(
                ensemble_test_data, electrode_pca_dict)

            # Store off feature object pkls
            cPickle.dump(
                train_all_features_obj_dict,
                open("train_pkls/train_feature_objs_" + str(obj_id) + ".pkl",
                     "w"), cPickle.HIGHEST_PROTOCOL)
            print "Feature object dictionary stored as 'train_feature_objs.pkl'\n"
            cPickle.dump(
                test_all_features_obj_dict,
                open("test_pkls/test_feature_objs_" + str(obj_id) + ".pkl",
                     "w"), cPickle.HIGHEST_PROTOCOL)
            print "Feature object dictionary stored as 'test_feature_objs.pkl'\n"
            cPickle.dump(
                ensemble_test_all_features_obj_dict,
                open(
                    "ensemble_pkls/ensemble_test_feature_objs_" + str(obj_id) +
                    ".pkl", "w"), cPickle.HIGHEST_PROTOCOL)
            print "Feature object dictionary stored as 'ensemble_test_feature_objs.pkl'\n"

        import pdb
        pdb.set_trace()
        pass

    else:
        # Load pkl'd feature object dictionaries
        all_feature_obj_dict = cPickle.load(open(bolt_feature_obj_pkl, "r"))
        '''
        train_all_features_obj_dict = cPickle.load(open(train_feature_pkl,"r"))
        test_all_features_obj_dict = cPickle.load(open(test_feature_pkl,"r"))
        ensemble_test_all_features_obj_dict = cPickle.load(open(ensemble_test_feature_pkl,"r"))
        '''

        print "Loaded data\n"

    # 1st split: pick out 5 objects for final testing
    obj_leave_out = [101, 316, 702, 508, 601]
    five_test_feature_obj, all_train_feature_obj = PickOutObjects(
        all_feature_obj_dict, obj_leave_out)

    # 2nd split: pick 6 objects out for testing kNN/SVM classifiers and creating proba
    test_obj_leave_out = [315, 602, 115, 216, 213, 309]
    ensemble_train_feature_obj, train_feature_obj = PickOutObjects(
        all_train_feature_obj, test_obj_leave_out)

    # Specify feature to be extracted
    feature_name_list = [
        "pdc_rise_count", "pdc_area", "pdc_max", "pac_energy", "pac_sc",
        "pac_sv", "pac_ss", "pac_sk", "tac_area", "tdc_exp_fit", "gripper_min",
        "gripper_mean", "transform_distance", "electrode_polyfit"
    ]

    if all_classifiers_pkl is None or scaler_pkl is None:

        # Pull desired features from feature objects
        train_feature_vector_dict, train_adjective_dict = bolt_obj_2_feature_vector(
            train_feature_obj, feature_name_list)
        test_feature_vector_dict, test_adjective_dict = bolt_obj_2_feature_vector(
            ensemble_train_feature_obj, feature_name_list)
        print("Created feature vector containing %s\n" % feature_name_list)

        # Create Scalers
        scaler_dict = create_scalers(train_feature_vector_dict)

        # Store off scaler dictionary
        cPickle.dump(scaler_dict, open("scaler.pkl", "w"),
                     cPickle.HIGHEST_PROTOCOL)
        print "Feature vector scalers stored as 'scaler.pkl'\n"

        # Run full train
        #all_knn_classifiers,
        all_svm_classifiers = full_train(train_feature_vector_dict,
                                         train_adjective_dict,
                                         test_feature_vector_dict,
                                         test_adjective_dict, scaler_dict)

        import pdb
        pdb.set_trace()
        pass

        # Select which algorithm to use in the ensemble phase
        all_classifiers_dict = all_svm_classifiers

    else:
        # Load pkl'd classifiers, probabilities and scores
        all_classifiers_dict = cPickle.load(open(all_classifiers_pkl, "r"))

        # Load pkl'd scaler dictionary
        scaler_dict = cPickle.load(open(scaler_pkl, "r"))

        # Get test labels, to be used as ensemble train labels
        test_all_features_obj_dict = cPickle.load(open(test_feature_pkl, "r"))
        test_feature_vector_dict, test_adjective_dict = bolt_obj_2_feature_vector(
            test_all_features_obj_dict, feature_name_list)

    # Pull desired bolt features from ensemble test data
    ensemble_test_feature_vector_dict, ensemble_test_adjective_dict = bolt_obj_2_feature_vector(
        five_test_feature_obj, feature_name_list)

    # Create ensemble feature vectors out of probabilities
    ensemble_train_feature_vector_dict, ensemble_test_feature_vector_dict = extract_ensemble_features(
        all_classifiers_dict, ensemble_test_feature_vector_dict,
        ensemble_test_adjective_dict, scaler_dict)

    # Ensemble train labels are previous test labels
    ensemble_train_adjective_dict = test_adjective_dict

    import pdb
    pdb.set_trace()
    for adj in ensemble_train_adjective_dict:
        count = np.sum(ensemble_train_adjective_dict[adj])
        import pdb
        pdb.set_trace()
        print adj + ":  %d " % count

    # Remove sparsely-represented adjectives from the labels dictionaries
    del ensemble_train_adjective_dict['springy']
    del ensemble_test_adjective_dict['springy']
    del ensemble_train_adjective_dict['elastic']
    del ensemble_test_adjective_dict['elastic']
    del ensemble_train_adjective_dict['meshy']
    del ensemble_test_adjective_dict['meshy']
    del ensemble_train_adjective_dict['gritty']
    del ensemble_test_adjective_dict['gritty']
    del ensemble_train_adjective_dict['textured']
    del ensemble_test_adjective_dict['textured']
    del ensemble_train_adjective_dict['absorbant']
    del ensemble_test_adjective_dict['absorbant']
    del ensemble_train_adjective_dict['crinkly']
    del ensemble_test_adjective_dict['crinkly']
    del ensemble_train_adjective_dict['porous']
    del ensemble_test_adjective_dict['porous']
    del ensemble_train_adjective_dict['grainy']
    del ensemble_test_adjective_dict['grainy']

    del ensemble_train_adjective_dict['warm']
    del ensemble_test_adjective_dict['warm']
    del ensemble_train_adjective_dict['sparse']
    del ensemble_test_adjective_dict['sparse']

    # Combine motion-specific classifiers for each adjective
    all_ensemble_classifiers = full_ensemble_train(
        ensemble_train_feature_vector_dict, ensemble_train_adjective_dict,
        ensemble_test_feature_vector_dict, ensemble_test_adjective_dict)

    # Store off combined classifiers
    cPickle.dump(all_ensemble_classifiers,
                 open("all_ensemble_classifiers.pkl", "w"),
                 cPickle.HIGHEST_PROTOCOL)
Example #13
File: loocv.py  Project: Fdslk/Lab
import numpy as np
from sklearn import cross_validation  # legacy API, used for LeaveOneLabelOut below
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)
kf.get_n_splits(X)
print(kf)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("X_train:", X_train, "X_test:", X_test, "y_train:", y_train,
          "y_test:", y_test, "\n")

X = [1, 2, 3, 4]
loo = LeaveOneOut()
for train, test in loo.split(X):
    print("%s%s" % (train, test))

X = np.ones(4)
lpo = LeavePOut(p=1)  # with p=1 this behaves like LeaveOneOut
for train, test in lpo.split(X):
    print("%s%s" % (train, test))

labels = [1, 1, 2, 2, 3, 3]
lolo = cross_validation.LeaveOneLabelOut(labels)
for train, test in lolo:
    print("%s%s" % (train, test))
Example #14
def create_custom_cv(df):
    # Pass the per-row chunk labels, not their unique values, so that
    # each distinct chunk is held out once
    labels = df['chunk'].values
    return cross_validation.LeaveOneLabelOut(labels)
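A short usage sketch for the fixed helper, on a hypothetical toy DataFrame with a 'chunk' column:

import pandas as pd

df = pd.DataFrame({'chunk': [1, 1, 2, 2, 3, 3], 'x': range(6)})
cv = create_custom_cv(df)
for train_idx, test_idx in cv:
    print(train_idx, test_idx)  # each chunk is held out once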
Example #15
            test_data[f] = 0.0
    test_as_records = test_data[FEATURES].to_dict('records')
    Xt = dv.transform(test_as_records)
    Xt = Xt.todense()
    for key in VECTOR_FEATURES:
        Xt = np.concatenate([Xt, np.array(list(test_data[key]))], axis=1)
    Xt[np.isnan(Xt)] = 0.0

    scaler = sklearn.preprocessing.MinMaxScaler()
    X = scaler.fit_transform(X)
    Xt = scaler.transform(Xt)

    predict_test("%s/%s" % (ARGS.output, basename(TEST_FILE)), X, Y, Xt,
                 original_test, test_data)

    # time for actual machine learning

    # need to make sure we don't repeat sick id's across training/test
    sickIDs = np.array(list(set(data["pairIndex"])))
    # randomize
    np.random.seed(RANDOM_SEED)
    np.random.shuffle(sickIDs)
    # count 'em off like assigning groups in class
    splitLookup = dict(izip(sickIDs, cycle(xrange(NUM_CROSS_VAL))))
    # look up each group number to determine the fold
    lolo = cross_validation.LeaveOneLabelOut(
        [splitLookup[sickID] for sickID in data['pairIndex']])

    run_crossval("%s/%s" % (ARGS.output, basename(DATA_FILE)), X, Y, lolo,
                 original_data, data)
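The splitLookup construction above is essentially a group-wise K-fold: every row sharing a pairIndex lands in the same fold. A hedged sketch of the closest modern equivalent (GroupKFold does not shuffle the ids, so the exact fold assignment will differ):

from sklearn.model_selection import GroupKFold

gkf = GroupKFold(n_splits=NUM_CROSS_VAL)
for train_idx, test_idx in gkf.split(X, Y, groups=data['pairIndex']):
    pass  # train/evaluate per fold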
Example #16
    data_co = pd.read_csv("data_co.csv", delimiter=',')

    label = data_au['year']

    random_forest_au = RFCV(data_au)
    random_forest_co = RFCV(data_co)
    """
	if sys.argv[1] == "LOO":
		kf = cv.KFold(n = len(data_au), n_folds = len(data_au))

		pred_au = random_forest_au.random_forest_cv(kf, 1000, 5)
		pred_co = random_forest_co.random_forest_cv(kf, 1000, 5)
	"""

    if sys.argv[1] == "LOL":
        lol = cv.LeaveOneLabelOut(label)

        print("=" * 10, "audience", "=" * 10)
        pred_au = random_forest_au.random_forest_cv(lol, 600, 6)
        print("=" * 10, "congestion rate", "=" * 10)
        pred_co = random_forest_co.random_forest_cv(lol, 500, 4)
    """
	print("=" * 10 + "audience を回帰" + "=" * 10)
	print("RMSE : ", random_forest_au.root_mean_squared_error())
	print("adj.r2 : ", random_forest_au.r2_adj_score())

	print("=" * 10 + "congestion rate を回帰" + "=" * 10)
	print("RMSE : ", random_forest_co.root_mean_squared_error())
	print("adj.r2 : ", random_forest_co.r2_adj_score())

	#残差プロット