Example #1
def plot_label_kfold():
    from sklearn.cross_validation import LabelKFold
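    # assumes module-level imports: numpy as np, matplotlib.pyplot as plt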
    labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

    plt.figure(figsize=(10, 2))
    plt.title("LabelKFold")

    axes = plt.gca()
    axes.set_frame_on(False)

    n_samples = 12
    n_iter = 3
    n_samples_per_fold = 1

    cv = LabelKFold(n_folds=3, labels=labels)
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(cv):  # legacy cv objects iterate directly; they have no .split()
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_samples):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # not selected has no hatch

        boxes = axes.barh(bottom=range(n_iter),
                          width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold,
                          height=.6,
                          color=colors,
                          hatch="//")
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    axes.barh(bottom=[n_iter] * n_samples,
              width=[1 - 0.1] * n_samples,
              left=np.arange(n_samples) * n_samples_per_fold,
              height=.6,
              color="w")

    for i in range(n_samples):
        axes.text((i + .5) * n_samples_per_fold,
                  3.5,
                  "%d" % labels[i],
                  horizontalalignment="center")
    #ax.set_ylim(4, -0.1)

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter + 1) + .3)
    axes.set_yticklabels(["Split %d" % x
                          for x in range(1, n_iter + 1)] + ["labels"])
    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3))
    plt.tight_layout()
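
Note: sklearn.cross_validation.LabelKFold, used throughout these examples, was deprecated in scikit-learn 0.18 and removed in 0.20; its successor is GroupKFold in sklearn.model_selection, which takes the group array in split() rather than in the constructor. A minimal sketch of the equivalent splits, assuming scikit-learn >= 0.18:

import numpy as np
from sklearn.model_selection import GroupKFold

groups = np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3])
X = np.arange(len(groups)).reshape(-1, 1)  # dummy feature matrix

cv = GroupKFold(n_splits=3)
for train, test in cv.split(X, groups=groups):
    # no group ever appears on both sides of a split
    assert set(groups[train]).isdisjoint(groups[test])
    print("test groups:", np.unique(groups[test]))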
Example #2
def validate_model(model_checkpoint='model.yaml',
                   weights_checkpoint='NNweights_5.h5',
                   useZCA=True,
                   Folds=10):
    with open(model_checkpoint, 'r') as model_stream:
        test_model = model_from_yaml(model_stream.read())
    test_model.load_weights(weights_checkpoint)

    # Load and preprocess test set
    x_test, y_test, identities = load_data_with_identity(True)
    x_test = x_test.reshape(x_test.shape[0], 1, 32, 32)
    x_test = preprocess_images(x_test)
    lkf = LabelKFold(identities, n_folds=Folds)

    if useZCA:
        ZCAMatrix = np.load('ZCAMatrix.npy')
        x_test = np.dot(
            x_test.reshape(x_test.shape[0], x_test.shape[1] * x_test.shape[2] *
                           x_test.shape[3]), ZCAMatrix)
        x_test = x_test.reshape(x_test.shape[0], 1, 32, 32)
        print "Processed test input with ZCAMatrix"

    print "Finished loading test model"

    predictions = test_model.predict_classes(x_test)
    return predictions
Example #3
def get_cv_indices(n_rows, k_folds, seed, substitutions, cv_grouping_cols=None):
    """
    Get cv indices from sklearn

    ENIGH raw data has several expense types per household, and we don't
    want the same household split across several cv folds, so use
    LabelCVFold

    :param string or list of strings cv_grouping_cols The feature(s) to use to
     define groupings in cv folds. We don't want samples within these groups to
     appear in separate folds (because that would give the model extra
     information about test).
    :param int n_rows The number of rows total across which we need to divide
     into folds.
    :param int k_folds The number of cross-validation folds
    :param int seed The random state to use
    :param dict substitutions. The dictionary containing schema and
     table names to substitute into the parameterized SQL.
    """
    if cv_grouping_cols is not None:
        labels = pg_sed.get_original_features(
            cv_grouping_cols,
            substitutions["semantic_schema"],
            substitutions["subset_table"]
        )
        labels = labels.sort_values(by=cv_grouping_cols)
        labels = labels.ix[:, 0].values
        return LabelKFold(labels, k_folds)
    else:
        return KFold(n_rows, k_folds, shuffle=True,
                     random_state=seed)
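
As a sanity check on the grouping logic above, here is a small sketch against the same legacy LabelKFold API, verifying that no grouping label ends up in both train and test; the toy labels array is a stand-in for the household ids pulled from the database:

import numpy as np
from sklearn.cross_validation import LabelKFold  # legacy, pre-0.18 API

labels = np.array([10, 10, 10, 11, 11, 12, 12, 12, 13, 13])  # toy household ids
for train, test in LabelKFold(labels, n_folds=3):
    assert set(labels[train]).isdisjoint(labels[test])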
Example #4
    def SVC_decode_full(X, Y, n_times, epochs, indexcountlist):

        clf = SVC(C=1, kernel='linear')
        # Define a monte-carlo cross-validation generator (reduce variance):
        #cv = ShuffleSplit(len(X), 10, test_size=0.2,random_state=0)
        #cv = LeaveOneLabelOut(labels = np.concatenate(indexcountlist))
        #cv = LabelShuffleSplit(np.concatenate(labels_percond))
        cv = LabelKFold(np.concatenate(indexcountlist), n_folds=5)

        cm_return = np.ones((Y[-1] + 1, Y[-1] + 1)) * (1.0 / (Y[-1] + 1))  # uniform chance level per class
        Xa = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
        # Standardize features
        Xa -= Xa.mean(axis=0)
        #Xa /= Xa.std(axis=0)
        # Run cross-validation
        # Note : for sklearn the Xt matrix should be 2d (n_samples x n_features)
        preds = np.empty(len(Y))
        for train, test in cv:
            clf.fit(Xa[train], Y[train])
            preds[test] = clf.predict(Xa[test])

        # Normalized confusion matrix
        cm = confusion_matrix(Y, preds)
        cm_normalized = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
        cm_return[:, :] = cm_normalized

        print 'done'

        return cm_return
Example #5
    def SVC_decode(X, Y, n_times, epochs, labels_percond):

        Y = Y.astype(int)
        labels_percond = np.concatenate(labels_percond).astype(int)
        clf = SVC(C=1, kernel='linear')
        # Define a monte-carlo cross-validation generator (reduce variance):
        #cv = ShuffleSplit(len(X), 10, test_size=0.2,random_state=0)
        #cv = StratifiedKFold(Y, n_folds=10, shuffle=True, random_state=0)
        #cv = LeaveOneLabelOut(labels = np.concatenate(indexcountlist))
        #cv = LabelShuffleSplit(np.concatenate(labels_percond),n_iter=10)
        cv = LabelKFold(labels_percond, n_folds=5)

        cm_return = np.ones((Y[-1] + 1, Y[-1] + 1, n_times)) * (1.0 / (Y[-1] + 1))  # uniform chance level per class
        for t in range(n_times):
            Xt = X[:, :, t]

            # Standardize features
            Xt -= Xt.mean(axis=0)
            #Xt /= Xt.std(axis=0)
            # Run cross-validation
            # Note : for sklearn the Xt matrix should be 2d (n_samples x n_features)
            preds = np.empty(len(Y))
            for train, test in cv:
                clf.fit(Xt[train], Y[train])
                preds[test] = clf.predict(Xt[test])

            # Normalized confusion matrix
            cm = confusion_matrix(Y, preds)
            cm_normalized = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
            cm_return[:, :, t] = cm_normalized

        print 'done'

        return cm_return
Example #6
def create_folds(X_train, people, n_folds):
    """ Creates n_folds folds from the train data for cross-validation purposes.
    """

    # Select people ids for each fold
    labels = people['people_id'].unique()
    label_kfold = LabelKFold(labels, n_folds=n_folds)
    cv_folds = []
    for train_index, cv_index in label_kfold:
        cv_folds.append([
            people.ix[train_index, 'people_id'], people.ix[cv_index,
                                                           'people_id']
        ])

    # Identify indices for people_ids in folds
    train_indices = []
    cv_indices = []
    for fold in cv_folds:
        fold[0] = fold[0].astype(np.int64)
        fold[1] = fold[1].astype(np.int64)
        train_fold = X_train[X_train['people_id'].isin(fold[0])].index
        cv_fold = X_train[X_train['people_id'].isin(fold[1])].index
        train_indices.append(train_fold)
        cv_indices.append(cv_fold)

    return cv_folds, train_indices, cv_indices
Example #7
def train_regression(iso_results):
    # themodel = sm.GLM(iso_results.freqdata, iso_results.feats, family=sm.families.Binomial())
    # themodel_results = themodel.fit()
    # the method used by glmnet and glmpath is to use the lambdas at a fixed percentage of the maximum lambda
    # which is a super clever way to do this
    fractions = numpy.linspace(0, 1, 100)
    gene_buckets = {
        x: ii
        for (ii, x) in enumerate(set([x[1] for x in iso_results.tbl]))
    }
    bucket_of = numpy.array([gene_buckets[x[1]] for x in iso_results.tbl])
    from sklearn.cross_validation import LabelKFold
    NFOLDS = 10
    mycv = LabelKFold(bucket_of, n_folds=NFOLDS)
    errs = numpy.zeros((NFOLDS, len(fractions)))
    for vv, (trainidxn, testidxn) in enumerate(mycv):
        fit = ro.r.glmpath(iso_results.feats[trainidxn, :],
                           iso_results.freqdata[trainidxn],
                           family="binomial")
        predictions = numpy.asarray(
            ro.r("predict.glmpath")(fit,
                                    iso_results.feats[testidxn],
                                    s=ro.r.seq(0, 1, length=100),
                                    type="response",
                                    mode='norm.fraction',
                                    standardize=False))
        errs[vv, :] = numpy.sum(
            (predictions - iso_results.freqdata[testidxn, numpy.newaxis])**2,
            axis=0)
    fit2 = ro.r.glmpath(iso_results.feats,
                        iso_results.freqdata,
                        family="binomial")
    mytau = numpy.argmin(numpy.mean(errs, axis=0))
    # return themodel_results
    return REGRESSION(fit2, mytau)
Example #8
    def fit(self, drop_features=[], segments=["adopted", "sporadic", "low"]):
        """

        :param drop_features: list, which features to drop before fit
        :param segments: which user segments to consider while fitting
        :return:
        """
        if not self.transformed:
            self.transform_features()
        user_id, class_labels, features = self._prep_for_fit(
            drop_features, segments=segments)
        # this is a user-based model, therefore we want to avoid including the same user in train and test
        cv_strat = LabelKFold(user_id, n_folds=self.cv_params["folds"])
        # RandomizedSearchCV is vastly faster than GridSearchCV with a tolerable loss of optimization
        # NB: RandomForest doesn't generally require heavy param optimization; this is somewhat for posterity here
        random_search = RandomizedSearchCV(self.clf,
                                           param_distributions=self.param_grid,
                                           n_iter=self.n_iter,
                                           cv=cv_strat,
                                           scoring=self.cv_params["scorer"],
                                           n_jobs=self.n_jobs)

        print("running random param search on {} ".format(
            self.clf.__class__.__name__))
        random_search.fit(features, class_labels)
        self._handle_result(random_search, list(features.columns))
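
Under the post-0.18 API the same leakage guard is expressed by passing GroupKFold as cv and handing the user ids to fit() through its groups argument. A hedged sketch; the estimator and parameter grid below are placeholders rather than this class's actual self.clf and self.param_grid:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, RandomizedSearchCV

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = rng.randint(0, 2, 100)
user_id = rng.randint(0, 20, 100)  # placeholder user ids

search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                            param_distributions={"n_estimators": [50, 100, 200]},
                            n_iter=3,
                            cv=GroupKFold(n_splits=5))
search.fit(X, y, groups=user_id)  # groups travel with fit(), not with the cv object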
Example #9
def main():
	#model trained on all train data
	X = train_csv_DF.as_matrix(columns=features)
	y = train_csv_DF.as_matrix(columns=['outcome']).ravel()
	test = test_csv_DF.as_matrix(columns=features)
	
	print 'starting fitting random forest'
	
	rf = RandomForestClassifier(n_estimators=10, n_jobs=-1)
	rf.fit(X, y)
	
	# 10-Fold Cross validation
	results = []
	kf = LabelKFold(train_csv_DF['people_id'], n_folds=10)
	i = 0
	for train_index, test_index in kf:
		i+=1
		print "folded: " + str(i)
		X_train, X_test = X[train_index], X[test_index]
		y_train, y_test = y[train_index], y[test_index]
		rf.fit(X_train, y_train)
		predicts =  rf.predict_proba(X_test)[:, 1]
		results.append(roc_auc_score(y_test,predicts))
        
	print results
	print np.mean(results)
	
	rf.fit(X, y)  # refit on all training data before predicting the test set
	rf_predictions = rf.predict_proba(test)[:, 1]
	writePreditctionToCSV(rf_predictions)
Example #10
def svm_validation(images, labels, ids):

    lkf = LabelKFold(ids, n_folds=2)

    C = 1.0  # SVM regularization parameter

    svc = svm.SVC(kernel='linear', C=C) 
    rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C)
    poly_svc = svm.SVC(kernel='poly', degree=3, C=C)
    lin_svc = svm.LinearSVC(C=C)

    model_scores = []
    for model in [svc, rbf_svc, poly_svc, lin_svc]:
        model_scores.append(cross_validation.cross_val_score(model, images, labels, cv=lkf).max())

    return model_scores
Example #11
def get_annotation_folds(annotations, n_folds):
    """
    Return (train_indices, test_indices) folds iterator.
    It is guaranteed forms from the same website can't be both in
    train and test parts.

    We must be careful when splitting the dataset into training and
    evaluation parts: forms from the same domain should be in the same
    "bin". There could be several pages from the same domain, and these
    pages may have duplicate or similar forms (e.g. a search form on each
    page). If we put one such form in training dataset and another in
    evaluation dataset then the metrics will be too optimistic, and they
    can lead us to choose the wrong features/models. For example,
    train_test_split from scikit-learn shouldn't be used here. To avoid
    this, LabelKFold from scikit-learn is used.
    """
    return LabelKFold(labels=[get_domain(ann.url) for ann in annotations],
                      n_folds=n_folds)
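
For context, the returned folds iterate directly as (train_indices, test_indices) pairs. A minimal, self-contained sketch of consuming them; the Annotation namedtuple and get_domain below are simplified stand-ins for the real objects and helper:

from collections import namedtuple

Annotation = namedtuple("Annotation", ["url"])  # hypothetical stand-in

def get_domain(url):
    return url.split("/")[2]  # crude stand-in for the real helper

annotations = [Annotation("http://a.example/p1"), Annotation("http://a.example/p2"),
               Annotation("http://b.example/p1"), Annotation("http://c.example/p1"),
               Annotation("http://c.example/p2"), Annotation("http://d.example/p1")]

for train, test in get_annotation_folds(annotations, n_folds=3):
    train_domains = {get_domain(annotations[i].url) for i in train}
    test_domains = {get_domain(annotations[i].url) for i in test}
    assert not train_domains & test_domains  # no domain leaks across the split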
Example #12
def get_annotation_folds(urls, n_folds):
    """
    Return (train_indices, test_indices) folds iterator.
    It is guaranteed pages from the same website can't be both in
    train and test parts.

    We must be careful when splitting the dataset into training and
    evaluation parts: pages from the same domain should be in the same
    "bin". There could be several pages from the same domain, and these
    pages may have duplicate or similar link patterns
    (e.g. a particular CSS class for paginator links).
    If we put one such page in a training dataset and another in
    an evaluation dataset then the metrics will be too optimistic,
    and they can lead us to choose the wrong features/models. This means
    train_test_split from scikit-learn shouldn't be used here.
    """
    from sklearn.cross_validation import LabelKFold
    return LabelKFold(labels=[get_domain(url) for url in urls],
                      n_folds=n_folds)
Example #13
def svm_linear_validation(images, labels, ids):
    lkf = LabelKFold(ids, n_folds=2)

    model_scores = []
    C_values = [1, 10, 100, 1000, 10000]
    for C in C_values:
        model = svm.SVC(kernel='linear', C=C)
        model_scores.append(cross_validation.cross_val_score(model, images, labels, cv=lkf).max())

    plt.plot(C_values, model_scores, label='Validation Set')
    plt.xlabel('C Values')
    plt.ylabel('Classification Rate')
    plt.legend()
    plt.title('Classification Rates for a Linear Kernel')
    plt.ylim([0, 1])
    plt.xscale('log')
    plt.show()

    return model_scores
Example #14
def evaluate_pipeline(x_train, y_train, x_test, y_test, groups, label='none'):

    plsr = build_pls_model(x_train,
                           y_train,
                           cv_fun=LabelKFold(groups, n_folds=10))
    plsr.fit(x_train, y_train)
    y_pred_train = plsr.predict(x_train).ravel()
    y_pred_test = plsr.predict(x_test).ravel()

    y_pred_test[y_pred_test < 0] = 0

    results = {}
    results['preprocessing'] = label
    results['n_components'] = plsr.n_components
    results['y_true'] = y_test
    results['y_pred'] = y_pred_test
    results['r2_test'] = r2_score(y_test, y_pred_test)
    results['r2_train'] = r2_score(y_train, y_pred_train)
    results['rmse_test'] = np.sqrt(mean_squared_error(y_test, y_pred_test))
    results['rmse_train'] = np.sqrt(mean_squared_error(y_train, y_pred_train))
    return results
Example #15
def split_training_data_set(selected_fold_index):
    # Read file content
    driver_file_content = pd.read_csv(DRIVER_FILE_PATH)
    image_path_array = np.array([
        os.path.join(TRAINING_FOLDER_PATH, current_row["classname"],
                     current_row["img"])
        for _, current_row in driver_file_content.iterrows()
    ])
    label_array = driver_file_content["classname"].as_matrix()
    subject_array = driver_file_content["subject"].as_matrix()

    # Split the training data set with respect to the subject
    cv_object = LabelKFold(subject_array, n_folds=FOLD_NUM)
    for fold_index, (train_indexes, validate_indexes) in enumerate(cv_object):
        if fold_index == selected_fold_index:
            print("Choosing subjects {} as the validation data set.".format(
                str(np.unique(subject_array[validate_indexes]))))
            print(
                "The training and validation data sets contain {} and {} images, respectively."
                .format(len(train_indexes), len(validate_indexes)))
            return image_path_array[train_indexes], label_array[train_indexes], \
                image_path_array[validate_indexes], label_array[validate_indexes]
Example #16
def load_train(imgs_folder, drivers_path):
    drivers = load_drivers(drivers_path)

    X, Y, D = [], [], []
    path = os.path.join(imgs_folder, 'train')
    for lbl, driver, fn in get_training_images(path, drivers):
        X.append(fn)
        Y.append(lbl)
        D.append(driver)
    logger.info("Loaded %d images" % len(Y))

    X = np.array(X)
    Y = np.array(Y).astype('int32')
    D = np.array(D)

    logger.info("Splitting local validation set")
    X, Y, D = shuffle(X, Y, D)
    cv = LabelKFold(D, n_folds=10)
    index_s, index_t = next(iter(cv))
    Xs, Xt = X[index_s], X[index_t]
    Ys, Yt = Y[index_s], Y[index_t]

    return Xs, Ys, Xt, Yt
Example #17
        layer_updates = lasagne.updates.nesterov_momentum(
            loss, layer_params, learning_rate=layer_lr, momentum=0.9)
        updates.update(layer_updates)

# Compile functions for training, validation and prediction
train_fn = theano.function([X_sym, y_sym], loss, updates=updates)
val_fn = theano.function([X_sym, y_sym], [loss, acc])
pred_fn = theano.function([X_sym], prediction)

X, y, drivers = load_train_data('../input',
                                grayscale=False,
                                img_shape=IMG_SHAPE,
                                usecache=True)
X, y, drivers = sklearn.utils.shuffle(X, y, drivers, random_state=0)

train_split = LabelKFold(drivers, n_folds=13)


# generator splitting an iterable into chunks of maximum length N
def batches(iterable, N):
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == N:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


# We need a fairly small batch size to fit a large network like this in GPU memory
Example #18
def validation_score(images, labels, model, ids):
    lkf = LabelKFold(ids, n_folds=2)  # 71% with 10 folds, 68% with 2 folds

    return cross_validation.cross_val_score(model, images, labels,
                                            cv=lkf)  # cv=3 gets 61%
Example #19
computed_test = pd.read_csv("input/Submissionaid.csv")

filter_aid = computed_test.outcome.isnull()

null_aid = computed_test[filter_aid].activity_id

X_test1 = X_test[X_test.activity_id.isin(null_aid)]

print len(X_test), len(X_test1), len(null_aid)

X_test1_activity_id = X_test1.activity_id
X_test1 = X_test1[features].drop(['people_id', 'activity_id'], axis=1)

from sklearn.cross_validation import LabelKFold
kf = LabelKFold(train_group_df['group_1'], n_folds=5)
i = 0
for train_index, cv_index in kf:
    X_sample_train = X.iloc[train_index]
    X_sample_eval = X.iloc[cv_index]
    X_sample_eval_act = X_sample_eval.activity_id
    del X_sample_train['activity_id']
    del X_sample_eval['activity_id']
    del X_sample_train['people_id']
    del X_sample_eval['people_id']
    print len(cv_index), len(train_index)
    dtrain = xgb.DMatrix(X_sample_train, label=y.iloc[train_index])
    dval = xgb.DMatrix(X_sample_eval, label=y.iloc[cv_index])
    param = {
        'max_depth': 6,
        'eta': 0.03,
Example #20
                coef_qint.setdefault(kc, 0.0),
                invlogit(coef_qint.setdefault(kc, 0.0)),
                coef_qslope.setdefault(kc, 0.0)
            ])

        cvs = [('Unstratified CV',
                KFold(len(y),
                      n_folds=args.nfolds,
                      shuffle=True,
                      random_state=args.seed)),
               ('Stratified CV',
                StratifiedKFold(y,
                                n_folds=args.nfolds,
                                shuffle=True,
                                random_state=args.seed)),
               ('Student CV', LabelKFold(student_label, n_folds=args.nfolds)),
               ('Item CV', LabelKFold(item_label, n_folds=args.nfolds))]

        scores_header = []
        scores = []
        for cv_name, cv in cvs:
            score = []
            for train_index, test_index in cv:
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                m.fit(X_train, y_train)
                score.append(m.mean_squared_error(X_test, y_test))
            scores_header.append(cv_name)
            scores.append(np.mean(np.sqrt(score)))

        print()
Example #21
gc.collect()
np.random.seed(15)
IMG_SHAPE = 96, 96
path = '../input'

X, y, drivers = load_train_data(path, grayscale=False, img_shape=IMG_SHAPE)
X, y, drivers = shuffle(X, y, drivers, random_state=0)

mean_pixel = [0.466013, 0.532114, 0.518073]

for c in range(3):
    X[:, c, :, :] = X[:, c, :, :] - mean_pixel[c]

print("input shape", X.shape)

train_split = CVTrainSplit(LabelKFold(drivers, n_folds=5))
batch_iterator_train = RotateBatchIterator(10, 0.5, 128, True)

layer = [(layers.InputLayer, {
    'shape': (None, X.shape[1], X.shape[2], X.shape[3])
}),
         (layers.Conv2DLayer, {
             'num_filters': 32,
             'filter_size': 3,
             'pad': 'same',
             'nonlinearity': nonlinearities.rectify
         }), (layers.MaxPool2DLayer, {
             'pool_size': 2
         }), (layers.DropoutLayer, {
             'p': 0.5
         }),
Example #22
def cstress_spark_parallel_param_model_main():
    features = read_features(args.featureFolder, args.featureFile)
    groundtruth = read_stress_marks(args.featureFolder, args.stressFile)

    traindata, trainlabels, subjects = analyze_events_with_features(
        features, groundtruth)

    traindata = np.asarray(traindata, dtype=np.float64)
    trainlabels = np.asarray(trainlabels)

    normalizer = preprocessing.StandardScaler()
    traindata = normalizer.fit_transform(traindata)

    lkf = LabelKFold(subjects, n_folds=len(np.unique(subjects)))

    # Original Parameters of cStress Model
    # delta = 0.1
    # parameters = {'kernel': ['rbf'],
    #               'C': [2 ** x for x in np.arange(-12, 12, 0.5)],
    #               'gamma': [2 ** x for x in np.arange(-12, 12, 0.5)],
    #               'class_weight': [{0: w, 1: 1 - w} for w in np.arange(0.0, 1.0, delta)]}

    # parameters for testing
    delta = 0.5
    parameters = {
        'kernel': ['rbf'],
        'C': [2**x for x in np.arange(-2, 2, 0.5)],
        'gamma': [2**x for x in np.arange(-2, 2, 0.5)],
        'class_weight': [{
            0: w,
            1: 1 - w
        } for w in np.arange(0.0, 1.0, delta)]
    }

    svc = svm.SVC(probability=True, verbose=False, cache_size=2000)

    if args.scorer == 'f1':
        scorer = f1_bias_scorer_CV
    else:
        scorer = two_bias_scorer_CV

    if args.whichsearch == 'grid':
        clf = GridSearchCVSparkParallelParam(sc=sc,
                                             estimator=svc,
                                             param_grid=parameters,
                                             cv=lkf,
                                             n_jobs=-1,
                                             scoring=scorer,
                                             verbose=1,
                                             iid=False)
    else:
        clf = RandomGridSearchCVSparkParallelParam(
            sc,
            estimator=svc,
            param_distributions=parameters,
            cv=lkf,
            n_jobs=-1,
            scoring=scorer,
            n_iter=args.n_iter,
            verbose=1,
            iid=False)

    clf.fit(traindata, trainlabels)

    sc.stop()
    SparkSession._instantiatedContext = None

    print("best score: ", clf.best_score_)
    print("best params: ", clf.best_params_)

    CV_probs = cross_val_probs(clf.best_estimator_, traindata, trainlabels,
                               lkf)
    score, bias = scorer(CV_probs, trainlabels, True)
    print("score and bias: ", score, bias)

    if not bias == []:
        save_model(args.modelOutput, clf.best_estimator_, normalizer, bias)

        n = len(trainlabels)

        if args.scorer == 'f1':
            predicted = np.asarray(CV_probs >= bias, dtype=np.int)
            classified = range(n)
        else:
            classified = np.where(
                np.logical_or(CV_probs <= bias[0], CV_probs >= bias[1]))[0]
            predicted = np.asarray(CV_probs[classified] >= bias[1],
                                   dtype=np.int)

        print("Cross-Subject (" + str(len(np.unique(subjects))) +
              "-fold) Validation Prediction")
        print("Accuracy: " +
              str(metrics.accuracy_score(trainlabels[classified], predicted)))
        print(metrics.classification_report(trainlabels[classified],
                                            predicted))
        print(metrics.confusion_matrix(trainlabels[classified], predicted))
        print("Lost: %d (%f%%)" % (n - len(classified),
                                   (n - len(classified)) * 1.0 / n))
        print("Subjects: " + str(np.unique(subjects)))
    else:
        print("Results not good")
Example #23
def run_kfold_tree(nfolds,
                   train,
                   test,
                   features,
                   target,
                   label_cv=0,
                   labels=0,
                   random_state=0):
    eta = 0.2
    max_depth = 100
    subsample = 1
    colsample_bytree = 1
    min_child_w = 0
    start_time = time.time()
    eta_list = [0.5] * 100 + [0.1] * 2900
    #    eta_list  = [0.1]*10000
    print(
        'XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'
        .format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster": "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "min_child_weight": min_child_w,
        "silent": 1,
        "seed": random_state
    }
    num_boost_round = 3000
    early_stopping_rounds = 20
    train_y = train[target]
    yfull_train = dict()
    yfull_test = copy.deepcopy(test[['group_1', 'act_days']].astype(object))
    if label_cv == 1:  # make sure the cv folds don't share the same labels
        kf = LabelKFold(labels, n_folds=nfolds)
    else:
        kf = StratifiedKFold(train_y,
                             n_folds=nfolds,
                             shuffle=True,
                             random_state=random_state)
    num_fold = 0
    for train_index, test_index in kf:
        num_fold += 1
        print('Start fold {} from {}'.format(num_fold, nfolds))
        X_train, X_valid = train[features].as_matrix(
        )[train_index], train[features].as_matrix()[test_index]
        y_train, y_valid = train_y[train_index], train_y[test_index]
        X_test = test[features].as_matrix()

        print('Length train:', len(X_train))
        print('Length valid:', len(X_valid))

        dtrain = xgb.DMatrix(X_train, y_train)
        dvalid = xgb.DMatrix(X_valid, y_valid)

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        gbm = xgb.train(params,
                        dtrain,
                        num_boost_round,
                        evals=watchlist,
                        learning_rates=eta_list,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=True)

        print("Validating...")
        yhat = gbm.predict(xgb.DMatrix(X_valid),
                           ntree_limit=gbm.best_iteration + 1)
        score = roc_auc_score(y_valid.tolist(), yhat)
        print('Check error value: {:.6f}'.format(score))

        # Each time store a portion of predicted data in train predicted values
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = yhat[i]

        imp = get_importance(gbm, features)
        print('Importance array: ', imp)

        print("Predict test set...")
        test_prediction = gbm.predict(xgb.DMatrix(X_test),
                                      ntree_limit=gbm.best_iteration + 1)
        yfull_test['kfold_' + str(num_fold)] = test_prediction

    # Copy dict to list
    train_res = []
    for i in sorted(yfull_train.keys()):
        train_res.append(yfull_train[i])

    score = roc_auc_score(train_y, np.array(train_res))
    print('Check error value: {:.6f}'.format(score))

    # Find mean for KFolds on test
    merge = []
    for i in range(1, nfolds + 1):
        merge.append('kfold_' + str(i))
    yfull_test['mean'] = yfull_test[merge].mean(axis=1)

    print('Training time: {} minutes'.format(
        round((time.time() - start_time) / 60, 2)))
    return yfull_test['mean'].values, score
Example #24
def p_printResults(t):
    'printResults : PRINT'
    t[0] = t[1]
    if t[1].lower() == "printbestclassifier()":
        verifyTrainingData()
        from sklearn.cross_validation import LabelKFold  # does not allow overlapping groups across folds
        print "============================================"
        print "Calculating best Classifier: "
        features = trainingData[:, featuresColumn]
        classVector = trainingData[:, classColumn]
        label = numpy.array(range(len(classVector)))
        lkf = LabelKFold(label, k_fold)
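        # NB: each sample gets a unique label here, so the grouping constraint is vacuous
        # and this is effectively an ordinary deterministic k-fold split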
        #
        iterationNum = 0
        #scoring avg:
        avgScoring = {
            'svc': {
                'accuracy': 0,
                'precision': 0,
                'recall': 0,
                'fscore': 0,
                'name': 'Support Vector Classifier'
            },
            'dtc': {
                'accuracy': 0,
                'precision': 0,
                'recall': 0,
                'fscore': 0,
                'name': 'Decision Tree Classifier'
            },
            'gnbc': {
                'accuracy': 0,
                'precision': 0,
                'recall': 0,
                'fscore': 0,
                'name': 'GaussianNB'
            }
        }

        for train_ind, test_ind in lkf:
            X_train, X_test = features[train_ind], features[test_ind]
            y_train, y_test = classVector[train_ind], classVector[test_ind]
            from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
            from sklearn import svm, tree
            from sklearn.naive_bayes import GaussianNB

            svc_prediction = svm.SVC().fit(X_train, y_train).predict(X_test)
            dtc_prediction = tree.DecisionTreeClassifier().fit(
                X_train, y_train).predict(X_test)
            gnbc_prediction = GaussianNB().fit(X_train,
                                               y_train).predict(X_test)

            #scoring
            scoreResult = precision_recall_fscore_support(y_test,
                                                          svc_prediction,
                                                          beta=1.0,
                                                          average='micro')
            avgScoring['svc']['accuracy'] += accuracy_score(y_test, svc_prediction)
            avgScoring['svc']['precision'] += scoreResult[0]
            avgScoring['svc']['recall'] += scoreResult[1]
            avgScoring['svc']['fscore'] += scoreResult[2]

            scoreResult = precision_recall_fscore_support(y_test,
                                                          dtc_prediction,
                                                          beta=1.0,
                                                          average='micro')
            avgScoring['dtc']['accuracy'] += accuracy_score(y_test, dtc_prediction)
            avgScoring['dtc']['precision'] += scoreResult[0]
            avgScoring['dtc']['recall'] += scoreResult[1]
            avgScoring['dtc']['fscore'] += scoreResult[2]

            scoreResult = precision_recall_fscore_support(y_test,
                                                          gnbc_prediction,
                                                          beta=1.0,
                                                          average='micro')
            avgScoring['gnbc']['accuracy'] += accuracy_score(y_test, gnbc_prediction)
            avgScoring['gnbc']['precision'] += scoreResult[0]
            avgScoring['gnbc']['recall'] += scoreResult[1]
            avgScoring['gnbc']['fscore'] += scoreResult[2]

            iterationNum = iterationNum + 1

        avgScoring['svc']['accuracy'] /= k_fold
        avgScoring['svc']['precision'] /= k_fold
        avgScoring['svc']['recall'] /= k_fold
        avgScoring['svc']['fscore'] /= k_fold

        avgScoring['dtc']['accuracy'] /= k_fold
        avgScoring['dtc']['precision'] /= k_fold
        avgScoring['dtc']['recall'] /= k_fold
        avgScoring['dtc']['fscore'] /= k_fold

        avgScoring['gnbc']['accuracy'] /= k_fold
        avgScoring['gnbc']['precision'] /= k_fold
        avgScoring['gnbc']['recall'] /= k_fold
        avgScoring['gnbc']['fscore'] /= k_fold

        #calculate best one
        bestclassifier = {'accuracy': 0, 'name': []}
        """
        if avgScoring['svc']['accuracy'] < avgScoring['dtc']['accuracy']:
            bestclassifier[0] = 'Decision Tree'
        else:
            bestclassifier.append('Decision Tree')
            """
        for key in avgScoring:
            if avgScoring[key]['accuracy'] > bestclassifier['accuracy']:
                bestclassifier['accuracy'] = avgScoring[key]['accuracy']
                bestclassifier['name'] = [avgScoring[key]['name']]
            elif avgScoring[key]['accuracy'] == bestclassifier['accuracy']:
                bestclassifier['accuracy'] = avgScoring[key]['accuracy']
                bestclassifier['name'].append(avgScoring[key]['name'])

        print "Best Classifier(s): ", ','.join(
            bestclassifier['name']), ". With an accuracy of: ", (
                bestclassifier['accuracy'] * 100), "%"
Example #25
def classification_model(model,
                         data,
                         predictors,
                         label,
                         categorical_features=None,
                         cv_label_name=None,
                         k=5,
                         test_size=0.1,
                         n_iter=100,
                         train_only=False):
    data_len = len(data)
    cv = None
    auc, r2, rmse, acc = [], [], [], []

    # print 'Predictors:', predictors
    predictors = [p.strip() for p in predictors]

    if cv_label_name is not None:
        cv_label = data[cv_label_name]
    else:
        cv_label = None

    if k is not None and cv_label is not None:
        cv = LabelKFold(cv_label, n_folds=k)
    elif k is not None and cv_label is None:
        cv = KFold(data_len, n_folds=k, shuffle=True)

    if k is None and test_size is not None and n_iter is not None and cv_label is not None:
        cv = LabelShuffleSplit(cv_label,
                               n_iter=n_iter,
                               test_size=test_size,
                               random_state=42)
    if k is None and test_size is not None and n_iter is not None and cv_label is None:
        cv = ShuffleSplit(data_len,
                          n_iter=n_iter,
                          test_size=test_size,
                          random_state=42)
    if cv is None:
        raise ValueError("could not construct a CV iterator from the given arguments")

    for train, test in cv:
        x_train = (data[predictors].iloc[train, :])
        y_train = data[label].iloc[train]
        x_test = (data[predictors].iloc[test, :])
        y_test = data[label].iloc[test]

        if categorical_features is not None:
            feature_idxs = [
                x_train.columns.get_loc(name) for name in categorical_features
            ]
            encoder = OneHotEncoder(categorical_features=feature_idxs)
            encoder.fit(np.vstack((x_train, x_test)))
            x_train = encoder.transform(x_train)
            x_test = encoder.transform(x_test)

        model.fit(x_train, y_train)
        if train_only:
            x_test = x_train
            y_test = y_train
        y_pred_p = model.predict_proba(x_test)[:, 1]
        y_pred_c = model.predict(x_test)

        a, b, c, d = binary_classification_metrics(y_test, y_pred_p, y_pred_c)

        auc.append(a)
        r2.append(b)
        rmse.append(c)
        acc.append(d)

    # print 'auc:', a
    # print 'r2:', b
    # print 'rmse:', c
    # print 'accuracy:', d

    return np.mean(auc), np.mean(r2), np.mean(rmse), np.mean(acc)
Example #26
def perform_training(image_feature_list, image_index_list, description,
                     feature_extension):
    """Perform training phase.
    
    :param image_feature_list: the features of the images
    :type image_feature_list: list
    :param image_index_list: the indexes of the images
    :type image_index_list: list
    :param description: the folder name of the working directory
    :type description: string
    :param feature_extension: the extension of the feature files
    :type feature_extension: string
    :return: the model files will be saved to disk
    :rtype: None
    """

    print("Performing training phase ...")

    # Reset the working directory
    common.reset_working_directory(description)
    working_directory = common.get_working_directory(description)

    # Cross Validation
    fold_num = 5
    best_score_array = np.zeros(fold_num)
    label_kfold = LabelKFold(image_index_list, n_folds=fold_num)

    # Add progress bar
    progress_bar = pyprind.ProgBar(fold_num, monitor=True)

    metric_list = METRIC_LIST_DICT[feature_extension]
    for fold_index, fold_item in enumerate(label_kfold):
        print("\nWorking on the {:d}/{:d} fold ...".format(
            fold_index + 1, fold_num))

        # Generate final data set
        X_train, Y_train = solution_basic.convert_to_final_data_set(
            image_feature_list, image_index_list, fold_item[0], 1, metric_list)
        X_test, Y_test = solution_basic.convert_to_final_data_set(
            image_feature_list, image_index_list, fold_item[1], None,
            metric_list)

        # Perform training
        model_name = "Model_{:d}".format(fold_index +
                                         1) + common.SCIKIT_LEARN_EXTENSION
        model_path = os.path.join(working_directory, model_name)
        best_score = sklearn_related.train_model(X_train, Y_train, X_test,
                                                 Y_test, model_path)
        best_score_array[fold_index] = best_score

        print(
            "For the {:d} fold, the sklearn model achieved the score {:.4f}.".
            format(fold_index + 1, best_score))

        # Update progress bar
        progress_bar.update()

    # Report tracking information
    print(progress_bar)

    print("\nThe best score is {:.4f}.".format(np.max(best_score_array)))
Example #27
# We need a fairly small batch size to fit a large network like this in GPU memory
BATCH_SIZE = 12


def train_batch(ix):
    return train_fn(X_tr[ix], y_tr[ix])


def val_batch():
    ix = range(len(y_val))
    np.random.shuffle(ix)
    ix = ix[:BATCH_SIZE]
    return val_fn(X_val[ix], y_val[ix])


kf = LabelKFold(drivers, n_folds=8)

lasagne.layers.set_all_param_values(net['prob'], d['param values'])
for i, (tr_ix, val_ix) in enumerate(kf):
    print('CV Fold', i)
    X_tr = X[tr_ix]
    y_tr = y[tr_ix]
    X_val = X[val_ix]
    y_val = y[val_ix]

    #net['new_output'] = DenseLayer(net['drop7'], num_units=10, nonlinearity=softmax, W=lasagne.init.Normal(0.01))
    lasagne.layers.set_all_param_values(net['prob'], d['param values'])
    learning_rate.set_value(0.0001)

    for epoch in range(5):
Example #28
def p_classifier_methods(t):
    '''classifier_methods : CLASSIFIER_METHOD
                            | CLASSIFIER_METHOD_WPARAMETER '(' ')' '''
    t[0] = t[1]
    global classifier, classResult, testClassColumn, testFeaturesColumn
    # if the method requires a parameter, it is handled in the if branch below
    if len(t) > 2:
        if t[1].lower() == "predict":
            verifyTestData()
            verifyClassifier()
            #execute predict
            if testClassColumn == None:
                testClassColumn = classColumn
            if testFeaturesColumn == None:
                testFeaturesColumn = featuresColumn

            featuresVect = testData[:, testFeaturesColumn]
            classResult = classifier.predict(featuresVect)
            print "Result:"
            print classResult
    else:
        if t[1].lower() == "getcverrorrate()":
            #execute getErrorRate
            if len(cv_scoring) > 0:
                print "Avg scoring for the cross-validation:"
                print "Accuracy: ", cv_scoring['avg scorinmg'][
                    'accuracy'], " Precision: ", cv_scoring['avg scorinmg'][
                        'precision'], " Recall: ", cv_scoring['avg scorinmg'][
                            'recall'], "F-score ", cv_scoring['avg scorinmg'][
                                'fscore']
            else:
                print "No cross-validation scoring on system"
        elif t[1].lower() == "execute()":
            verifyClassifier()
            verifyTrainingData()
            global lastmethod
            lastmethod = "execute"
            features = trainingData[:, featuresColumn]
            classVector = trainingData[:, classColumn]
            classifier.fit(features, classVector)

        elif t[1].lower() == "executecv()":
            verifyClassifier()
            verifyTrainingData()
            lastmethod = "executecv"
            from sklearn.cross_validation import LabelKFold  # does not allow overlapping groups across folds
            print "============================================"
            print "Doing Cross-Validation: "
            features = trainingData[:, featuresColumn]
            classVector = trainingData[:, classColumn]
            label = numpy.array(range(len(classVector)))
            lkf = LabelKFold(label, k_fold)
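            # NB: each sample gets a unique label here, so the grouping constraint is vacuous
            # and this is effectively an ordinary deterministic k-fold split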
            #
            iterationNum = 0
            #scoring avg:
            accuracyAvg = 0
            precisionAvg = 0
            recallAvg = 0
            fbetaScoreAvg = 0
            for train_ind, test_ind in lkf:
                X_train, X_test = features[train_ind], features[test_ind]
                y_train, y_test = classVector[train_ind], classVector[test_ind]
                from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

                classifier.fit(X_train, y_train)
                y_predicted = classifier.predict(X_test)

                #scoring
                accuracy = accuracy_score(y_test, y_predicted)
                scoreResult = precision_recall_fscore_support(y_test,
                                                              y_predicted,
                                                              beta=1.0,
                                                              average='micro')
                precision = scoreResult[0]
                recall = scoreResult[1]
                fbetaScore = scoreResult[2]
                cm = confusion_matrix(y_test, y_predicted)

                #adding to avg
                accuracyAvg = accuracyAvg + accuracy
                precisionAvg = precisionAvg + precision
                recallAvg = recallAvg + recall
                fbetaScoreAvg = fbetaScoreAvg + fbetaScore

                #adding result
                cv_fold_result['fold_' + str(iterationNum)] = {
                    'truth': y_test,
                    'prediction': y_predicted
                }
                cv_scoring['fold_' + str(iterationNum)] = {
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'fscore': fbetaScore,
                    'confusion matrix': cm
                }
                print "---------------------------------------"
                print "For fold ", iterationNum
                print "Confusion matrix is:"
                print confusion_matrix(y_test, y_predicted)
                print "Accuracy: ", accuracy, " Precision: ", precision, " Recall: ", recall, 'F-score', fbetaScore

                iterationNum = iterationNum + 1
            accuracyAvg = accuracyAvg / k_fold
            precisionAvg = precisionAvg / k_fold
            recallAvg = recallAvg / k_fold
            fbetaScoreAvg = fbetaScoreAvg / k_fold
            cv_scoring['avg scoring'] = {
                'accuracy': accuracyAvg,
                'precision': precisionAvg,
                'recall': recallAvg,
                'fscore': fbetaScoreAvg
            }
            print "---------------------------------------"
            print "Avg scoring:"
            print "Accuracy: ", accuracyAvg, " Precision: ", precisionAvg, " Recall: ", recallAvg, "F-score ", fbetaScoreAvg
Example #29
# We need a fairly small batch size to fit a large network like this in GPU memory
BATCH_SIZE = 12


def train_batch(ix):
    return train_fn(X_tr[ix], y_tr[ix])


def val_batch():
    ix = range(len(y_val))
    np.random.shuffle(ix)
    ix = ix[:BATCH_SIZE]
    return val_fn(X_val[ix], y_val[ix])


kf = LabelKFold(ids, n_folds=8)

print(X.shape)

lasagne.layers.set_all_param_values(net['prob'], d['param values'])

for i, (tr_ix, val_ix) in enumerate(kf):
    print('CV Fold', i)
    X_tr = X[tr_ix]
    y_tr = y[tr_ix]
    X_val = X[val_ix]
    y_val = y[val_ix]

    net['new_output'] = DenseLayer(net['drop7'],
                                   num_units=8,
                                   nonlinearity=softmax,
Example #30
def greedy_search(datadir, seg_length_supervised=1024., n_folds=5):

    features, labels, lc, hr, tstart, nseg = load_features(
        datadir, seg_length_supervised)

    features_lb, labels_lb, lc_lb, hr_lb, tstart_lb, nseg_lb = labelled_data(
        features, labels, lc, hr, tstart, nseg)

    labels_all = np.hstack([labels_lb["train"], labels_lb["val"], labels_lb["test"]])

    fscaled, fscaled_lb = scale_features(features, features_lb)
    features_train = features_lb["train"]
    features_val = features_lb["val"]
    features_test = features_lb["test"]
    labels_train = labels_lb["train"]
    labels_val = labels_lb["val"]
    labels_test = labels_lb["test"]

    score_all = [] 
    feature_ranking = []
    nfeatures = range(features_train.shape[1])
    features_new_train = []
    features_new_val = []
    features_new_test = []
    best_params_all = []

    for i in range(features_train.shape[1]):
        print("I am on the %ith loop"%i)
        score = []
        best_params = []
        ## first feature
        for j in nfeatures:
            if j in feature_ranking:
                continue
            if len(features_new_train) == 0:
                ft = np.atleast_2d(features_train[:,j]).T
                fv = np.atleast_2d(features_val[:,j]).T
                fte = np.atleast_2d(features_test[:,j]).T
            else:
                ft = np.vstack([features_new_train.T, features_train[:,j]]).T
                fv = np.vstack([features_new_val.T, features_val[:,j]]).T
                fte = np.vstack([features_new_test.T, features_test[:,j]]).T
            ### scale features (scaler fit on the training folds only)
            scaler_train = StandardScaler().fit(ft)
            fscaled_train = scaler_train.transform(ft)
            fscaled_val = scaler_train.transform(fv)

            lr = LogisticRegression(penalty="l2", class_weight="balanced", multi_class="multinomial",
                        solver="lbfgs")

            group_kfold = LabelKFold(nseg_lb["train"], n_folds=n_folds)

            params = {'C': [0.01, 0.1, 1.0, 10.0]}

            # instantiate the GridSearchCV object for searching over regularization parameters
            grid_lr = GridSearchCV(lr, params, cv=group_kfold, verbose=3, n_jobs=2,
                                             scoring="f1_weighted")
 
            grid_lr.fit(fscaled_train, labels_train)

            best_params.append(grid_lr.best_params_)
            score.append(grid_lr.score(fscaled_val, labels_val))
        score_all.append(score)
        best_params_all.append(best_params)
        mean_scores = np.array([np.mean(s) for s in score])
        best_ind = np.where(mean_scores == np.max(mean_scores))[0]
        if len(best_ind) > 1:
            best_ind = best_ind[0]
        print("The best score in round " + str(i) + " is " + str(np.max(score)))
        n_best = nfeatures.pop(best_ind)
        print("The best-ranked feature in round " + str(i) + " is " + str(n_best))
        feature_ranking.append(n_best)
        if len(features_new_train) == 0:
            features_new_train = np.atleast_2d(features_train[:,n_best]).T
            features_new_val = np.atleast_2d(features_val[:,n_best]).T
            features_new_test = np.atleast_2d(features_test[:,n_best]).T
        else:
            features_new_train = np.concatenate((features_new_train,
                                                 np.atleast_2d(features_train[:,n_best]).T), 1)
            features_new_val = np.concatenate((features_new_val,
                                               np.atleast_2d(features_val[:,n_best]).T), 1)
            features_new_test = np.concatenate((features_new_test,
                                                np.atleast_2d(features_test[:,n_best]).T), 1)

    features_new_train = np.array([features["train"][:,i] for i in feature_ranking]).T
    features_new_test = np.array([features["test"][:,i] for i in feature_ranking]).T
    features_new_val = np.array([features["val"][:,i] for i in feature_ranking]).T

    res = {"ranking":feature_ranking, "fnew_train":features_new_train,
           "fnew_val":features_new_val, "fnew_test":features_new_test,
           "scores":score_all}  

    with open(datadir + "grs1915_greedysearch_res.dat", "wb") as f:
        pickle.dump(res, f, -1)

    return