Example #1
def doExp(datasetPath, targetEpsilon, numOfRounds):
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")
    rs = ShuffleSplit(n_splits=numOfRounds, test_size=1, random_state=0)
    rs.get_n_splits(data)
    print("Samples: %d, Features: %d" % (data.shape[0], data.shape[1] - 1))

    # p = Pool(numOfRounds);
    cprResult = []
    m = 0
    for train_index, test_index in rs.split(data):
        print("Trial %d" % m)
        trainingData = data[train_index, 1:]
        tmpResult = singleExp(trainingData, targetEpsilon)
        cprResult.extend(tmpResult)
        m += 1
        # tmpResult = p.apply_async(singleExp, (xEpsilons,pureTrainingData,largestReducedFeature));
        # cprResult += tmpResult.get();

    # Compute the average value after numOfRounds experiments.
    # p.close();
    # p.join();

    return cprResult
Example #2
def get_features_importance(X, y, variables):
    """
Run random forest on data, multiple times. Look for loss of f1_score for each variable.
Return sorted list of most important variables.
    :param X: train array
    :param y: target array
    :param variables: list of variables to look at, in right order. See ComputeFeatures.handled_variables.
    :return: dictionary with list of f1_score for each variable.
    """

    sp = ShuffleSplit(n_splits=5, test_size=.2)
    sp.get_n_splits(X)

    scores = defaultdict(list)

    names = variables

    for train_idx, test_idx in sp.split(X):
        x_train, x_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model = DecisionTreeClassifier()
        model.fit(x_train, y_train)
        acc = f1_score(y_test, model.predict(x_test))
        for i in range(X.shape[1]):
            X_t = x_test.copy()
            np.random.shuffle(X_t[:, i])
            shuff_acc = f1_score(y_test, model.predict(X_t))
            scores[names[i]].append((acc - shuff_acc) / acc)
    print("Features sorted by their score:")
    print(
        sorted([(np.round(np.mean(score), 4), feat)
                for feat, score in scores.items()],
               reverse=True))
    return scores
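A minimal usage sketch (the synthetic data and names below are assumptions for illustration, not part of the original repository):

# Hypothetical call: build a small synthetic dataset and inspect the permutation scores.
import numpy as np
from collections import defaultdict
from sklearn.datasets import make_classification
from sklearn.model_selection import ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

X_demo, y_demo = make_classification(n_samples=200, n_features=4, random_state=0)
demo_names = ['f0', 'f1', 'f2', 'f3']   # stand-in for ComputeFeatures.handled_variables
demo_scores = get_features_importance(X_demo, y_demo, demo_names)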
Example #3
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """

    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
    cv_sets.get_n_splits(X)
    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)

    # TODO: Create the grid search cv object --> GridSearchCV()
    # Make sure to include the right parameters in the object:
    # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
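fit_model relies on a performance_metric helper that is not shown here; a plausible R²-based definition and a hypothetical call, purely as a sketch:

# Assumed helper (not in the original snippet): score predictions with R^2.
import numpy as np
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """Return the R^2 score between true and predicted values."""
    return r2_score(y_true, y_predict)

# Hypothetical call on random data, just to show the expected input shapes.
X_demo = np.random.rand(100, 3)
y_demo = np.random.rand(100)
best_reg = fit_model(X_demo, y_demo)
print(best_reg.get_params()['max_depth'])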
Example #4
def split_data_train_val_test(px_fol):

    patients = os.listdir(px_fol)
    patients = np.asarray(patients)

    ss = ShuffleSplit(n_splits=1, test_size=0.20)
    ss.get_n_splits(patients)
    for train_index, test_index in ss.split(patients):
        xt, x_test = patients[train_index], patients[test_index]

    ss = ShuffleSplit(n_splits=1, test_size=0.20)
    ss.get_n_splits(xt)
    for ten_index, val_index in ss.split(xt):
        x_train_in, x_val_in = xt[ten_index], xt[val_index]

    px_splits = {
        'train': np.ndarray.tolist(x_train_in),
        'val': np.ndarray.tolist(x_val_in),
        'test': np.ndarray.tolist(x_test)
    }

    return px_splits


#%%
Example #5
def train_model(label):

    label_index = labelSpace_dict[label]
    X, y = load_training_data()

    ## Split the data evenly into 10 shuffled folds, with 9/10 used for training; the predicted attribute is binary
    rs = ShuffleSplit(n_splits=10, test_size=.1, random_state=0)
    rs.get_n_splits(X)
    X_Fold = []
    y_Fold = []
    for train_index, test_index in rs.split(X):
        x_train = []
        y_train = []
        for i in train_index:
            x_train.append(X[i])
            if y[i] == label_index:
                y_train.append(1)
            else:
                y_train.append(0)
        X_Fold.append(x_train)
        y_Fold.append(y_train)

    logre_classifier = []
    ## Train 10 binary classification models
    for i in range(len(X_Fold)):
        classifier = LogisticRegression()
        classifier.fit(X_Fold[i], y_Fold[i])
        logre_classifier.append(classifier)
    # print logre_classifier.__len__()
    return logre_classifier
Example #6
def my_train_test_split(data_size, test_size=0.30):
    sss = ShuffleSplit(n_splits=1, test_size=test_size)
    X = np.reshape(np.random.rand(data_size * 2), (data_size, 2))
    y = np.random.randint(2, size=data_size)
    sss.get_n_splits(X, y)
    train_index, test_index = next(sss.split(X, y))
    return train_index, test_index
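For illustration, the function returns index arrays rather than the data itself; with the (assumed) sizes below:

# Hypothetical call: 10 random samples, 30% held out, giving 7 train and 3 test indices.
train_idx, test_idx = my_train_test_split(data_size=10, test_size=0.3)
print(len(train_idx), len(test_idx))  # 7 3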
Example #7
def make_donuts(n=4000, 
                noise=0.2, 
                factor=0.5, 
                test_size=0.92, 
                nneigh=5,
                mesh=False,
                mesh_step=0.02):
    X, y = datasets.make_circles(n_samples=n, noise=noise, factor=factor)
    adj = make_graph(X, nneigh)
    X = StandardScaler().fit_transform(X)
    sss = ShuffleSplit(n_splits=1, test_size=test_size)
    sss.get_n_splits(X, y)
    train_index, test_index = next(sss.split(X, y)) 
    mesh_X = None
    mesh_adj = None
    xx = None
    yy = None
    if mesh:
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step),
                             np.arange(y_min, y_max, mesh_step))
        mesh_X = np.c_[xx.ravel(), yy.ravel()]
        mesh_adj = make_graph(mesh_X, nneigh)  # Might take a long time
    mesh_pack = (mesh_adj, mesh_X, xx, yy)
    return adj, X, y, train_index, test_index, test_index, mesh_pack
Example #8
def generate_k_fold_cross_valid_idx(max_idx):
    """
    generate indicies for each of the fold
    :param max_idx: how many data you have for each class, colorectal have 625
    :return:
    """
    trains = []
    valids = []
    tests = []
    y = np.arange(max_idx)
    kf = KFold(n_splits=10)
    kf.get_n_splits(y)
    for train_index, test_index in kf.split(y):
        # print("TRAIN:", len(train_index), "TEST:", len(test_index))
        yval = np.arange(len(train_index))
        kf_val = ShuffleSplit(n_splits=1, test_size=0.15)
        kf_val.get_n_splits(yval)
        for train_idx, val_idx in kf_val.split(yval):
            final_train = train_index[train_idx]
            final_val = train_index[val_idx]
            final_test = test_index
            final_train.sort()
            final_val.sort()
            # print("TRAIN:", final_train, "VALID", final_val, "TEST:", final_test)
            trains.append(final_train)
            valids.append(final_val)
            tests.append(final_test)

    return trains, valids, tests
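A quick sanity check of the returned index lists (hypothetical call, not from the original source):

# With max_idx=625 and 10 outer folds, each fold's train/val/test indices partition all 625 samples.
trains, valids, tests = generate_k_fold_cross_valid_idx(625)
print(len(trains), len(valids), len(tests))             # 10 10 10
print(len(trains[0]) + len(valids[0]) + len(tests[0]))  # 625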
Example #9
def shuffle(path):
    """
    打乱array
    :param path: Where U put data in the dir
    :return:
    """
    X = np.loadtxt(path)
    y = X[:, -1].astype(np.int)
    X = X[:, :-1]
    rs = ShuffleSplit(n_splits=1, test_size=.25, random_state=0)
    rs.get_n_splits(X)
    # print(rs)
    for train_index, test_index in rs.split(X, y):
        # print("Train Index:", train_index, ",Test Index:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # print(X_train,X_test,y_train,y_test)
    print("==============================")
    print("Making dataset")
    # rs = ShuffleSplit(n_splits=3, train_size=.5, test_size=.25, random_state=0)
    np.savetxt(path + '_X_train', X_train, fmt='%d')
    print(path + '_X_train')
    np.savetxt(path + '_Y_train', y_train, fmt='%d')
    print(path + '_Y_train')
    np.savetxt(path + '_X_test', X_test, fmt='%d')
    print(path + '_X_test')
    np.savetxt(path + '_Y_test', y_test, fmt='%d')
    print(path + '_Y_test')
    # return X_train, X_test, y_train, y_test
    print("==============================")
    print('FINISHED !')
Example #10
def doExp(datasetPath,
          epsilon,
          varianceRatio,
          numOfRounds,
          numOfDimensions,
          numOfSamples,
          isLinearSVM=True):
    data = np.loadtxt(datasetPath, delimiter=",")
    globalPCA = PCAModule.PCAImpl(data[:, 1:])
    numOfFeature = data.shape[1] - 1
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)
    print "%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio)
    xDimensions = None

    if numOfDimensions > numOfFeature:
        xDimensions = np.arange(1, numOfFeature)
        topK = numOfFeature
    else:
        xDimensions = np.arange(
            1, largestReducedFeature,
            max(largestReducedFeature // numOfDimensions, 1))
        topK = largestReducedFeature
    #cprResult = np.zeros((len(xDimensions),4));
    cprResult = None
    rs = ShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0)
    rs.get_n_splits(data)

    #p = Pool(numOfRounds);
    normalizedData = gf.normByRow(data[:, 1:])

    normalizedData = np.concatenate((data[:, [0]], normalizedData), axis=1)
    for train_index, test_index in rs.split(data):

        trainingData = normalizedData[train_index]
        testingData = normalizedData[test_index]
        #tmpResult = p.apply_async(singleExp, (xDimensions,trainingData,testingData,topK,isLinearSVM));
        #cprResult += tmpResult.get();
        tmpResult = singleExp(xDimensions, trainingData, testingData, topK,
                              isLinearSVM)
        if cprResult is None:
            cprResult = tmpResult
        else:
            cprResult = np.concatenate((cprResult, tmpResult), axis=0)
        """
        for i in range(0,len(cprResult)):
            print ','.join(['%.3f' % num for num in cprResult[i]]);
        """
    #avgResult = cprResult/numOfRounds;
    avgResult = cprResult
    #p.close();
    #p.join();
    for result in avgResult:
        print(','.join(['%.3f' % num for num in result]))

    return avgResult
Example #11
def ModelLearning(X, y):
    """ Calculates the performance of several models with varying sizes of training data.
        The learning and testing scores for each model are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    cv.get_n_splits(X)

    # Generate the training set sizes increasing by 50
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)

    # Create the figure window
    fig = pl.figure(figsize=(10, 7))

    # Create three different models based on max_depth
    for k, depth in enumerate([1, 3, 6, 10]):

        # Create a Decision tree regressor at max_depth = depth
        regressor = DecisionTreeRegressor(max_depth=depth)

        # Calculate the training and testing scores
        sizes, train_scores, test_scores = learning_curve(
            regressor, X, y, cv=cv, train_sizes=train_sizes, scoring='r2')

        # Find the mean and standard deviation for smoothing
        train_std = np.std(train_scores, axis=1)
        train_mean = np.mean(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)

        # Subplot the learning curve
        ax = fig.add_subplot(2, 2, k + 1)
        ax.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
        ax.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')
        ax.fill_between(sizes,
                        train_mean - train_std,
                        train_mean + train_std,
                        alpha=0.15,
                        color='r')
        ax.fill_between(sizes,
                        test_mean - test_std,
                        test_mean + test_std,
                        alpha=0.15,
                        color='g')

        # Labels
        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0] * 0.8])
        ax.set_ylim([-0.05, 1.05])

    # Visual aesthetics
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad=0.)
    fig.suptitle('Decision Tree Regressor Learning Performances',
                 fontsize=16,
                 y=1.03)
    fig.tight_layout()
    fig.show()
Example #12
def main():
    if not LOAD_TEST_SPLIT:
        global X, y
    else:
        global X_train_dev, X_test, y_train_dev, y_test

    from sklearn.model_selection import ShuffleSplit, KFold
    if not LOAD_TEST_SPLIT:
        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
        ss.get_n_splits(X, y)
        train_index, test_index = next(ss.split(y))
        X_train_dev, X_test = [X[i] for i in train_index
                               ], [X[i] for i in test_index]
        y_train_dev, y_test = [y[i] for i in train_index
                               ], [y[i] for i in test_index]

    kf = KFold(n_splits=NUM_FOLD, shuffle=True, random_state=0)

    gold_list = None
    # all_preds = []
    for i, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', i + 1)
        X_train, X_dev = [X_train_dev[i] for i in train_index
                          ], [X_train_dev[i] for i in dev_index]
        y_train, y_dev = [y_train_dev[i] for i in train_index
                          ], [y_train_dev[i] for i in dev_index]

        gold_list, pred_list = train(X_train, y_train, X_dev, y_dev, X_test,
                                     y_test)
        # all_preds.append(pred_list)
        break

    # all_preds = np.stack(all_preds, axis=0)

    # shape = all_preds[0].shape
    # mj = np.zeros(shape)
    # for m in range(shape[0]):
    #     for n in range(shape[1]):
    #         mj[m, n] = find_majority(np.asarray(all_preds[:, m, n]).reshape((-1)))[0]
    final_pred = pred_list

    logger('Final test by majority voting:')
    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_multi_metrics(gold_list, final_pred)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_single_metrics(gold_list, final_pred)
    logger('Single only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    logger('Jaccard:', jaccard_score(gold_list, final_pred))
    logger('Bert Binary', args)

    if args.output_path is not None:
        with open(args.output_path, 'bw') as _f:
            pkl.dump(final_pred, _f)
Example #13
def ShuffleData_ecg_2(X, y):
    rs = ShuffleSplit(n_splits=30, test_size=0.25, random_state=42)
    rs.get_n_splits(X)
    for train_index, test_index in rs.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    return X_train, X_test, y_train, y_test
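Note that with n_splits=30 the loop simply overwrites X_train/X_test on each iteration, so only the last shuffle is returned. If a single split is all that is needed, one split (or next()) is enough; a minimal sketch:

# Sketch: take a single 75/25 shuffle instead of looping over 30 of them.
rs = ShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_index, test_index = next(rs.split(X))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]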
Example #14
def rf_allMix(X, y, path, title, n_estimators, max_depth):
    # Create a random forest classifier
    acc_app = []
    precision_app = []
    recall_app = []
    f1_score_app = []
    mean_absolut = []

    y_testing = []

    y_prediction = []
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_features=len(X[0]),
                                 n_jobs=-1,
                                 max_depth=max_depth)

    rs = ShuffleSplit(n_splits=10, test_size=0.20, random_state=42)
    rs.get_n_splits(X)
    for train_index, test_index in rs.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        # Train the model using the training sets y_pred=clf.predict(X_test)
        clf.fit(X_train, y_train)
        # prediction on test set
        y_pred = clf.predict(X_test)
        y_prediction.extend(y_pred)
        y_testing.extend(y_test)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        acc_app.append(accuracy)

        precision = metrics.precision_score(y_test, y_pred, average='micro')
        precision_app.append(precision)

        recall = metrics.recall_score(y_test, y_pred, average='micro')
        recall_app.append(recall)

        f1_score = metrics.f1_score(y_test, y_pred, average='micro')
        f1_score_app.append(f1_score)

        mean_absolut_error = metrics.mean_absolute_error(y_test, y_pred)
        mean_absolut.append(mean_absolut_error)

        performance_every_shuffler_allmix(y_pred, y_test, accuracy, precision,
                                          recall, f1_score, mean_absolut_error,
                                          path, title)

    ##################################################
    performance_global_shuffle_allmix(y_prediction, y_testing, acc_app,
                                      precision_app, recall_app, f1_score_app,
                                      mean_absolut, path, title)
Example #15
def main():
    if not LOAD_TEST_SPLIT:
        global X, y
        ALL_TRAINING = X
    else:
        global X_train_dev, X_test, y_train_dev, y_test
        ALL_TRAINING = X_train_dev + X_test
    glove_tokenizer.build_tokenizer(ALL_TRAINING, vocab_size=VOCAB_SIZE)
    glove_tokenizer.build_embedding(GLOVE_EMB_PATH, dataset_name=data_set_name)

    from sklearn.model_selection import ShuffleSplit, KFold

    if not LOAD_TEST_SPLIT:
        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
        ss.get_n_splits(X, y)
        train_index, test_index = next(ss.split(y))
        X_train_dev, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train_dev, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

    kf = KFold(n_splits=args.folds, shuffle=True, random_state=args.dev_split_seed)
    # kf.get_n_splits(X_train_dev)

    all_preds = []
    gold_list = None

    for i, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', i + 1)
        X_train, X_dev = [X_train_dev[i] for i in train_index], [X_train_dev[i] for i in dev_index]
        y_train, y_dev = [y_train_dev[i] for i in train_index], [y_train_dev[i] for i in dev_index]

        gold_list, pred_list, model = train(X_train, y_train, X_dev, y_dev, X_test, y_test)
        all_preds.append(pred_list)
        #torch.save(model.state_dict(), 'saved_model/emotion_classifier' + str(i+1) + '.pt')
        #break
    all_preds = np.stack(all_preds, axis=0)
    shape = all_preds[0].shape
    mj = np.zeros(shape[0])
    for m in range(shape[0]):
        mj[m] = find_majority(np.asarray(all_preds[:, m]).reshape((-1)))[0]

    final_pred = mj

    print('TEST---------: ')
    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # metric = get_multi_metrics(gold_list, final_pred)
    # logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # metric = get_single_metrics(gold_list, final_pred)
    # logger('Single only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])

    # logger('Final Jaccard:', jaccard_score(gold_list, final_pred))
    logger(os.path.basename(__file__))
    logger(args)
Example #16
def multiTrain(data, label, test_data):

    best_train_acc = 0
    best_val_acc = 0
    result_list = []
    train_acc_list = []
    val_acc_list = []
    best_epoch = None
    final_result = []
    splitstate = ShuffleSplit(n_splits=18, test_size=.20)
    splitstate.get_n_splits(data, label)
    epoch = 0
    time_str = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime(time.time()))
    for train_index, val_index in splitstate.split(data, label):
        print("epoch: ", epoch + 1)
        epoch += 1
        print("TRAIN:", train_index, "TEST: ", val_index)
        sub_train_data = np.array([data[i] for i in train_index])
        sub_train_label = np.array([label[i] for i in train_index])
        sub_val_data = np.array([data[i] for i in val_index])
        sub_val_label = np.array([label[i] for i in val_index])
        train_acc, val_acc, test_prediction = xgbmodelc(
            sub_train_data, sub_train_label, sub_val_data, sub_val_label,
            test_data)
        if val_acc > best_val_acc:
            print("find a better val_acc: " + str(best_val_acc) + " -> " +
                  str(val_acc))
            best_train_acc = train_acc
            best_val_acc = val_acc
            best_epoch = epoch
        result_list.append(test_prediction)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    print("best_epoch: {}".format(epoch))
    best_result = result_list[epoch - 1]
    best_result_save_name = "./best/best_result_" + time_str + ".csv"
    save_result(best_result_save_name, best_result)
    print(best_val_acc)
    print(train_acc_list)
    result_list = np.array(result_list)
    for i in range(17):
        counts = np.bincount(result_list[:, i])
        index = np.argmax(counts)
        final_result.append(index)
    final_result = np.array(final_result)
    one_hots = to_categorical(final_result).astype(np.int32)
    csvfile = open('rank_result_1.csv', 'w', newline='')
    writer = csv.writer(csvfile)
    for i in one_hots:
        writer.writerow(i)
    csvfile.close()
    print("预测完毕!")
Example #17
def tts_split(X, y, size, splits):
    '''Split the data into train and test sets using ShuffleSplit.'''

    rs = ShuffleSplit(n_splits=splits, test_size=size)

    rs.get_n_splits(X)

    for train_index, test_index in rs.split(X, y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    return X_train, X_test, y_train, y_test
Example #18
def plot_learning_performance(regressor, X, y):
    """
    Draw a graph that visualizes the learning curves of the model for both
     training and testing as the size of the training set is increased. 
     
     Note that the shaded region of a learning curve denotes the uncertainty 
     of that curve (measured as the standard deviation). 

     The model is scored on both the training and testing sets using R2, the coefficient of determination.
    """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    cv.get_n_splits(X)

    # Generate the training set sizes increasing by 50
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)

    # Calculate the training and testing scores
    sizes, train_scores, test_scores = learning_curve(regressor, X, y, \
        cv = cv, train_sizes = train_sizes, scoring = 'r2')

    # Find the mean and standard deviation for smoothing
    train_std = np.std(train_scores, axis=1)
    train_mean = np.mean(train_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    from matplotlib.pyplot import figure
    figure(num=None, figsize=(8, 5), dpi=80, facecolor='w', edgecolor='k')

    plt.title('')
    plt.xlabel('Number of Training Points')
    plt.ylabel('r2 score')
    plt.xlim([0, X.shape[0] * 0.8])
    plt.ylim([-0.05, 1.05])

    plt.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
    plt.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')
    plt.fill_between(sizes, train_mean - train_std, \
        train_mean + train_std, alpha = 0.15, color = 'r')
    plt.fill_between(sizes, test_mean - test_std, \
        test_mean + test_std, alpha = 0.15, color = 'g')

    # Visual aesthetics
    plt.legend(bbox_to_anchor=(0.4, 1.3), loc='lower left', borderaxespad=0.)

    plt.suptitle(type(regressor).__name__ + ' Learning Performances',
                 fontsize=16,
                 y=1.03)
    plt.show()
Example #19
def doExp(datasetPath,
          epsilon,
          varianceRatio,
          numOfRounds,
          numOfPointsinXAxis,
          isLinearSVM=True):
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")
    numOfFeature = data.shape[1] - 1
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data[:, 1:])
    globalPCA = PCAImpl(data_std)
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)

    print "%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio)
    cprResult = None

    #rs = StratifiedShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0);
    #rs.get_n_splits(data[:,1:],data[:,0]);
    rs = ShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0)
    rs.get_n_splits(data)
    for train_index, test_index in rs.split(data):
        #for train_index, test_index in rs.split(data[:,1:],data[:,0]):

        trainingData = data[train_index]
        testingData = data[test_index]
        print "number of training samples %d" % trainingData.shape[0]
        #tmpResult = p.apply_async(singleExp, (xDimensions,trainingData,testingData,topK,isLinearSVM));
        #cprResult += tmpResult.get();
        mostSamplesPerDataOwner = trainingData.shape[0] // 2
        xSamples = np.arange(
            2, mostSamplesPerDataOwner,
            max(mostSamplesPerDataOwner // numOfPointsinXAxis, 1))
        print "number of samples be tested: %s" % xSamples
        tmpResult = singleExp(xSamples, trainingData, testingData,
                              largestReducedFeature, epsilon, isLinearSVM)
        if cprResult is None:
            cprResult = tmpResult
        else:
            cprResult = np.concatenate((cprResult, tmpResult), axis=0)

    for result in cprResult:
        print(','.join(['%.3f' % num for num in result]))

    return cprResult
Example #20
def bootstrap_runner(run_name):
    selected_efps = pd.read_csv(
        path / "results" / run_name / "selected_efps.csv"
    )
    selected_efps = selected_efps.efp.tolist()[1:]
    X, y = grab_and_mix_data(selected_efps)

    n = len(y)
    n_train = int(0.85 * n)
    n_test = int(0.15 * n)
    rs = ShuffleSplit(n_splits=n_splits, random_state=0, test_size=0.15)
    rs.get_n_splits(X)

    straps = []
    aucs = []
    bs_count = 0
    for train_index, test_index in rs.split(X):
        X_train = X[train_index]
        y_train = y[train_index]
        X_val = X[test_index]
        y_val = y[test_index]
        model_file = f"{bs_model_dir}/bs-{bs_count}.h5"
        if not os.path.isfile(model_file):
            model = nn(
                X_train=X_train,
                y_train=y_train,
                X_val=X_val,
                y_val=y_val,
                epochs=epochs,
                batch_size=batch_size,
                layers=layers,
                nodes=nodes,
                model_file=model_file,
                verbose=0,
            )
        else:
            model = tf.keras.models.load_model(model_file)

        auc_val = roc_auc_score(y_val, np.hstack(model.predict(X_val)))
        # print(f"    test-set AUC: {auc_val:.5}")
        straps.append(bs_count)
        aucs.append(auc_val)
        results = pd.DataFrame({"bs": straps, "auc": aucs})
        results.to_csv(path / "results" / run_name / "bootstrap_results.csv")
        bs_count += 1
        auc_mean = np.average(aucs)
        auc_std = np.std(aucs)
        print(f"AUC = {auc_mean:.5f} +/- {auc_std:.5f}")
Example #21
def ModelComplexity(X, y):
    """ Calculates the performance of the model as model complexity increases.
        The learning and testing errors rates are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    cv.get_n_splits(X)

    # Vary the max_depth parameter from 1 to 10
    max_depth = np.arange(1, 11)

    # Calculate the training and testing scores
    train_scores, test_scores = validation_curve(DecisionTreeRegressor(),
                                                 X,
                                                 y,
                                                 param_name="max_depth",
                                                 param_range=max_depth,
                                                 cv=cv,
                                                 scoring='r2')

    # Find the mean and standard deviation for smoothing
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the validation curve
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, train_mean, 'o-', color='r', label='Training Score')
    pl.plot(max_depth, test_mean, 'o-', color='g', label='Validation Score')
    pl.fill_between(max_depth,
                    train_mean - train_std,
                    train_mean + train_std,
                    alpha=0.15,
                    color='r')
    pl.fill_between(max_depth,
                    test_mean - test_std,
                    test_mean + test_std,
                    alpha=0.15,
                    color='g')

    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()
Example #22
def doExp(datasetPath, varianceRatio, numOfRounds):
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")

    rs = ShuffleSplit(n_splits=numOfRounds, test_size=2, random_state=0)
    rs.get_n_splits(data)
    globalPCA = PCAImpl(data[:, 1:])
    numOfFeature = data.shape[1] - 1
    matrixRank = LA.matrix_rank(data[:, 1:])

    print("Matrix rank of the data is %d." % matrixRank)
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(varianceRatio)
    print("%d/%d dimensions capture %.2f variance." % (largestReducedFeature, numOfFeature, varianceRatio))

    xEpsilons = np.arange(0.1, 1.1, 0.1)
    # print xDimensions;
    # p = Pool(numOfRounds);
    # allResults = [];
    cprResult = []
    m = 0
    for train_index, test_index in rs.split(data):
        print("Trial %d" % m)
        trainingData = data[train_index]
        pureTrainingData = trainingData[:, 1:]
        tmpResult = singleExp(xEpsilons, pureTrainingData, largestReducedFeature)
        cprResult.extend(tmpResult)
        m += 1
        # print tmpResult.shape;
        # print tmpResult;
        # tmpResult = p.apply_async(singleExp, (xEpsilons,pureTrainingData,largestReducedFeature));
        # cprResult += tmpResult.get();
    """
        for i in range(0,len(cprResult)):
            print "%.4f,%.4f,%.4f" % (cprResult[i][0],cprResult[i][1],cprResult[i][2]);
        print "******************************";
    """
    # Compute the average value after numOfRounds experiments.
    # avgCprResult = cprResult/numOfRounds;
    # p.close();
    # p.join();
    for result in cprResult:
        print(','.join(['%.3f' % num for num in result]))

    return np.asarray(cprResult, dtype=float)
Example #23
def run_SVM(x, y):
    '''
    run cross validated SVM regression
    :param x: feature vectors
    :param y: labels
    :return: None
    '''
    print('SVM: ')
    rs = ShuffleSplit(n_splits=5, test_size=.20)
    rs.get_n_splits(x)
    split = 0
    for train_index, test_index in rs.split(x):
        print "split", split
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        train_svm(x_train, x_test, y_train, y_test)
        split += 1
Example #24
def trainTestSplit(x_data, y_data):
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    rs = ShuffleSplit(n_splits=1,
                      train_size=0.7,
                      test_size=0.3,
                      random_state=0)
    rs.get_n_splits(x_data)

    for train_index, test_index in rs.split(x_data, y_data):

        X_train, X_test = x_data[train_index], x_data[test_index]
        y_train, y_test = y_data[train_index], y_data[test_index]

    return X_train, y_train, X_test, y_test
Example #25
def run_LR(x, y):
    '''
    run cross validated logistic regression
    :param x: feature vectors
    :param y: labels
    :return: None
    '''
    rs = ShuffleSplit(n_splits=5, test_size=.20)
    rs.get_n_splits(x)
    print('Logistic Regression: ')
    split = 0
    for train_index, test_index in rs.split(x):
        print "split", split
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        acc, per, recall = run_logreg(x_train, x_test, y_train, y_test)
        split += 1
    return
Example #26
def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    # TODO: First get the train indices and test indices for each iteration
    # Then train the classifier accordingly
    # Report the mean accuracy and mean auc of all the iterations
    sskf = ShuffleSplit(n_splits=iterNo, test_size=test_percent, random_state=RANDOM_STATE)
    sskf.get_n_splits(X)
    accuracies = []
    aucs = []

    for train_index, test_index in sskf.split(X):
        Y_pred = models_partc.logistic_regression_pred(X[train_index], Y[train_index], X[test_index])
        # Y_pred = my_model.my_classifier_predictions(X[train_index],Y[train_index],X[test_index])
        accuracy = accuracy_score(Y[test_index], Y_pred)
        auc_score = roc_auc_score(Y[test_index], Y_pred)
        accuracies.append(accuracy)
        aucs.append(auc_score)

    return np.mean(accuracies), np.mean(aucs)
Example #27
def fit_model(X, y):
    rs = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
    rs.get_n_splits(X)
    classifier = DecisionTreeClassifier(random_state=0)
    params = {"max_depth": range(1, 11)}
    scoring_fnc = make_scorer(performance_metric)
    # Pass the ShuffleSplit object itself as the CV iterator (get_n_splits only returns the count).
    grid = GridSearchCV(classifier, param_grid=params, scoring=scoring_fnc, cv=rs)
    grid = grid.fit(X, y)
    print(pd.DataFrame(grid.cv_results_))
    return grid.best_estimator_
Example #28
def step_first_train(classfiers_dict):

    ## Split the data into a single train/test split
    X, y = load_training_data()

    rs = ShuffleSplit(n_splits=1, test_size=.1, random_state=0)
    rs.get_n_splits(X)
    x_train = []
    y_train = []
    X_test = []
    y_test = []
    for train_index, test_index in rs.split(X):
        for i in train_index:
            x_train.append(X[i])
            y_train.append(y[i])
        for i in test_index:
            X_test.append(X[i])
            y_test.append(y[i])

    step1_X = x_train
    step1_y = y_train

    ## Predict a probability for each class label
    new_X = []
    for i in range(len(step1_X)):
        pro_X = [0] * 24
        for label in labelSpace_dict:
            label_index = labelSpace_dict[label]
            pro_total = 0.0
            logre_classifier = classfiers_dict[label]
            for t in range(len(logre_classifier)):
                clf = logre_classifier[t]
                # print step1_X[i]
                pro = clf.predict_proba([step1_X[i]])
                # print pro[0][1]
                pro_total += pro[0][1]
            pro_total = pro_total / len(logre_classifier)
            pro_X[label_index] = pro_total
        # print pro_X
        new_X.append(pro_X)

    return new_X, step1_y
Example #29
def test():
    filename = input("Enter the path of the text file containing the data: ")
    X, y = fileload(filename)
    X = norm(X)
    rs = ShuffleSplit(n_splits=3, test_size=.3, random_state=0)  # split into training and test sets
    rs.get_n_splits(X)
    for train_index, test_index in rs.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    group, labels = X_train, y_train
    m = np.shape(X_test)[0]
    result = []
    for i in range(m):
        result.append(classify0(X_test[i], group, labels, 50))
    error = 0
    print("Misclassified samples:")
    for i in range(m):
        if result[i] != y_test[i]:
            error += 1
            print(X_test[i])
    print("Error rate:")
    return error / m
Example #30
def load_BMET_data(for_seq2emo=True, load_split=False):
    EMOS = ['anger', 'fear', 'joy', 'sadness', 'surprise', 'thankfulness']
    EMOS_DIC = {}
    for idx, emo in enumerate(EMOS):
        EMOS_DIC[emo] = idx
    # data_pata = 'data/EmoSet_RemoveDup_GloveProcess_OneEmo.csv'
    data_pata = 'data/BMETv0.3.csv'
    df_data = pd.read_csv(data_pata)

    # extract the subset which only contains the full sentences.
    source = []
    target = []
    for index, row in df_data.iterrows():
        next_token = str(row['text']).strip().split()
        if len(next_token) > MAX_LEN_DATA:
            next_token = next_token[:MAX_LEN_DATA]
        source.append(' '.join(next_token))
        if for_seq2emo:
            a_target = [0, 2, 4, 6, 8, 10]
            label = row['label'].split()
            for emo in label:
                a_target[EMOS_DIC[emo]] = EMOS_DIC[emo] * 2 + 1
        else:
            a_target = [0] * len(EMOS)
            label = row['label'].split()
            for emo in label:
                a_target[EMOS_DIC[emo]] = 1
        target.append(a_target)
    if not load_split:
        return source, target, EMOS, EMOS_DIC, 'BMETv0.3'
    else:
        from sklearn.model_selection import ShuffleSplit
        X, y = source, target
        ss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=999)
        ss.get_n_splits(X, y)
        train_index, test_index = next(ss.split(y))
        X_train_dev, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train_dev, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
    return X_train_dev, y_train_dev, X_test, y_test, EMOS, EMOS_DIC, 'BMETv0.3'
from sklearn.model_selection import ShuffleSplit
import numpy as np


X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)
print(rs)
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
                  random_state=0)

for train_index, test_index in rs.split(X):
    print("# TRAIN:", train_index, "TEST:", test_index)