Example #1
def doExp(datasetPath, targetEpsilon, numOfRounds):
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")
    rs = ShuffleSplit(n_splits=numOfRounds, test_size=1, random_state=0)
    rs.get_n_splits(data)
    print("Samples: %d, Features: %d" % (data.shape[0], data.shape[1] - 1))

    # p = Pool(numOfRounds);
    cprResult = []
    m = 0
    for train_index, test_index in rs.split(data):
        print("Trial %d" % m)
        trainingData = data[train_index, 1:]
        tmpResult = singleExp(trainingData, targetEpsilon)
        cprResult.extend(tmpResult)
        m += 1
        # tmpResult = p.apply_async(singleExp, (xEpsilons,pureTrainingData,largestReducedFeature));
        # cprResult += tmpResult.get();

    # Compute the average value after numOfRounds experiments.
    # p.close();
    # p.join();

    return cprResult
Example #2
def get_features_importance(X, y, variables):
    """
Run random forest on data, multiple times. Look for loss of f1_score for each variable.
Return sorted list of most important variables.
    :param X: train array
    :param y: target array
    :param variables: list of variables to look at, in right order. See ComputeFeatures.handled_variables.
    :return: dictionary with list of f1_score for each variable.
    """

    sp = ShuffleSplit(n_splits=5, test_size=.2)
    sp.get_n_splits(X)

    scores = defaultdict(list)

    names = variables

    for train_idx, test_idx in sp.split(X):
        x_train, x_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model = DecisionTreeClassifier()
        model.fit(x_train, y_train)
        acc = f1_score(y_test, model.predict(x_test))
        for i in range(X.shape[1]):
            X_t = x_test.copy()
            np.random.shuffle(X_t[:, i])
            shuff_acc = f1_score(y_test, model.predict(X_t))
            scores[names[i]].append((acc - shuff_acc) / acc)
    print("Features sorted by their score:")
    print(
        sorted([(np.round(np.mean(score), 4), feat)
                for feat, score in scores.items()],
               reverse=True))
    return scores
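A minimal usage sketch (the synthetic data and names below are assumptions for illustration, not part of the original repository):

# Hypothetical call: build a small synthetic dataset and inspect the permutation scores.
import numpy as np
from collections import defaultdict
from sklearn.datasets import make_classification
from sklearn.model_selection import ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

X_demo, y_demo = make_classification(n_samples=200, n_features=4, random_state=0)
demo_names = ['f0', 'f1', 'f2', 'f3']   # stand-in for ComputeFeatures.handled_variables
demo_scores = get_features_importance(X_demo, y_demo, demo_names)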
Example #3
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """

    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
    cv_sets.get_n_splits(X)
    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)

    # TODO: Create the grid search cv object --> GridSearchCV()
    # Make sure to include the right parameters in the object:
    # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
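fit_model relies on a performance_metric helper that is not shown here; a plausible R²-based definition and a hypothetical call, purely as a sketch:

# Assumed helper (not in the original snippet): score predictions with R^2.
import numpy as np
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """Return the R^2 score between true and predicted values."""
    return r2_score(y_true, y_predict)

# Hypothetical call on random data, just to show the expected input shapes.
X_demo = np.random.rand(100, 3)
y_demo = np.random.rand(100)
best_reg = fit_model(X_demo, y_demo)
print(best_reg.get_params()['max_depth'])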
Example #4
def split_data_train_val_test(px_fol):

    patients = os.listdir(px_fol)
    patients = np.asarray(patients)

    ss = ShuffleSplit(n_splits=1, test_size=0.20)
    ss.get_n_splits(patients)
    for train_index, test_index in ss.split(patients):
        xt, x_test = patients[train_index], patients[test_index]

    ss = ShuffleSplit(n_splits=1, test_size=0.20)
    ss.get_n_splits(xt)
    for ten_index, val_index in ss.split(xt):
        x_train_in, x_val_in = xt[ten_index], xt[val_index]

    px_splits = {
        'train': np.ndarray.tolist(x_train_in),
        'val': np.ndarray.tolist(x_val_in),
        'test': np.ndarray.tolist(x_test)
    }

    return px_splits


#%%
Example #5
def train_model(label):

    label_index = labelSpace_dict[label]
    X, y = load_training_data()

    ## Split the data evenly into 10 shuffled folds, with 9/10 used for training; the predicted attribute is binary
    rs = ShuffleSplit(n_splits=10, test_size=.1, random_state=0)
    rs.get_n_splits(X)
    X_Fold = []
    y_Fold = []
    for train_index, test_index in rs.split(X):
        x_train = []
        y_train = []
        for i in train_index:
            x_train.append(X[i])
            if y[i] == label_index:
                y_train.append(1)
            else:
                y_train.append(0)
        X_Fold.append(x_train)
        y_Fold.append(y_train)

    logre_classifier = []
    ## Train 10 binary classification models
    for i in range(len(X_Fold)):
        classifier = LogisticRegression()
        classifier.fit(X_Fold[i], y_Fold[i])
        logre_classifier.append(classifier)
    # print logre_classifier.__len__()
    return logre_classifier
Example #6
def my_train_test_split(data_size, test_size=0.30):
    sss = ShuffleSplit(n_splits=1, test_size=test_size)
    X = np.reshape(np.random.rand(data_size * 2), (data_size, 2))
    y = np.random.randint(2, size=data_size)
    sss.get_n_splits(X, y)
    train_index, test_index = next(sss.split(X, y))
    return train_index, test_index
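For illustration, the function returns index arrays rather than the data itself; with the (assumed) sizes below:

# Hypothetical call: 10 random samples, 30% held out, giving 7 train and 3 test indices.
train_idx, test_idx = my_train_test_split(data_size=10, test_size=0.3)
print(len(train_idx), len(test_idx))  # 7 3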
Example #7
def make_donuts(n=4000, 
                noise=0.2, 
                factor=0.5, 
                test_size=0.92, 
                nneigh=5,
                mesh=False,
                mesh_step=0.02):
    X, y = datasets.make_circles(n_samples=n, noise=noise, factor=factor)
    adj = make_graph(X, nneigh)
    X = StandardScaler().fit_transform(X)
    sss = ShuffleSplit(n_splits=1, test_size=test_size)
    sss.get_n_splits(X, y)
    train_index, test_index = next(sss.split(X, y)) 
    mesh_X = None
    mesh_adj = None
    xx = None
    yy = None
    if mesh:
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step),
                             np.arange(y_min, y_max, mesh_step))
        mesh_X = np.c_[xx.ravel(), yy.ravel()]
        mesh_adj = make_graph(mesh_X, nneigh)  # Might take a long time
    mesh_pack = (mesh_adj, mesh_X, xx, yy)
    return adj, X, y, train_index, test_index, test_index, mesh_pack
Example #8
def generate_k_fold_cross_valid_idx(max_idx):
    """
    generate indicies for each of the fold
    :param max_idx: how many data you have for each class, colorectal have 625
    :return:
    """
    trains = []
    valids = []
    tests = []
    y = np.arange(max_idx)
    kf = KFold(n_splits=10)
    kf.get_n_splits(y)
    for train_index, test_index in kf.split(y):
        # print("TRAIN:", len(train_index), "TEST:", len(test_index))
        yval = np.arange(len(train_index))
        kf_val = ShuffleSplit(n_splits=1, test_size=0.15)
        kf_val.get_n_splits(yval)
        for train_idx, val_idx in kf_val.split(yval):
            final_train = train_index[train_idx]
            final_val = train_index[val_idx]
            final_test = test_index
            final_train.sort()
            final_val.sort()
            # print("TRAIN:", final_train, "VALID", final_val, "TEST:", final_test)
            trains.append(final_train)
            valids.append(final_val)
            tests.append(final_test)

    return trains, valids, tests
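A quick sanity check of the returned index lists (hypothetical call, not from the original source):

# With max_idx=625 and 10 outer folds, each fold's train/val/test indices partition all 625 samples.
trains, valids, tests = generate_k_fold_cross_valid_idx(625)
print(len(trains), len(valids), len(tests))             # 10 10 10
print(len(trains[0]) + len(valids[0]) + len(tests[0]))  # 625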
Example #9
def shuffle(path):
    """
    打乱array
    :param path: Where U put data in the dir
    :return:
    """
    X = np.loadtxt(path)
    y = X[:, -1].astype(np.int)
    X = X[:, :-1]
    rs = ShuffleSplit(n_splits=1, test_size=.25, random_state=0)
    rs.get_n_splits(X)
    # print(rs)
    for train_index, test_index in rs.split(X, y):
        # print("Train Index:", train_index, ",Test Index:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # print(X_train,X_test,y_train,y_test)
    print("==============================")
    print("Making dataset")
    # rs = ShuffleSplit(n_splits=3, train_size=.5, test_size=.25, random_state=0)
    np.savetxt(path + '_X_train', X_train, fmt='%d')
    print(path + '_X_train')
    np.savetxt(path + '_Y_train', y_train, fmt='%d')
    print(path + '_Y_train')
    np.savetxt(path + '_X_test', X_test, fmt='%d')
    print(path + '_X_test')
    np.savetxt(path + '_Y_test', y_test, fmt='%d')
    print(path + '_Y_test')
    # return X_train, X_test, y_train, y_test
    print("==============================")
    print('FINISHED !')
Example #10
def doExp(datasetPath,
          epsilon,
          varianceRatio,
          numOfRounds,
          numOfDimensions,
          numOfSamples,
          isLinearSVM=True):
    data = np.loadtxt(datasetPath, delimiter=",")
    globalPCA = PCAModule.PCAImpl(data[:, 1:])
    numOfFeature = data.shape[1] - 1
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)
    print "%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio)
    xDimensions = None

    if numOfDimensions > numOfFeature:
        xDimensions = np.arange(1, numOfFeature)
        topK = numOfFeature
    else:
        xDimensions = np.arange(
            1, largestReducedFeature,
            max(largestReducedFeature // numOfDimensions, 1))
        topK = largestReducedFeature
    #cprResult = np.zeros((len(xDimensions),4));
    cprResult = None
    rs = ShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0)
    rs.get_n_splits(data)

    #p = Pool(numOfRounds);
    normalizedData = gf.normByRow(data[:, 1:])

    normalizedData = np.concatenate((data[:, [0]], normalizedData), axis=1)
    for train_index, test_index in rs.split(data):

        trainingData = normalizedData[train_index]
        testingData = normalizedData[test_index]
        #tmpResult = p.apply_async(singleExp, (xDimensions,trainingData,testingData,topK,isLinearSVM));
        #cprResult += tmpResult.get();
        tmpResult = singleExp(xDimensions, trainingData, testingData, topK,
                              isLinearSVM)
        if cprResult is None:
            cprResult = tmpResult
        else:
            cprResult = np.concatenate((cprResult, tmpResult), axis=0)
        """
        for i in range(0,len(cprResult)):
            print ','.join(['%.3f' % num for num in cprResult[i]]);
        """
    #avgResult = cprResult/numOfRounds;
    avgResult = cprResult
    #p.close();
    #p.join();
    for result in avgResult:
        print(','.join(['%.3f' % num for num in result]))

    return avgResult
Example #11
def ModelLearning(X, y):
    """ Calculates the performance of several models with varying sizes of training data.
        The learning and testing scores for each model are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    cv.get_n_splits(X)

    # Generate the training set sizes increasing by 50
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)

    # Create the figure window
    fig = pl.figure(figsize=(10, 7))

    # Create three different models based on max_depth
    for k, depth in enumerate([1, 3, 6, 10]):

        # Create a Decision tree regressor at max_depth = depth
        regressor = DecisionTreeRegressor(max_depth=depth)

        # Calculate the training and testing scores
        sizes, train_scores, test_scores = learning_curve(
            regressor, X, y, cv=cv, train_sizes=train_sizes, scoring='r2')

        # Find the mean and standard deviation for smoothing
        train_std = np.std(train_scores, axis=1)
        train_mean = np.mean(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)

        # Subplot the learning curve
        ax = fig.add_subplot(2, 2, k + 1)
        ax.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
        ax.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')
        ax.fill_between(sizes,
                        train_mean - train_std,
                        train_mean + train_std,
                        alpha=0.15,
                        color='r')
        ax.fill_between(sizes,
                        test_mean - test_std,
                        test_mean + test_std,
                        alpha=0.15,
                        color='g')

        # Labels
        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0] * 0.8])
        ax.set_ylim([-0.05, 1.05])

    # Visual aesthetics
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad=0.)
    fig.suptitle('Decision Tree Regressor Learning Performances',
                 fontsize=16,
                 y=1.03)
    fig.tight_layout()
    fig.show()
Example #12
def main():
    if not LOAD_TEST_SPLIT:
        global X, y
    else:
        global X_train_dev, X_test, y_train_dev, y_test

    from sklearn.model_selection import ShuffleSplit, KFold
    if not LOAD_TEST_SPLIT:
        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
        ss.get_n_splits(X, y)
        train_index, test_index = next(ss.split(y))
        X_train_dev, X_test = [X[i] for i in train_index
                               ], [X[i] for i in test_index]
        y_train_dev, y_test = [y[i] for i in train_index
                               ], [y[i] for i in test_index]

    kf = KFold(n_splits=NUM_FOLD, shuffle=True, random_state=0)

    gold_list = None
    # all_preds = []
    for i, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', i + 1)
        X_train, X_dev = [X_train_dev[i] for i in train_index
                          ], [X_train_dev[i] for i in dev_index]
        y_train, y_dev = [y_train_dev[i] for i in train_index
                          ], [y_train_dev[i] for i in dev_index]

        gold_list, pred_list = train(X_train, y_train, X_dev, y_dev, X_test,
                                     y_test)
        # all_preds.append(pred_list)
        break

    # all_preds = np.stack(all_preds, axis=0)

    # shape = all_preds[0].shape
    # mj = np.zeros(shape)
    # for m in range(shape[0]):
    #     for n in range(shape[1]):
    #         mj[m, n] = find_majority(np.asarray(all_preds[:, m, n]).reshape((-1)))[0]
    final_pred = pred_list

    logger('Final test by majority voting:')
    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_multi_metrics(gold_list, final_pred)
    logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    metric = get_single_metrics(gold_list, final_pred)
    logger('Single only: h_loss:', metric[0], 'macro F', metric[1], 'micro F',
           metric[4])
    logger('Jaccard:', jaccard_score(gold_list, final_pred))
    logger('Bert Binary', args)

    if args.output_path is not None:
        with open(args.output_path, 'bw') as _f:
            pkl.dump(final_pred, _f)
Example #13
def ShuffleData_ecg_2(X, y):
    rs = ShuffleSplit(n_splits=30, test_size=0.25, random_state=42)
    rs.get_n_splits(X)
    for train_index, test_index in rs.split(X):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    return X_train, X_test, y_train, y_test
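Note that with n_splits=30 the loop simply overwrites X_train/X_test on each iteration, so only the last shuffle is returned. If a single split is all that is needed, one split (or next()) is enough; a minimal sketch:

# Sketch: take a single 75/25 shuffle instead of looping over 30 of them.
rs = ShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_index, test_index = next(rs.split(X))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]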
Example #14
def rf_allMix(X, y, path, title, n_estimators, max_depth):
    # Create a random forest classifier
    acc_app = []
    precision_app = []
    recall_app = []
    f1_score_app = []
    mean_absolut = []

    y_testing = []

    y_prediction = []
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_features=len(X[0]),
                                 n_jobs=-1,
                                 max_depth=max_depth)

    rs = ShuffleSplit(n_splits=10, test_size=0.20, random_state=42)
    rs.get_n_splits(X)
    for train_index, test_index in rs.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        # Train the model using the training sets y_pred=clf.predict(X_test)
        clf.fit(X_train, y_train)
        # prediction on test set
        y_pred = clf.predict(X_test)
        y_prediction.extend(y_pred)
        y_testing.extend(y_test)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        acc_app.append(accuracy)

        precision = metrics.precision_score(y_test, y_pred, average='micro')
        precision_app.append(precision)

        recall = metrics.recall_score(y_test, y_pred, average='micro')
        recall_app.append(recall)

        f1_score = metrics.f1_score(y_test, y_pred, average='micro')
        f1_score_app.append(f1_score)

        mean_absolut_error = metrics.mean_absolute_error(y_test, y_pred)
        mean_absolut.append(mean_absolut_error)

        performance_every_shuffler_allmix(y_pred, y_test, accuracy, precision,
                                          recall, f1_score, mean_absolut_error,
                                          path, title)

    ##################################################
    performance_global_shuffle_allmix(y_prediction, y_testing, acc_app,
                                      precision_app, recall_app, f1_score_app,
                                      mean_absolut, path, title)
Example #15
def main():
    if not LOAD_TEST_SPLIT:
        global X, y
        ALL_TRAINING = X
    else:
        global X_train_dev, X_test, y_train_dev, y_test
        ALL_TRAINING = X_train_dev + X_test
    glove_tokenizer.build_tokenizer(ALL_TRAINING, vocab_size=VOCAB_SIZE)
    glove_tokenizer.build_embedding(GLOVE_EMB_PATH, dataset_name=data_set_name)

    from sklearn.model_selection import ShuffleSplit, KFold

    if not LOAD_TEST_SPLIT:
        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
        ss.get_n_splits(X, y)
        train_index, test_index = next(ss.split(y))
        X_train_dev, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train_dev, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

    kf = KFold(n_splits=args.folds, shuffle=True, random_state=args.dev_split_seed)
    # kf.get_n_splits(X_train_dev)

    all_preds = []
    gold_list = None

    for i, (train_index, dev_index) in enumerate(kf.split(y_train_dev)):
        logger('STARTING Fold -----------', i + 1)
        X_train, X_dev = [X_train_dev[i] for i in train_index], [X_train_dev[i] for i in dev_index]
        y_train, y_dev = [y_train_dev[i] for i in train_index], [y_train_dev[i] for i in dev_index]

        gold_list, pred_list, model = train(X_train, y_train, X_dev, y_dev, X_test, y_test)
        all_preds.append(pred_list)
        #torch.save(model.state_dict(), 'saved_model/emotion_classifier' + str(i+1) + '.pt')
        #break
    all_preds = np.stack(all_preds, axis=0)
    shape = all_preds[0].shape
    mj = np.zeros(shape[0])
    for m in range(shape[0]):
        mj[m] = find_majority(np.asarray(all_preds[:, m]).reshape((-1)))[0]

    final_pred = mj

    print('TEST---------: ')
    show_classification_report(gold_list, final_pred)
    metric = get_metrics(gold_list, final_pred)
    logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # metric = get_multi_metrics(gold_list, final_pred)
    # logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # metric = get_single_metrics(gold_list, final_pred)
    # logger('Single only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])

    # logger('Final Jaccard:', jaccard_score(gold_list, final_pred))
    logger(os.path.basename(__file__))
    logger(args)
Example #16
def multiTrain(data, label, test_data):

    best_train_acc = 0
    best_val_acc = 0
    result_list = []
    train_acc_list = []
    val_acc_list = []
    best_epoch = None
    final_result = []
    splitstate = ShuffleSplit(n_splits=18, test_size=.20)
    splitstate.get_n_splits(data, label)
    epoch = 0
    time_str = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime(time.time()))
    for train_index, val_index in splitstate.split(data, label):
        print("epoch: ", epoch + 1)
        epoch += 1
        print("TRAIN:", train_index, "TEST: ", val_index)
        sub_train_data = np.array([data[i] for i in train_index])
        sub_train_label = np.array([label[i] for i in train_index])
        sub_val_data = np.array([data[i] for i in val_index])
        sub_val_label = np.array([label[i] for i in val_index])
        train_acc, val_acc, test_prediction = xgbmodelc(
            sub_train_data, sub_train_label, sub_val_data, sub_val_label,
            test_data)
        if val_acc > best_val_acc:
            print("find a better val_acc: " + str(best_val_acc) + " -> " +
                  str(val_acc))
            best_train_acc = train_acc
            best_val_acc = val_acc
            best_epoch = epoch
        result_list.append(test_prediction)
        train_acc_list.append(train_acc)
        val_acc_list.append(val_acc)

    print("best_epoch: {}".format(epoch))
    best_result = result_list[epoch - 1]
    best_result_save_name = "./best/best_result_" + time_str + ".csv"
    save_result(best_result_save_name, best_result)
    print(best_val_acc)
    print(train_acc_list)
    result_list = np.array(result_list)
    for i in range(17):
        counts = np.bincount(result_list[:, i])
        index = np.argmax(counts)
        final_result.append(index)
    final_result = np.array(final_result)
    one_hots = to_categorical(final_result).astype(np.int32)
    csvfile = open('rank_result_1.csv', 'w', newline='')
    writer = csv.writer(csvfile)
    for i in one_hots:
        writer.writerow(i)
    csvfile.close()
    print("预测完毕!")
Example #17
def tts_split(X, y, size, splits):
    '''Split the data into train and test sets using ShuffleSplit.'''

    rs = ShuffleSplit(n_splits=splits, test_size=size)

    rs.get_n_splits(X)

    for train_index, test_index in rs.split(X, y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    return X_train, X_test, y_train, y_test
Example #18
def plot_learning_performance(regressor, X, y):
    """
    Draw a graph that visualizes the learning curves of the model for both
     training and testing as the size of the training set is increased. 
     
     Note that the shaded region of a learning curve denotes the uncertainty 
     of that curve (measured as the standard deviation). 

     The model is scored on both the training and testing sets using R2, the coefficient of determination.
    """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    cv.get_n_splits(X)

    # Generate the training set sizes increasing by 50
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)

    # Calculate the training and testing scores
    sizes, train_scores, test_scores = learning_curve(regressor, X, y, \
        cv = cv, train_sizes = train_sizes, scoring = 'r2')

    # Find the mean and standard deviation for smoothing
    train_std = np.std(train_scores, axis=1)
    train_mean = np.mean(train_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    from matplotlib.pyplot import figure
    figure(num=None, figsize=(8, 5), dpi=80, facecolor='w', edgecolor='k')

    plt.title('')
    plt.xlabel('Number of Training Points')
    plt.ylabel('r2 score')
    plt.xlim([0, X.shape[0] * 0.8])
    plt.ylim([-0.05, 1.05])

    plt.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
    plt.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')
    plt.fill_between(sizes, train_mean - train_std, \
        train_mean + train_std, alpha = 0.15, color = 'r')
    plt.fill_between(sizes, test_mean - test_std, \
        test_mean + test_std, alpha = 0.15, color = 'g')

    # Visual aesthetics
    plt.legend(bbox_to_anchor=(0.4, 1.3), loc='lower left', borderaxespad=0.)

    plt.suptitle(type(regressor).__name__ + ' Learning Performances',
                 fontsize=16,
                 y=1.03)
    plt.show()
Example #19
def doExp(datasetPath,
          epsilon,
          varianceRatio,
          numOfRounds,
          numOfPointsinXAxis,
          isLinearSVM=True):
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")
    numOfFeature = data.shape[1] - 1
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data[:, 1:])
    globalPCA = PCAImpl(data_std)
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)

    print "%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio)
    cprResult = None

    #rs = StratifiedShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0);
    #rs.get_n_splits(data[:,1:],data[:,0]);
    rs = ShuffleSplit(n_splits=numOfRounds, test_size=.2, random_state=0)
    rs.get_n_splits(data)
    for train_index, test_index in rs.split(data):
        #for train_index, test_index in rs.split(data[:,1:],data[:,0]):

        trainingData = data[train_index]
        testingData = data[test_index]
        print "number of training samples %d" % trainingData.shape[0]
        #tmpResult = p.apply_async(singleExp, (xDimensions,trainingData,testingData,topK,isLinearSVM));
        #cprResult += tmpResult.get();
        mostSamplesPerDataOwner = trainingData.shape[0] // 2
        xSamples = np.arange(
            2, mostSamplesPerDataOwner,
            max(mostSamplesPerDataOwner // numOfPointsinXAxis, 1))
        print "number of samples be tested: %s" % xSamples
        tmpResult = singleExp(xSamples, trainingData, testingData,
                              largestReducedFeature, epsilon, isLinearSVM)
        if cprResult is None:
            cprResult = tmpResult
        else:
            cprResult = np.concatenate((cprResult, tmpResult), axis=0)

    for result in cprResult:
        print(','.join(['%.3f' % num for num in result]))

    return cprResult
Example #20
def bootstrap_runner(run_name):
    selected_efps = pd.read_csv(
        path / "results" / run_name / "selected_efps.csv"
    )
    selected_efps = selected_efps.efp.tolist()[1:]
    X, y = grab_and_mix_data(selected_efps)

    n = len(y)
    n_train = int(0.85 * n)
    n_test = int(0.15 * n)
    rs = ShuffleSplit(n_splits=n_splits, random_state=0, test_size=0.15)
    rs.get_n_splits(X)

    straps = []
    aucs = []
    bs_count = 0
    for train_index, test_index in rs.split(X):
        X_train = X[train_index]
        y_train = y[train_index]
        X_val = X[test_index]
        y_val = y[test_index]
        model_file = f"{bs_model_dir}/bs-{bs_count}.h5"
        if not os.path.isfile(model_file):
            model = nn(
                X_train=X_train,
                y_train=y_train,
                X_val=X_val,
                y_val=y_val,
                epochs=epochs,
                batch_size=batch_size,
                layers=layers,
                nodes=nodes,
                model_file=model_file,
                verbose=0,
            )
        else:
            model = tf.keras.models.load_model(model_file)

        auc_val = roc_auc_score(y_val, np.hstack(model.predict(X_val)))
        # print(f"    test-set AUC: {auc_val:.5}")
        straps.append(bs_count)
        aucs.append(auc_val)
        results = pd.DataFrame({"bs": straps, "auc": aucs})
        results.to_csv(path / "results" / run_name / "bootstrap_results.csv")
        bs_count += 1
        auc_mean = np.average(aucs)
        auc_std = np.std(aucs)
        print(f"AUC = {auc_mean:.5f} +/- {auc_std:.5f}")
Example #21
def ModelComplexity(X, y):
    """ Calculates the performance of the model as model complexity increases.
        The learning and testing errors rates are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    cv.get_n_splits(X)

    # Vary the max_depth parameter from 1 to 10
    max_depth = np.arange(1, 11)

    # Calculate the training and testing scores
    train_scores, test_scores = validation_curve(DecisionTreeRegressor(),
                                                 X,
                                                 y,
                                                 param_name="max_depth",
                                                 param_range=max_depth,
                                                 cv=cv,
                                                 scoring='r2')

    # Find the mean and standard deviation for smoothing
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the validation curve
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, train_mean, 'o-', color='r', label='Training Score')
    pl.plot(max_depth, test_mean, 'o-', color='g', label='Validation Score')
    pl.fill_between(max_depth,
                    train_mean - train_std,
                    train_mean + train_std,
                    alpha=0.15,
                    color='r')
    pl.fill_between(max_depth,
                    test_mean - test_std,
                    test_mean + test_std,
                    alpha=0.15,
                    color='g')

    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()
Example #22
def doExp(datasetPath, varianceRatio, numOfRounds):
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        data = np.loadtxt(datasetPath, delimiter=",")

    rs = ShuffleSplit(n_splits=numOfRounds, test_size=2, random_state=0)
    rs.get_n_splits(data)
    globalPCA = PCAImpl(data[:, 1:])
    numOfFeature = data.shape[1] - 1
    matrixRank = LA.matrix_rank(data[:, 1:])

    print("Matrix rank of the data is %d." % matrixRank)
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(varianceRatio)
    print("%d/%d dimensions capture %.2f variance." % (largestReducedFeature, numOfFeature, varianceRatio))

    xEpsilons = np.arange(0.1, 1.1, 0.1)
    # print xDimensions;
    # p = Pool(numOfRounds);
    # allResults = [];
    cprResult = []
    m = 0
    for train_index, test_index in rs.split(data):
        print("Trial %d" % m)
        trainingData = data[train_index]
        pureTrainingData = trainingData[:, 1:]
        tmpResult = singleExp(xEpsilons, pureTrainingData, largestReducedFeature)
        cprResult.extend(tmpResult)
        m += 1
        # print tmpResult.shape;
        # print tmpResult;
        # tmpResult = p.apply_async(singleExp, (xEpsilons,pureTrainingData,largestReducedFeature));
        # cprResult += tmpResult.get();
    """
        for i in range(0,len(cprResult)):
            print "%.4f,%.4f,%.4f" % (cprResult[i][0],cprResult[i][1],cprResult[i][2]);
        print "******************************";
    """
    # Compute the average value after numOfRounds experiments.
    # avgCprResult = cprResult/numOfRounds;
    # p.close();
    # p.join();
    for result in cprResult:
        print(','.join(['%.3f' % num for num in result]))

    return np.asarray(cprResult, dtype=float)
Example #23
def run_SVM(x, y):
    '''
    run cross validated SVM regression
    :param x: feature vectors
    :param y: labels
    :return: None
    '''
    print('SVM: ')
    rs = ShuffleSplit(n_splits=5, test_size=.20)
    rs.get_n_splits(x)
    split = 0
    for train_index, test_index in rs.split(x):
        print "split", split
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        train_svm(x_train, x_test, y_train, y_test)
        split += 1
Example #24
def trainTestSplit(x_data, y_data):
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    rs = ShuffleSplit(n_splits=1,
                      train_size=0.7,
                      test_size=0.3,
                      random_state=0)
    rs.get_n_splits(x_data)

    for train_index, test_index in rs.split(x_data, y_data):

        X_train, X_test = x_data[train_index], x_data[test_index]
        y_train, y_test = y_data[train_index], y_data[test_index]

    return X_train, y_train, X_test, y_test
Example #25
def run_LR(x, y):
    '''
    run cross validated logistic regression
    :param x: feature vectors
    :param y: labels
    :return: None
    '''
    rs = ShuffleSplit(n_splits=5, test_size=.20)
    rs.get_n_splits(x)
    print('Logistic Regression: ')
    split = 0
    for train_index, test_index in rs.split(x):
        print "split", split
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        acc, per, recall = run_logreg(x_train, x_test, y_train, y_test)
        split += 1
    return
Example #26
def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    # TODO: First get the train indices and test indices for each iteration
    # Then train the classifier accordingly
    # Report the mean accuracy and mean auc of all the iterations
    sskf = ShuffleSplit(n_splits=iterNo, test_size=test_percent, random_state=RANDOM_STATE)
    sskf.get_n_splits(X)
    accuracies = []
    aucs = []

    for train_index, test_index in sskf.split(X):
        Y_pred = models_partc.logistic_regression_pred(X[train_index], Y[train_index], X[test_index])
        # Y_pred = my_model.my_classifier_predictions(X[train_index],Y[train_index],X[test_index])
        accuracy = accuracy_score(Y[test_index], Y_pred)
        auc_score = roc_auc_score(Y[test_index], Y_pred)
        accuracies.append(accuracy)
        aucs.append(auc_score)

    return np.mean(accuracies), np.mean(aucs)
Example #27
def fit_model(X, y):
    rs = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
    rs.get_n_splits(X)
    classifier = DecisionTreeClassifier(random_state=0)
    params = {"max_depth": range(1, 11)}
    scoring_fnc = make_scorer(performance_metric)
    # Pass the ShuffleSplit object itself as the CV iterator (get_n_splits only returns the count).
    grid = GridSearchCV(classifier, param_grid=params, scoring=scoring_fnc, cv=rs)
    grid = grid.fit(X, y)
    print(pd.DataFrame(grid.cv_results_))
    return grid.best_estimator_
Example #28
def step_first_train(classfiers_dict):

    ## Split the data into a single train/test split
    X, y = load_training_data()

    rs = ShuffleSplit(n_splits=1, test_size=.1, random_state=0)
    rs.get_n_splits(X)
    x_train = []
    y_train = []
    X_test = []
    y_test = []
    for train_index, test_index in rs.split(X):
        for i in train_index:
            x_train.append(X[i])
            y_train.append(y[i])
        for i in test_index:
            X_test.append(X[i])
            y_test.append(y[i])

    step1_X = x_train
    step1_y = y_train

    ## Predict a probability for each class label
    new_X = []
    for i in range(len(step1_X)):
        pro_X = [0] * 24
        for label in labelSpace_dict:
            label_index = labelSpace_dict[label]
            pro_total = 0.0
            logre_classifier = classfiers_dict[label]
            for t in range(len(logre_classifier)):
                clf = logre_classifier[t]
                # print step1_X[i]
                pro = clf.predict_proba([step1_X[i]])
                # print pro[0][1]
                pro_total += pro[0][1]
            pro_total = pro_total / len(logre_classifier)
            pro_X[label_index] = pro_total
        # print pro_X
        new_X.append(pro_X)

    return new_X, step1_y
Example #29
def test():
    filename = input("Enter the path of the text file containing the data: ")
    X, y = fileload(filename)
    X = norm(X)
    rs = ShuffleSplit(n_splits=3, test_size=.3, random_state=0)  # split into training and test sets
    rs.get_n_splits(X)
    for train_index, test_index in rs.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    group, labels = X_train, y_train
    m = np.shape(X_test)[0]
    result = []
    for i in range(m):
        result.append(classify0(X_test[i], group, labels, 50))
    error = 0
    print("Misclassified samples:")
    for i in range(m):
        if result[i] != y_test[i]:
            error += 1
            print(X_test[i])
    print("Error rate:")
    return error / m
Example #30
def load_BMET_data(for_seq2emo=True, load_split=False):
    EMOS = ['anger', 'fear', 'joy', 'sadness', 'surprise', 'thankfulness']
    EMOS_DIC = {}
    for idx, emo in enumerate(EMOS):
        EMOS_DIC[emo] = idx
    # data_pata = 'data/EmoSet_RemoveDup_GloveProcess_OneEmo.csv'
    data_pata = 'data/BMETv0.3.csv'
    df_data = pd.read_csv(data_pata)

    # extract the subset which only contains the full sentences.
    source = []
    target = []
    for index, row in df_data.iterrows():
        next_token = str(row['text']).strip().split()
        if len(next_token) > MAX_LEN_DATA:
            next_token = next_token[:MAX_LEN_DATA]
        source.append(' '.join(next_token))
        if for_seq2emo:
            a_target = [0, 2, 4, 6, 8, 10]
            label = row['label'].split()
            for emo in label:
                a_target[EMOS_DIC[emo]] = EMOS_DIC[emo] * 2 + 1
        else:
            a_target = [0] * len(EMOS)
            label = row['label'].split()
            for emo in label:
                a_target[EMOS_DIC[emo]] = 1
        target.append(a_target)
    if not load_split:
        return source, target, EMOS, EMOS_DIC, 'BMETv0.3'
    else:
        from sklearn.model_selection import ShuffleSplit
        X, y = source, target
        ss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=999)
        ss.get_n_splits(X, y)
        train_index, test_index = next(ss.split(y))
        X_train_dev, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train_dev, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
    return X_train_dev, y_train_dev, X_test, y_test, EMOS, EMOS_DIC, 'BMETv0.3'
from sklearn.model_selection import ShuffleSplit
import numpy as np


X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 1, 2])
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
rs.get_n_splits(X)
print(rs)
# ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
                  random_state=0)

for train_index, test_index in rs.split(X):
    print("# TRAIN:", train_index, "TEST:", test_index)