Example #1
 def test_figure_k_accuracy(self):
     """Plot cross-validated accuracy as a function of the feature count k."""
     # Load the data
     dataset = load_files('./test_file2')
     # Tokenize each document with jieba
     datasets = []
     for i in dataset.data:
         datasets.append(' '.join(jieba.cut(i)))  # training data
     cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
     ks = [
         1, 2, 3, 5, 10, 20, 30, 50, 100, 200, 300, 500, 1000, 2000, 3000,
         5000, 10000, 20000, 'all'
     ]
     accuracies = []
     for k in ks:
         classifier = bayesClassifier(MultinomialNB, k=k)
         clf = make_pipeline(classifier)
         accuracies.append(
             average(cross_val_score(clf, datasets, dataset.target, cv=cv)))
     fig, ax = plt.subplots()
     ax.scatter(range(len(ks)), accuracies)
     # Label the ticks with the actual k values rather than their indices
     ax.set_xticks(range(len(ks)))
     ax.set_xticklabels(ks, rotation=45)
     ax.set_xlabel('k')
     ax.set_ylabel('accuracy')
     plt.show()
     print('test_figure_k_accuracy done!')
     print('-' * 70)
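bayesClassifier is project-specific and not shown above; below is a minimal self-contained analogue of the same k-sweep, assuming k plays the role of a SelectKBest-style feature cutoff (an assumption, not the original implementation; the texts and labels are placeholders).

# A hedged sketch: SelectKBest(chi2, k=...) stands in for the project's
# bayesClassifier; texts/labels are placeholder data, not the original corpus.
from numpy import average
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

texts = ['good movie', 'bad movie', 'great film', 'awful film',
         'good plot', 'bad plot', 'great acting', 'awful acting']  # placeholders
labels = [1, 0, 1, 0, 1, 0, 1, 0]                                  # placeholders
cv = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
for k in [1, 2, 'all']:
    clf = make_pipeline(CountVectorizer(), SelectKBest(chi2, k=k), MultinomialNB())
    print(k, average(cross_val_score(clf, texts, labels, cv=cv)))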
Example #2
def processData(data):
    # Hold out the last 10% of rows for testing
    trainNum = int(len(data) * 0.9)
    datasets = []   # feature vectors (despite the name)
    features = []   # labels (despite the name)
    for row in data:
        datasets.append(row[1:])   # columns 1.. are the feature values
        features.append(row[0])    # column 0 is the label
    return (datasets[:trainNum], features[:trainNum],
            datasets[trainNum:], features[trainNum:])
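A minimal usage sketch for processData, assuming each row stores the label in column 0 followed by the feature values (the rows below are placeholders):

rows = [[0, 1.0, 2.0], [1, 3.0, 4.0], [0, 5.0, 6.0], [1, 7.0, 8.0],
        [0, 9.0, 1.5], [1, 2.5, 3.5], [0, 4.5, 5.5], [1, 6.5, 7.5],
        [0, 8.5, 9.5], [1, 0.5, 1.5]]  # placeholder label-first rows
X_train, y_train, X_test, y_test = processData(rows)
print(len(X_train), len(X_test))  # 9 1 -> a 90/10 split of 10 rows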
Example #4
 def test_pipeline_cross_val_score(self):
     """Cross-validate the Bayes pipeline on the tokenized corpus."""
     # Load the data
     dataset = load_files('./test_file2')
     # Tokenize each document with jieba
     datasets = []
     for i in dataset.data:
         datasets.append(' '.join(jieba.cut(i)))  # training data
     classifier = bayesClassifier(MultinomialNB, k=1000)
     cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
     clf = make_pipeline(classifier)
     print(cross_val_score(clf, datasets, dataset.target, cv=cv))
     print('test_pipeline_cross_val_score done!')
     print('-' * 70)
Example #5
def make_train_test_split(X, Y, im_files, explain_files, class_names, explain_interp):
    # train_test_split returns its outputs interleaved:
    # [X_train, X_test, Y_train, Y_test, im_files_train, im_files_test, ...]
    split_data = train_test_split(X, Y, im_files, explain_files, explain_interp,
                                  test_size=0.2, random_state=0)

    datasets = []
    for dd in range(2):  # dd == 0 -> train split, dd == 1 -> test split
        dataset = {}
        dataset['X'] = split_data[dd + 0]
        dataset['Y'] = split_data[dd + 2]
        dataset['im_files'] = split_data[dd + 4]
        dataset['explain_files'] = split_data[dd + 6]
        dataset['explain_interp'] = split_data[dd + 8]
        dataset['class_names'] = class_names
        datasets.append(dataset)

    return datasets[0], datasets[1]
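A usage sketch with placeholder arrays; scikit-learn's train_test_split accepts any number of equal-length sequences, which is what the interleaved indexing above relies on:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(10, 4)                            # placeholder features
Y = np.random.randint(0, 2, size=10)                 # placeholder labels
im_files = ['im_%d.png' % i for i in range(10)]      # placeholder paths
explain_files = ['ex_%d.npy' % i for i in range(10)] # placeholder paths
explain_interp = np.ones(10)                         # placeholder weights

train, test = make_train_test_split(X, Y, im_files, explain_files,
                                    ['neg', 'pos'], explain_interp)
print(train['X'].shape, test['X'].shape)  # (8, 4) (2, 4) with test_size=0.2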
Example #6
def load_adult(*, valid_split=0.25, path="adult.npz", device=None):
    data = np.load(path)

    datasets = []
    for suffix in ("train", "test"):
        # Column 13 is always zero in this dump, so drop it
        X = torch.FloatTensor(np.delete(data[f"x_{suffix}"], 13, 1))
        y = torch.FloatTensor(data[f"y_{suffix}"][:, 0])
        is_protected = torch.ByteTensor(
            (data[f"attr_{suffix}"][:, 0] > 0).astype(int))
        X = X.to(device)
        y = y.to(device)
        is_protected = is_protected.to(device)
        datasets.append(TensorDataset(X, y, is_protected))
    train, test = datasets
    # Carve a random validation subset out of the training split
    in_valid = torch.rand(len(train)) < valid_split
    train, valid = (TensorDataset(*train[~in_valid]),
                    TensorDataset(*train[in_valid]))
    return train, valid, test
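A usage sketch; adult.npz with x_/y_/attr_ train and test arrays is assumed to exist, and the DataLoader wrapping is standard PyTorch:

from torch.utils.data import DataLoader

train, valid, test = load_adult(path="adult.npz")  # assumes the file exists
train_loader = DataLoader(train, batch_size=64, shuffle=True)
for X, y, is_protected in train_loader:
    ...  # one training step per mini-batch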
Example #7
ap.add_argument("-ds", "--dataset", required=True, help="copy.txt")
ap.add_argument("-ts", "--testset", required=True, help="textcopy.txt")
ap.add_argument("-at",
                "--algotype",
                required=True,
                help="dtc/dtr/svm/gnb/erfc/bagc/model")
ap.add_argument("-m",
                "--model",
                required=False,
                help="dtc/dtr/svm/gnb/erfc/bagc")
args = vars(ap.parse_args())

lines_train = open(args["dataset"], 'r').readlines()
datasets = []
for n in lines_train:
    datasets.append(list(map(int, n.split(' '))))

lines_test = open(args["testset"], 'r').readlines()
testset = []
for m in lines_test:
    testset.append(list(map(int, m.split(' '))))

X = datasets[:-1]
target = datasets[-1]
test = testset[:-1]
test_target = testset[-1]

# X = np.c_[(0, 0, 0, 5, 5, 0, 0, 0, 2, 7, 5, 0, 1, 3, 4, 1, 0, 0, 0, 0),
# 		  (0, 0, 0, 5, 3, 0, 0, 0, 1, 13, 3, 0, 0, 2, 5, 1, 0, 0, 0, 0),
# 		  (0, 0, 0, 4, 5, 1, 0, 0, 3, 8, 3, 0, 0, 3, 3, 2, 0, 0, 1, 0),
# 		  (0, 1, 1, 0, 4, 0, 0, 0, 2, 11, 5, 0, 0, 4, 3, 1, 0, 1, 0, 0),
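A sketch of the assumed input layout (placeholder numbers; the real copy.txt is not part of the snippet):

# Three placeholder feature rows plus a final target row,
# matching the datasets[:-1] / datasets[-1] convention above.
with open('copy.txt', 'w') as f:
    f.write('0 1 0 1\n')
    f.write('1 1 0 0\n')
    f.write('0 0 1 1\n')
    f.write('0 1 1 0\n')  # final row -> target vector
# Then run: python script.py -ds copy.txt -ts copy.txt -at dtc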
Example #8
def test_correlation_examples(N=500):
    '''
    Correlation examples after beaucronin (https://gist.github.com/beaucronin/2509755),
    a Python translation of http://en.wikipedia.org/wiki/File:Correlation_examples2.svg
    ("An example of the correlation of x and y for various distributions of
    (x, y) pairs", by Denis Boigelot).

    Parameters
    ----------
    N : int
        The number of samples per dataset.
    '''

    from numpy.random import (
        uniform as runif,
        multivariate_normal as rmvn,
        normal as rnorm,
    )
    from numpy import inner, linspace, array, vstack
    from math import pi, cos, sin, pow, sqrt
    import matplotlib.pyplot as plt

    datasets = []
    MI_scores = []
    for corr in [1., .8, .4, 0., -.4, -.8, -1.]:
        x = rmvn([0., 0.], [[1., corr], [corr, 1.]], N)
        mi = MI_RenyiCC_Multi(x, type="c")
        mi = compute_normMI(x, type="c")  # the normalized score overwrites the raw one
        print("MI score of two continuous linear correlated (correlation degree of %f) gaussian variables : %f" % (corr, mi))
        MI_scores.append(mi)
        datasets.append(x)

    for phi in [0., pi/12., pi/6., pi/4., pi/2. - pi/6., pi/2. - pi/12, pi/2]:
        x = rmvn([0., 0.], [[1., 1.], [1., 1.]], N)
        x = rotate(phi, x)
        mi = MI_RenyiCC_Multi(x, type="c")
        mi = compute_normMI(x, type="c")
        print("MI score of two continuous linear correlated (correlation slope of %f) gaussian variables : %f" % (phi, mi))
        MI_scores.append(mi)
        datasets.append(x)

    a = linspace(-1, 1, N)
    x = array([(x0, 4. * pow(x0 * x0 - .5, 2.) + runif(-1./3., 1./3., 1))
        for x0 in a])
    mi = MI_RenyiCC_Multi(x, type="c")
    mi = compute_normMI(x, type="c")
    print("MI score of two continuous non-linear correlated (y=4*(x0^2-0.5)^2+c) variables : %f" %  mi)
    MI_scores.append(mi)
    datasets.append(x)

    x = rotate(-pi/8., array([(x0, runif(-1., 1.)) for x0 in a]))
    mi = MI_RenyiCC_Multi(x, type="c")
    mi = compute_normMI(x, type="c")
    print("MI score of two continuous non-linear correlated variables : %f" %  mi)
    MI_scores.append(mi)
    datasets.append(x)

    x = rotate(-pi/8, x)
    mi = MI_RenyiCC_Multi(x, type="c")
    mi = compute_normMI(x, type="c")
    print("MI score of two continuous non-linear correlated variables : %f" %  mi)
    MI_scores.append(mi)
    datasets.append(x)

    x = array([(x0, x0 * x0 + runif(-.5, .5)) for x0 in a])
    mi = MI_RenyiCC_Multi(x, type="c")
    mi = compute_normMI(x, type="c")
    print("MI score of two continuous non-linear correlated variables : %f" %  mi)
    MI_scores.append(mi)
    datasets.append(x)

    signs = [1. if runif() < .5 else -1. for _ in range(N)]
    x = array([(x0, (x0 * x0 + runif(0., .5)) * sign)
        for x0, sign in zip(a, signs)])
    mi = MI_RenyiCC_Multi(x, type="c")
    mi = compute_normMI(x, type="c")
    print("MI score of two continuous non-linear correlated variables : %f" %  mi)
    MI_scores.append(mi)
    datasets.append(x)

    x = array([(sin(x0 * pi) + rnorm(0., .125), cos(x0 * pi) + rnorm(0., .125))
        for x0 in a])
    mi = MI_RenyiCC_Multi(x, type="c")
    mi = compute_normMI(x, type="c")
    print("MI score of two continuous non-linear correlated variables : %f" %  mi)
    MI_scores.append(mi)
    datasets.append(x)

    x = vstack((
        rmvn([3., 3], [[1., 0.], [0., 1.]], round(N/4)),
        rmvn([-3., 3], [[1., 0.], [0., 1.]], round(N/4)),
        rmvn([-3., -3], [[1., 0.], [0., 1.]], round(N/4)),
        rmvn([3., -3], [[1., 0.], [0., 1.]], round(N/4))
        ))
    mi = MI_RenyiCC_Multi(x, type="c")
    mi = compute_normMI(x, type="c")
    print("MI score of two continuous non-linear correlated variables : %f" %  mi)
    MI_scores.append(mi)
    datasets.append(x)

    """
    Plot the datasets, mimicking the original plot from Wikipedia.
    """
    plt.figure()
    print("MI_scores:",MI_scores)
    for i in range(len(datasets)):
        plt.subplot(3, 7, i+1)
        x = [a[0] for a in datasets[i]]
        y = [a[1] for a in datasets[i]]
        plt.plot(x, y, '.', markersize=1.)
        plt.title("%.4f" %MI_scores[i])
        # plt.axis('scaled')
        plt.xticks([])
        plt.yticks([])
        ax = plt.gca()
        ax.set_axis_off()
        if i == 14:
            plt.xlim([-1, 1])
            plt.ylim([-1./3., 1.+1./3.])
        elif i == 15:
            z = sqrt(2. + sqrt(2.)) / sqrt(2.)
            plt.xlim([-z, z])
            plt.ylim([-z, z])
        elif i == 16:
            plt.xlim([-sqrt(2.), sqrt(2.)])
            plt.ylim([-sqrt(2.), sqrt(2.)])
        elif i == 17:
            plt.xlim([-1, 1])
            plt.ylim([-.5, 1.5])
        elif i == 18:
            plt.xlim([-1.5, 1.5])
            plt.ylim([-1.5, 1.5])
        elif i == 19:
            plt.xlim([-1.5, 1.5])
            plt.ylim([-1.5, 1.5])
        elif i == 20:
            plt.xlim([-7, 7])
            plt.ylim([-7, 7])
        else:
            plt.xlim([-4, 4])
            plt.ylim([-4, 4])
        ax.set_aspect('equal', adjustable='datalim')
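rotate is used above but not shown; here is a plausible stand-in, assuming it applies a 2-D rotation by phi radians to an N×2 array (an assumption, not the original helper):

import numpy as np

def rotate(phi, points):
    # Rotate each (x, y) row of `points` by phi radians
    R = np.array([[np.cos(phi), -np.sin(phi)],
                  [np.sin(phi),  np.cos(phi)]])
    return points @ R.T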
Example #9
from data.load import df1pos
# dfs, dfp, dfmlle as isomap, spectral1, pca, mlle

from numpy import random
from numpy.random import randint as rndint

# dfi, dfs, dfp, dfmlle are assumed to be loaded elsewhere in the project
datalist = [dfi, dfs, dfp, dfmlle]

datasets = []
for j in datalist:
    names = j.values[:, 0]               # first column: row names
    k = j.values[:, 1:].astype(float)    # remaining columns: coordinates
    rowz = k.shape[0]
    y = noisy_circles[1][:rowz]          # reuse labels from noisy_circles
    z = (k, y)
    datasets.append(z)

#---------------------------------------------------------

colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)

clustering_names = [
    'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
    'SpectralClustering', 'Ward', 'AgglomerativeClustering', 'DBSCAN', 'Birch'
]

plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02,
                    right=.98,
                    bottom=.001,
Example #10
    scores['value'].append(metrics[i])
    scores['dataset'].append(dataset)

  scores_['sensitivity'].append(sens)
  scores_['specificity'].append(spec)
  scores_['accuracy'].append(acc)
  scores_['dataset'].append(dataset)
  scores_['algorithm'].append('MLP')
  
  
  return scores, scores_

if __name__ == "__main__": 

  datasets = [('../data/features_1.npy', '../data/target_1.npy'),
              ('../data/mean_imputation_features_2.npy', '../data/target_1.npy'),
              ('../data/regression_imputation_features_2.npy', '../data/target_1.npy')]
  resultnames = ['dataset1', 'dataset2', 'dataset3']

  for i, (dataset, resultname) in enumerate(zip(datasets, resultnames)): 

    features, targets = dataset

    X = np.load(features)
    Y = np.load(targets) 

    n_classes = len(np.unique(Y))
  
    print(n_classes)

    ### Stratify DATA ### 
    skf = StratifiedKFold(n_splits=4)
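The snippet is cut off after building the splitter; a typical continuation iterates the stratified folds like this (a sketch of the standard scikit-learn API, not the original code):

for train_idx, test_idx in skf.split(X, Y):
    X_train, X_test = X[train_idx], X[test_idx]
    Y_train, Y_test = Y[train_idx], Y[test_idx]
    ...  # fit and evaluate a model on each fold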