Пример #1
0
def plot_cluster(kmeansdata, centroid_list, label_list, num_cluster):
    """Scatter-plot clustered data after a PCA projection and save it as PNG.

    The PCA is fitted on ``kmeansdata``; both the data and ``centroid_list``
    are projected with ``minfrac`` set to the second variance fraction.
    Only clusters returned by ``random_centroid_selector(num_cluster, 50)``
    are drawn. The figure is written to the first free
    ``H-clustering_2D_4<i>.png`` in the working directory.

    NOTE(review): the palette holds four colours, so a cluster index above 3
    raises IndexError.
    """
    pca_model = mlabPCA(kmeansdata)
    min_fraction = pca_model.fracs[1]
    points_2d = pca_model.project(kmeansdata, minfrac=min_fraction)
    centers_2d = pca_model.project(centroid_list, minfrac=min_fraction)
    # TODO: generalize the palette for more clusters
    palette = [(0, 0, 0), (0.33, 1, 0), (1, 0, 0), (1, 1, 0)]

    plt.figure()
    # Pad the axis limits by 3 units around the projected data
    first_axis = points_2d[:, 0]
    second_axis = points_2d[:, 1]
    plt.xlim([first_axis.min() - 3, first_axis.max() + 3])
    plt.ylim([second_axis.min() - 3, second_axis.max() + 3])

    chosen = random_centroid_selector(num_cluster, 50)

    # Centroids of the chosen clusters: large 'o' markers
    for cluster_id in range(len(centers_2d)):
        if cluster_id not in chosen:
            continue
        plt.scatter(centers_2d[cluster_id, 0],
                    centers_2d[cluster_id, 1],
                    marker='o',
                    c=palette[cluster_id],
                    s=100)

    # Members of the chosen clusters: '+' markers in the cluster colour
    for row, cluster_id in enumerate(label_list):
        if cluster_id not in chosen:
            continue
        plt.scatter(points_2d[row, 0],
                    points_2d[row, 1],
                    marker='+',
                    c=palette[cluster_id])

    # Advance the numeric suffix until the file name is unused
    stem = "H-clustering_2D_4"
    suffix = 0
    while os.path.isfile(stem + str(suffix) + ".png"):
        suffix = suffix + 1
    plt.savefig(stem + str(suffix) + ".png")
    return
Пример #2
0
    def Q1(self):
        """Sample two Gaussian classes and compare mlab and sklearn PCA.

        Reads ``self.m1`` and ``self.m2`` (class means) and ``self.cov``
        (covariance shared by both classes). Draws 1000 points per class and
        plots them, plots the first principal-component scores from
        ``mlabPCA`` in figure 2, overlays the one-component sklearn
        reconstruction on figure 1, prints the mean squared reconstruction
        error followed by its square root, and shows the figures.
        """
        n_per_class = 1000

        # part one: draw the samples; transposed so each class is (dims, n)
        class1 = np.random.multivariate_normal(self.m1, self.cov, n_per_class).T
        class2 = np.random.multivariate_normal(self.m2, self.cov, n_per_class).T
        plt.plot(class1[0, :], class1[1, :], 'x')
        plt.plot(class2[0, :], class2[1, :], 'x')

        # part two: calculate pca on the stacked samples (class1 rows first)
        samples = np.concatenate((class1, class2), axis=1)

        mlab_pca = mlabPCA(samples.T)
        plt.figure(2)
        plt.plot(mlab_pca.Y[0:n_per_class, 0], 'o', markersize=7,
                 color='blue', alpha=0.5, label='class1')
        plt.plot(mlab_pca.Y[n_per_class:2 * n_per_class, 0], '^', markersize=7,
                 color='yellow', alpha=0.5, label='class2')

        # part three: project onto one component and map back to input space
        sklearn_pca = sklearnPCA(n_components=1)
        sklearn_transf = sklearn_pca.fit_transform(samples.T)
        p = sklearn_pca.inverse_transform(sklearn_transf)
        plt.figure(1)
        plt.plot(p[0:n_per_class, 0], p[0:n_per_class, 1], 'x')
        plt.plot(p[n_per_class:2 * n_per_class, 0],
                 p[n_per_class:2 * n_per_class, 1], 'x')

        # mean squared reconstruction error and its root
        error = ((p - samples.T) ** 2).mean()
        print(error)
        # np.sqrt replaces np.math.sqrt: the np.math alias is gone in newer NumPy
        print(float(np.sqrt(error)))

        plt.show()
Пример #3
0
def split_pca(combined_data, label_1, label_2):
    """Run PCA on ``combined_data`` and plot the first two components.

    Rows 0-99 of the PCA scores are drawn as blue circles labelled
    ``label_1`` and rows 100-199 as red triangles labelled ``label_2``.
    Prints the PCA weight matrix, shows the figure and returns the full
    score matrix ``Y``.
    """
    pca_result = mlabPCA(combined_data)

    print(
        'PC axes in terms of the measurement axes scaled by the standard deviations:\n',
        pca_result.Wt)

    # (row slice, marker, colour, legend label) for each plotted group
    groups = (
        (slice(0, 100), 'o', 'blue', label_1),
        (slice(100, 200), '^', 'red', label_2),
    )
    for rows, marker, colour, legend_label in groups:
        plt.plot(pca_result.Y[rows, 0],
                 pca_result.Y[rows, 1],
                 marker,
                 markersize=7,
                 color=colour,
                 alpha=0.5,
                 label=legend_label)

    plt.xlabel('x_values')
    plt.ylabel('y_values')
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.legend()

    plt.show()

    return pca_result.Y
Пример #4
0
def do_pca(data, class_label):
    """Run PCA on ``data`` and plot the scores against the original data.

    The first two PCA score columns are drawn as blue circles labelled
    ``class_label``; the first two columns of ``data`` are drawn as red
    triangles labelled 'original'. Prints the PCA weight matrix, shows the
    figure and returns the score matrix ``Y``.

    ``data`` must support 2-D slicing (``data[:, 0]``).
    """
    # Use the argument; the previous version read the global wall13_data
    # and silently ignored whatever the caller passed in.
    mlab_pca = mlabPCA(data)

    print(
        'PC axes in terms of the measurement axes scaled by the standard deviations:\n',
        mlab_pca.Wt)

    # pca
    plt.plot(mlab_pca.Y[:, 0],
             mlab_pca.Y[:, 1],
             'o',
             markersize=7,
             color='blue',
             alpha=0.5,
             label=class_label)
    # original
    plt.plot(data[:, 0],
             data[:, 1],
             '^',
             markersize=7,
             color='red',
             alpha=0.5,
             label='original')

    plt.xlabel('x_values')
    plt.ylabel('y_values')
    plt.xlim([-4, 40])
    plt.ylim([-4, 10])
    plt.legend()
    plt.title('Transformed samples versus original data')

    plt.show()

    return mlab_pca.Y
Пример #5
0
    def PCA(self):
        """Build the feature matrix from the ``max_*`` lists and fit a PCA.

        Does nothing when ``self.max_prods`` is empty. Otherwise stores the
        array versions in ``np_prods`` / ``np_dests`` / ``np_BUs``, their
        column-wise stack in ``data_mat`` and the fitted ``mlabPCA`` object
        in ``pca_mat``.
        """
        if len(self.max_prods) == 0:
            return

        self.np_prods = np.asarray(self.max_prods)
        self.np_dests = np.asarray(self.max_dests)
        self.np_BUs = np.asarray(self.max_BUs)

        # One column per source list
        feature_columns = (self.np_prods, self.np_dests, self.np_BUs)
        self.data_mat = np.column_stack(feature_columns)
        self.pca_mat = mlabPCA(self.data_mat)   # PCA matrix
Пример #6
0
def plot_pca_top(data,scores,savename='PCA'):
    """Plot the first three PCA components of ``data`` in 3-D, coloured by score.

    The colour scale spans min(scores)..max(scores). The figure is saved as
    ``<savename>.png`` at 300 dpi and then shown.

    NOTE(review): ``cm`` is taken from the enclosing module and is passed as
    the colormap; confirm it is a colormap object and not the ``matplotlib.cm``
    module.
    """
    projected = mlabPCA(data)
    figure = plt.figure()
    axes3d = figure.add_subplot(111, projection='3d')
    score_norm = plt.matplotlib.colors.Normalize(vmin=np.min(scores),
                                                 vmax=np.max(scores))
    points = axes3d.scatter(projected.Y[:, 0],
                            projected.Y[:, 1],
                            projected.Y[:, 2],
                            c=scores,
                            cmap=cm,
                            norm=score_norm)
    plt.colorbar(points)
    plt.savefig('%s.png' % savename, dpi=300)
    plt.show()
Пример #7
0
    def test_pca_iris(self):
        """Check NudgePCA's explained-variance fractions against mlabPCA on iris.

        Also exercises ``plot_ranks``, ``pcw`` and ``project`` and writes the
        projected scatter plot to ``project_test.png``.
        """
        # load the iris dataset
        iris = np.loadtxt(dataDir + "iris.csv",
                          skiprows=1,
                          usecols=(1, 2, 3, 4, 6),
                          delimiter=',')
        # class response (last loaded column); not used by the checks below
        y = iris[:, -1].astype(int)
        # explanatory vars
        x = iris[:, 0:-1]

        # apply pca
        npca = NudgePCA(x)
        npca.plot_ranks('test_pca.png')

        print("Iris dataset eigen vales:")
        print(npca.eig_vals)
        print("Iris dataset eigen vectors:")
        print(npca.eig_vecs)
        print("Nudge fractioinal varience:")
        print(npca.frac_explained_var)

        # Check against mlabPCA for "validation"
        mpca = mlabPCA(x)
        print("Matplotlib fractioinal varience:")
        print(mpca.fracs)

        # check against expected result: one fraction per explanatory column
        for k in range(4):
            self.assertAlmostEqual(npca.frac_explained_var[k],
                                   mpca.fracs[k],
                                   delta=1e-4)

        # check projecting matrix
        w = npca.pcw()
        print("Projection matrix:")
        print(w)

        # project original data onto pca principal axes
        x_transformed = npca.project(retain_frac_var=0.95)

        # plot transformed data
        plt.figure()
        plt.scatter(x_transformed[:, 0], x_transformed[:, 1])
        # label the axes before saving so the labels appear in the image
        plt.xlabel('pc 1')
        plt.ylabel('pc 2')
        plt.savefig('project_test.png')
        plt.close()
Пример #8
0
def plot_cluster(kmeansdata, centroid_list, label_list, num_cluster):
    """
    Project the clustered data with PCA and plot a random selection of
    clusters.

    kmeansdata    -> data matrix the PCA is fitted on
    centroid_list -> cluster centres, projected with the same PCA
    label_list    -> cluster index of each row of kmeansdata
    num_cluster   -> number of clusters; passed to get_colors and
                     random_centroid_selector
    resultat.png  -> file the figure is saved to (overwritten on each call)
    """
    mlab_pca = mlabPCA(kmeansdata)
    cutoff = mlab_pca.fracs[1]
    users_2d = mlab_pca.project(kmeansdata, minfrac=cutoff)
    centroids_2d = mlab_pca.project(centroid_list, minfrac=cutoff)

    colors = get_colors(num_cluster)

    plt.figure()
    plt.xlim([users_2d[:, 0].min() - 3, users_2d[:, 0].max() + 3])
    plt.ylim([users_2d[:, 1].min() - 3, users_2d[:, 1].max() + 3])

    # Plotting 50 clusters only for now
    random_list = random_centroid_selector(num_cluster, 50)

    # Plotting only the centroids which were randomly_selected
    # Centroids are represented as a large 'o' marker
    for i in range(len(centroids_2d)):
        if i in random_list:
            plt.scatter(centroids_2d[i, 0],
                        centroids_2d[i, 1],
                        marker='o',
                        c=colors[i],
                        s=100)

    # Writing the cluster index next to every centroid that has at least one
    # point assigned to it. The labels are collected into a set once so the
    # membership test is not a linear scan on every iteration.
    labels_present = set(label_list)
    for i in range(len(label_list)):
        if i in labels_present:
            plt.text(centroids_2d[i, 0],
                     centroids_2d[i, 1],
                     str(i),
                     color="red",
                     fontsize=20)

    # Plotting only the points whose centers were plotted
    # Points are represented as a small '+' marker
    for i, position in enumerate(label_list):
        if position in random_list:
            plt.scatter(users_2d[i, 0],
                        users_2d[i, 1],
                        marker='+',
                        c=colors[position])

    filename = "resultat"

    plt.savefig(filename + ".png")

    return
Пример #9
0
    def run(self):
        """Show a 3-D scatter of the PCA scores, one marker/colour per cluster.

        Reads ``self.feature_matrix`` (input to the PCA),
        ``self.cluster_labels`` (mapping of user index to cluster index) and
        ``self.cluster_means``. Each user is drawn at its first three PCA
        scores; the projected cluster means are drawn as red 'x' markers.
        """
        mlab_pca = mlabPCA(self.feature_matrix)

        project_matrix = mlab_pca.Wt
        # NOTE(review): mlabPCA.project centers the input and multiplies by
        # Wt transposed; this product does neither, so the means may not sit
        # in the same space as mlab_pca.Y. Confirm how cluster_means is
        # expressed before changing it.
        project_means = np.matmul(self.cluster_means, project_matrix)

        # collect user indices for each cluster
        cluster_users = {}
        for userIdx, clusterIdx in self.cluster_labels.items():
            cluster_users.setdefault(clusterIdx, []).append(userIdx)

        colors = ['b', 'c', 'g', 'k', 'm', 'r', 'y']
        # The comma after 's' was missing, which fused 's' and 'p' into the
        # invalid marker 'sp'.
        dots = [
            '.', ',', 'o', 'v', '^', '<', '>', '1', '2', '3', '4', 's',
            'p', '*', 'h', 'H', 'd', '|', '_', '+', 'x'
        ]

        # Cycle through every colour for one marker before moving on to the
        # next marker.
        dot_count = 0
        color_count = 0
        cluster_plot_conf = {}
        for clusterIdx in set(self.cluster_labels.values()):
            cluster_plot_conf[clusterIdx] = [
                dots[dot_count], colors[color_count]
            ]
            color_count += 1
            if color_count == len(colors):
                dot_count += 1
                color_count = 0

        # draw plot
        ax = plt.subplot(111, projection='3d')

        for clusterIdx, userIdices in cluster_users.items():
            dot, color = cluster_plot_conf[clusterIdx]
            for userIdx in userIdices:
                # marker must be a keyword: the 4th positional argument of
                # Axes3D.scatter is zdir, not the marker.
                ax.scatter(mlab_pca.Y[userIdx, 0],
                           mlab_pca.Y[userIdx, 1],
                           mlab_pca.Y[userIdx, 2],
                           marker=dot,
                           color=color)

        ax.scatter(project_means[:, 0],
                   project_means[:, 1],
                   project_means[:, 2],
                   marker='x',
                   color='r')

        ax.set_zlabel('Z')  # axis labels
        ax.set_ylabel('Y')
        ax.set_xlabel('X')

        plt.show()
Пример #10
0
def plot_cluster(kmeansdata, centroid_list, label_list, num_cluster, title,
                 prefix):
    """
    Project the clustered data with PCA and plot a random selection of
    clusters.

    kmeansdata    -> data matrix the PCA is fitted on
    centroid_list -> cluster centres, projected with the same PCA
    label_list    -> cluster index of each row of kmeansdata
    num_cluster   -> number of clusters; passed to get_colors and
                     random_centroid_selector
    title         -> title drawn on the figure
    prefix        -> file name prefix; the figure is saved as
                     images/<prefix><i>.png using the first index i
                     for which no such file exists yet
    """
    mlab_pca = mlabPCA(kmeansdata)
    cutoff = mlab_pca.fracs[1]
    users_2d = mlab_pca.project(kmeansdata, minfrac=cutoff)
    centroids_2d = mlab_pca.project(centroid_list, minfrac=cutoff)

    colors = get_colors(num_cluster)
    # Create the figure first: setting the title before plt.figure() puts it
    # on whatever figure was current, not on the one saved below.
    plt.figure()
    plt.title(title)
    plt.xlim([users_2d[:, 0].min() - 3, users_2d[:, 0].max() + 3])
    plt.ylim([users_2d[:, 1].min() - 3, users_2d[:, 1].max() + 3])

    # Plotting 50 clusters only for now
    random_list = random_centroid_selector(num_cluster, 50)

    # Plotting only the centroids which were randomly_selected
    # Centroids are represented as a large 'o' marker
    for i in range(len(centroids_2d)):
        if i in random_list:
            plt.scatter(centroids_2d[i, 0],
                        centroids_2d[i, 1],
                        marker='o',
                        c=colors[i],
                        s=100)

    # Plotting only the points whose centers were plotted
    # Points are represented as a small '+' marker
    for i, position in enumerate(label_list):
        if position in random_list:
            plt.scatter(users_2d[i, 0],
                        users_2d[i, 1],
                        marker='+',
                        c=colors[position])

    filename = 'images/' + prefix
    i = 0
    # Advance the index until an unused file name is found, then write it
    while os.path.isfile(filename + str(i) + ".png"):
        i = i + 1
    plt.savefig(filename + str(i) + ".png")
    return
Пример #11
0
def PCA_module(training_data, testing_data):
    """Fit a PCA on the training data and project the testing data onto it.

    The testing data is standardized with the training mean and standard
    deviation (per column) and multiplied by the transposed PCA weight
    matrix. Prints the elapsed time and returns the projected testing data.

    NOTE(review): a training column with zero standard deviation yields a
    division by zero in the standardization step.
    """
    tstart = time.time()
    mlab_pca = mlabPCA(training_data)
    # Rows of Wt are the principal axes
    loadings = mlab_pca.Wt
    training_mean = np.mean(training_data, axis=0)
    training_std = np.std(training_data, axis=0)

    normalized_testing = (testing_data - training_mean) / training_std
    print('PCA TIME: %.2f secs' % (time.time() - tstart))
    # scores = X_standardized . Wt^T, the same product mlabPCA.project uses;
    # multiplying by Wt itself mixes the axes incorrectly.
    return np.dot(normalized_testing, loadings.T)
Пример #12
0
def get_spike_feature_pca(channel):
    """Fit a PCA on three features of every spike in ``channel.all_spikes``.

    Each row of the input matrix holds one spike's ``spike_max``,
    ``spike_positive_slope`` and ``spike_half_peak_width_negative``, in that
    order. Returns the fitted ``mlabPCA`` object.
    """
    feature_rows = [
        [
            spike.spike_max,
            spike.spike_positive_slope,
            spike.spike_half_peak_width_negative,
        ]
        for spike in channel.all_spikes
    ]
    return mlabPCA(np.array(feature_rows))
Пример #13
0
def pca():
    """Run PCA on the posted price rows and return the scores as JSON.

    Expects the request body to be a JSON list of objects with the keys
    "Open", "Close", "Change" and "Volume". The four values of each object
    form one row of the PCA input. Responds with ``{"result": [...]}``
    holding the PCA score matrix as nested lists.
    """
    # Named `payload` so the local does not shadow the `json` module name
    payload = request.get_json()
    rows = [[row["Open"], row["Close"], row["Change"], row["Volume"]]
            for row in payload]
    # Builtin float replaces np.float, an alias removed from NumPy
    a = np.array(rows).astype(float)

    mlab_pca = mlabPCA(a)
    b = mlab_pca.Y.tolist()
    return jsonify(result=b)
Пример #14
0
def pca():
    """Run PCA on the posted price rows and return the scores as JSON.

    Expects the request body to be a JSON list of objects with the keys
    "Open", "Close", "Change" and "Volume". The four values of each object
    form one row of the PCA input. Responds with ``{"result": [...]}``
    holding the PCA score matrix as nested lists.
    """
    # Named `payload` so the local does not shadow the `json` module name
    payload = request.get_json()
    rows = [[row["Open"], row["Close"], row["Change"], row["Volume"]]
            for row in payload]
    # Builtin float replaces np.float, an alias removed from NumPy
    a = np.array(rows).astype(float)

    mlab_pca = mlabPCA(a)
    b = mlab_pca.Y.tolist()
    return jsonify(result=b)
Пример #15
0
    def test_pca_iris(self):
        """Check NudgePCA's explained-variance fractions against mlabPCA on iris.

        Also exercises ``plot_ranks``, ``pcw`` and ``project`` and writes the
        projected scatter plot to ``project_test.png``.
        """
        # load the iris dataset
        iris = np.loadtxt(dataDir + "iris.csv", skiprows=1, usecols=(1,2,3,4,6), delimiter=',')
        # class response (last loaded column); not used by the checks below
        y = iris[:, -1].astype(int)
        # explanatory vars
        x = iris[:, 0:-1]

        # apply pca
        npca = NudgePCA(x)
        npca.plot_ranks('test_pca.png')

        print("Iris dataset eigen vales:")
        print(npca.eig_vals)
        print("Iris dataset eigen vectors:")
        print(npca.eig_vecs)
        print("Nudge fractioinal varience:")
        print(npca.frac_explained_var)

        # Check against mlabPCA for "validation"
        mpca = mlabPCA(x)
        print("Matplotlib fractioinal varience:")
        print(mpca.fracs)

        # check against expected result: one fraction per explanatory column
        for k in range(4):
            self.assertAlmostEqual(npca.frac_explained_var[k], mpca.fracs[k], delta=1e-4)

        # check projecting matrix
        w = npca.pcw()
        print("Projection matrix:")
        print(w)

        # project original data onto pca principal axes
        x_transformed = npca.project(retain_frac_var=0.95)

        # plot transformed data
        plt.figure()
        plt.scatter(x_transformed[:, 0], x_transformed[:, 1])
        # label the axes before saving so the labels appear in the image
        plt.xlabel('pc 1')
        plt.ylabel('pc 2')
        plt.savefig('project_test.png')
        plt.close()
Пример #16
0
def split_pca(combined_data, label_1, label_2):
    """Run PCA on ``combined_data`` and plot the first two components.

    Rows 0-99 of the PCA scores are drawn as blue circles labelled
    ``label_1`` and rows 100-199 as red triangles labelled ``label_2``.
    Prints the PCA weight matrix, shows the figure and returns the full
    score matrix ``Y``.
    """
    pca_result = mlabPCA(combined_data)

    print("PC axes in terms of the measurement axes scaled by the standard deviations:\n", pca_result.Wt)

    # (row slice, marker, colour, legend label) for each plotted group
    groups = (
        (slice(0, 100), "o", "blue", label_1),
        (slice(100, 200), "^", "red", label_2),
    )
    for rows, marker, colour, legend_label in groups:
        plt.plot(pca_result.Y[rows, 0], pca_result.Y[rows, 1], marker,
                 markersize=7, color=colour, alpha=0.5, label=legend_label)

    plt.xlabel("x_values")
    plt.ylabel("y_values")
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.legend()

    plt.show()

    return pca_result.Y
Пример #17
0
def do_pca(data, class_label):
    """Run PCA on ``data`` and plot the scores against the original data.

    The first two PCA score columns are drawn as blue circles labelled
    ``class_label``; the first two columns of ``data`` are drawn as red
    triangles labelled "original". Prints the PCA weight matrix, shows the
    figure and returns the score matrix ``Y``.

    ``data`` must support 2-D slicing (``data[:, 0]``).
    """
    # Use the argument; the previous version read the global wall13_data
    # and silently ignored whatever the caller passed in.
    mlab_pca = mlabPCA(data)

    print("PC axes in terms of the measurement axes scaled by the standard deviations:\n", mlab_pca.Wt)

    # pca
    plt.plot(mlab_pca.Y[:, 0], mlab_pca.Y[:, 1], "o", markersize=7, color="blue", alpha=0.5, label=class_label)
    # original
    plt.plot(data[:, 0], data[:, 1], "^", markersize=7, color="red", alpha=0.5, label="original")

    plt.xlabel("x_values")
    plt.ylabel("y_values")
    plt.xlim([-4, 40])
    plt.ylim([-4, 10])
    plt.legend()
    plt.title("Transformed samples versus original data")

    plt.show()

    return mlab_pca.Y
Пример #18
0
import numpy as np
#pylab inline
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from matplotlib.mlab import PCA as mlabPCA

# Small demo matrix: 3 rows of 7 values each
arr = np.array([
    [0, 0, 1, 2, 2, 0, 0],
    [0, 0, 4, 5, 6, 0, 0],
    [1, 2, 0, 0, 0, 0, 7],
])
print(arr.shape)
# The PCA is fitted on the transpose, i.e. on a 7 x 3 matrix
mlab_pca = mlabPCA(arr.T)
print(mlab_pca.Y)
Пример #19
0
# Render one PCA scatter frame per time step for the tickers listed in
# names.txt, then assemble the frames into an animated GIF.
with open('names.txt', 'r') as f:
    names = [line.rstrip() for line in f]

# Parse every ticker file once, up front. The previous version re-read all
# files on each of the 400 frames. With unpack=True each entry holds one
# row per selected column.
series = [
    np.genfromtxt('db/' + name + '-TS-full.dat',
                  comments="#",
                  unpack=True,
                  usecols=(7, 8, 11, 12, 13, 14))
    for name in names
]

for i in range(-400, 0, 1):
    print(i)
    # One row per ticker, holding its six selected values at time index i
    data = np.array([columns[:, i] for columns in series])
    pca1 = mlabPCA(data)

    plt.plot(pca1.Y[:, 0],
             pca1.Y[:, 1],
             'o',
             markersize=7,
             color='blue',
             alpha=0.5)
    plt.xlim(-30, 30)
    plt.ylim(-15, 15)
    # NOTE(review): 100 + i is negative for i < -100, giving names such as
    # 'stockEvol-300.png'; verify the glob below orders the frames as intended.
    plt.savefig('stockEvol' + str(100 + i).zfill(4) + '.png')
    plt.close()

# Requires ImageMagick's `convert` on PATH; the frame files are removed afterwards
os.system("convert -delay 15 stockEvol*.png stockEvolMovie.gif")
os.system("rm stockEvol*.png")
'''
Пример #20
0
    for i in range(len(Y)):
        #s=fin.readline()
        print(Y[i][0],
              ',',
              Y[i][1],
              ',',
              Y[i][2],
              ',',
              Y[i][3],
              ',',
              Y[i][4],
              ',',
              Y[i][5],
              ',',
              file=f)
    f.close()


# Column names assigned to the headerless rankings table
att = [
    'teaching', 'international', 'research', 'citation', 'income',
    'cost_of_living'
]
# Read the CSV without a header row, then attach the column names
data = pd.read_csv('rankings.csv', header=None)
data.columns = att
# All columns of the table are passed to the PCA (none is dropped here)
d = data.values

d_pca = mlabPCA(d)
# NOTE(review): the output name equals the input file name; confirm that
# generateFile is meant to write to 'rankings.csv'.
generateFile(att, d_pca.Y, 'rankings.csv')
Пример #21
0
def get_PCA(sample_array):
    """Fit ``mlabPCA`` on ``sample_array`` and return ``(fracs, Wt)``.

    ``fracs`` and ``Wt`` are the attributes of the fitted PCA object with
    those names, returned as a 2-tuple in that order.
    """
    fitted = mlabPCA(sample_array)
    variance_fractions, weights = fitted.fracs, fitted.Wt
    return (variance_fractions, weights)
Пример #22
0
    def run(self, csvResponse, csvRealArt, datazipfilepath):
        print("USFS function called\n")
        #TEST
        #c = csv.writer(open('C:/Users/Lisa/PycharmProjects/HonorsThesis/MYFILE.csv', "wb"))
        #c.writerow(["Name","Address","Telephone","Fax","E-mail","Others"])

        # clear variables and associated memories
        #initialized variables
        #PCNoTOTAL = 0

        # create data files in data folder (string for name)
        dataFiles = "\data"
        outputFolder = "\output"

        #create output files in output folder
        response = csv.reader(open(csvResponse))
        real_art = csv.reader(open(csvRealArt))
        #response = csvResponse
        #real_art = csvRealArt

        response_data = list(response)
        real_art_data = list(real_art)

        for i in range(len(response_data)):
            response_data[i] = float(response_data[i][0])

        for i in range(len(real_art_data)):
            real_art_data[i] = float(real_art_data[i][0])

        #response_data = [float(i) for i in response_data]
        #real_art_data = [float(i) for i in real_art_data]

        response_rowNum = len(response_data)
        real_art_rowNum = len(real_art_data)

        lowClass = -1
        highClass = -1
        #FIX: read csv files "response.csv" and "real_art.csv"

        for row in range(1, response_rowNum):
            if row == 1:
                lowClass = response_data[row]
            elif response_data[row] != lowClass:
                highClass = response_data[row]
                if highClass < lowClass:
                    lowClass = highClass
                break

        #get current path
        currentPath = os.getcwd()
        datazip = zipfile.ZipFile(datazipfilepath, 'r')
        datazip.extractall(currentPath + dataFiles)
        os.chdir(currentPath + dataFiles + "\datazip")
        files = os.listdir()
        for i in range(0, len(files)):
            print(files[i])
            if files[i].endswith('.csv') == False:
                #print("HIT")
                files.pop(i)
        print(files)
        sortedFiles = sorted(files)
        fileNum = len(sortedFiles)

        #create empty list rawFeatureList
        rawFeatureList = numpy.empty(fileNum, dtype=numpy.ndarray)
        #rawFeatureList = numpy.zeros([fileNum, 10000])

        for i in range(0, fileNum):
            data = numpy.genfromtxt(files[i], dtype=float, delimiter=",")
            rawFeatureList[i] = data

        realStatus = 1
        cvStatus = 1

        classifierType = ["lda", "qda", "svm"]

        classifNo = len(classifierType)

        if cvStatus == 0:
            foldNo = 10
            iterationLength = 10
        else:
            foldNo = response_rowNum
            iterationLength = 1

        #QUESTION: why do you set each index to index?
        instanceIndex = numpy.zeros((response_rowNum, 1))
        for i in range(0, instanceIndex.size):
            instanceIndex[i] = i

        if realStatus == 1:
            realInstanceIndex = numpy.zeros((real_art_rowNum, 1))
            for i in range(realInstanceIndex.size):
                realInstanceIndex[i] = i

        # FIX: change accuracyOverall to 2d array
        accuracyOverall = numpy.zeros((classifNo, 1))
        accuracyFirstClass = numpy.zeros((classifNo, 1))
        accuracySecondClass = numpy.zeros((classifNo, 1))
        #bestPCS = numpy.zeros((classifNo, 1))
        bestPCS = [0] * classifNo
        accIncr = [0] * classifNo
        subjMisclassified = numpy.array([classifNo, 1, iterationLength],
                                        dtype=object)

        if cvStatus == 0:
            idx = numpy.zeros(classifNo, 1)

        # QUESTION: should these arrays be array of arrays?
        for i in range(classifNo):
            accuracyOverall[i] = numpy.zeros((1, iterationLength))
            accuracyFirstClass[i] = numpy.zeros((1, iterationLength))
            accuracySecondClass[i] = numpy.zeros((1, iterationLength))
            #bestPCS[i] = numpy.zeros((1, iterationLength))
            bestPCS[i] = [0] * iterationLength
            accIncr[i] = [0] * iterationLength

            if realStatus == 0:
                subjMisclassified[i] = numpy.concatenate(
                    (instanceIndex,
                     numpy.zeros((response_rowNum, iterationLength))),
                    axis=1)

            else:
                subjMisclassified[i] = numpy.concatenate(
                    (realInstanceIndex,
                     numpy.zeros((real_art_rowNum, iterationLength))),
                    axis=1)

            if cvStatus == 0:
                idx[i] = numpy.zeros((1, iterationLength))
                for j in range(iterationLength):
                    idx[i][j] = numpy.zeros((foldNo, 1))

        bestPCIndex = numpy.array([])
        accIncrTracker = numpy.array([])

        for z1 in range(iterationLength):
            print('Iteration ' + str(z1))
            #begin PCA
            featureList = numpy.empty(fileNum, dtype=numpy.ndarray)

            # import from scipy stats
            # FIX
            for i in range(fileNum):
                featureList[i] = stats.zscore(rawFeatureList[i])

            #scoreList = numpy.empty(fileNum, dtype=numpy.ndarray)
            scoreList = [0] * fileNum
            PCNoList = numpy.empty(fileNum, dtype=int)
            coeffList = [0] * fileNum

            os.chdir(currentPath + dataFiles + outputFolder)

            #import:  from matplotlib.mlab import PCA

            for featNum in range(0, fileNum):
                #PCAobject = PCA(featureList[featNum])
                #PCAobject = PCA(n_components=len(featureList[featNum][0, :]), copy=True, whiten=False)
                #X = numpy.matrix('1 30 2 4; 2 50 4 10; 8 20 2 3; 7 70 7 5; 2 10 3 9')
                #PCAobject.fit_transform(featureList[featNum])
                #print("X")
                #print(X)
                PCAobject = mlabPCA(featureList[featNum], standardize=False)
                i = 0
                j = 0

                explained = 100 * PCAobject.fracs  # this is correct
                coeff = PCAobject.Wt.T  #this is correct, except last column has +/- signs switched
                score = PCAobject.Y  #same issue as coeff (but i dont think its significant?)
                print("Coeff is ")
                print(coeff)
                print("Score is:")
                print(score)
                print("Explained is:")
                print(explained)

                #print("featureList[featNum] is ")
                #print(featureList[featNum])

                print("PCA percentages: ")
                k = 0
                while i < len(explained):
                    j = j + explained[i]
                    k = i
                    if j > 85:
                        break
                    i += 1

                scoreList[featNum] = score[:, 0:k + 1]
                coeffList[featNum] = coeff[:, 0:k + 1]
                PCNoList[featNum] = k + 1

                string1 = 'CoeffMatrix' + files[featNum]
                string2 = 'ScoreMatrix' + files[featNum]

                file1 = open(string1, 'wb')
                wr1 = csv.writer(file1, quoting=csv.QUOTE_ALL)
                #wr1.writerows(coeffList[featNum])
                numpy.savetxt(string1, coeffList[featNum], delimiter=",")

                file2 = open(string2, 'wb')
                wr2 = csv.writer(file2, quoting=csv.QUOTE_ALL)
                #wr2.writerows(scoreList[featNum])
                numpy.savetxt(string2, scoreList[featNum], delimiter=",")

            PCNumTOTAL = sum(PCNoList)
            PCNumCum = numpy.cumsum(PCNoList)

            file_PCNumCum = open('PCNumCum.csv', 'wb')
            wr3 = csv.writer(file_PCNumCum, quoting=csv.QUOTE_ALL)
            #wr3.writerows(PCNumCum)
            numpy.savetxt('PCNumCum.csv', PCNumCum, delimiter=",")

            scoreTotal = numpy.zeros((response_rowNum, PCNumTOTAL))
            x = 0

            for i in range(0, fileNum):
                #get shape of scoreList[i]
                numRowsScoreList = len(scoreList[i])
                numColScoreList = len(scoreList[i][0])
                print(numColScoreList)
                scoreTotal[:, x:x + numColScoreList] = scoreList[i]
                x += numColScoreList

            file_PCScoreTotal = open('PCScoreTotal.csv', 'wb')
            wr4 = csv.writer(file_PCScoreTotal, quoting=csv.QUOTE_ALL)
            numpy.savetxt('PCScoreTotal.csv', scoreTotal, delimiter=",")

            #end of PCA
            cvPartition = -1
            if cvStatus == 0:
                #to FIX
                #cvPartition = cvpartition(response, 'KFold', foldNo)
                cvPartition = StratifiedKFold(response,
                                              n_folds=foldNo,
                                              shuffle=False,
                                              random_state=None)
                #cvPartition = StratifiedKFold(response_data, n_folds=foldNo, shuffle=False, random_state=None)
                #numpy.random.shuffle(cvPartition)

            else:
                # to FIX
                #cvPartition = cvpartition(foldNo, 'LeaveOut')

                cvPartition = LeaveOneOut(len(response_data))
                #cvPartition = LeaveOneOut(foldNo)

                #numpy.random.shuffle(cvPartition)

            print("PCNumTOTAL:")
            print(PCNumTOTAL)
            pcIndexNumbers = numpy.zeros((PCNumTOTAL, 1))
            for i in range(0, PCNumTOTAL):
                pcIndexNumbers[i] = i

            for z2 in range(0, classifNo):
                print('Classifier ' + classifierType[z2])
                classifier = classifierType[z2]

                maxAcc = 0
                #scoreBestPCs = numpy.zeros(scoreTotal.shape)
                scoreBestPCs = []
                bestPCIndex = numpy.array([])

                PCNoTOTAL = PCNumTOTAL
                scoreTOTAL = scoreTotal
                pcIndexNo = pcIndexNumbers

                maxAccTracker = numpy.array([0, 100])
                maxAccIndex = 0
                maxAccuracy = 0
                lowClassAccuracy = 0
                highClassAccuracy = 0

                finalInstMisclass = []

                lowClassAccuracies = []
                highClassAccuracies = []
                instMisclass = []

                lt = 1

                while (maxAccTracker[1] - maxAccTracker[0]) > 1:
                    print("in the while loop")
                    if lt > 1:

                        if scoreBestPCs != []:
                            scoreBestPCs = numpy.column_stack(
                                (scoreBestPCs, scoreTOTAL[:, maxAccIndex]))
                        else:
                            scoreBestPCs = scoreTOTAL[:, maxAccIndex]
                        if bestPCIndex.size != 0:
                            bestPCIndex = numpy.append(
                                bestPCIndex, [pcIndexNo[maxAccIndex]])
                            #bestPCIndex = numpy.append((bestPCIndex, pcIndexNo[maxAccIndex]))
                        else:
                            bestPCIndex = pcIndexNo[maxAccIndex]

                        #should be 0?
                        if maxAccIndex == 1:
                            scoreTOTAL = scoreTOTAL[:, 1:PCNoTOTAL]
                            pcIndexNo = pcIndexNo[1:PCNoTOTAL]

                        elif maxAccIndex == PCNoTOTAL - 1:
                            scoreTOTAL = scoreTOTAL[:, 0:PCNoTOTAL - 1]
                            pcIndexNo = pcIndexNo[0:PCNoTOTAL - 1]
                        else:
                            scoreTOTAL = numpy.column_stack(
                                (scoreTOTAL[:, 0:maxAccIndex],
                                 scoreTOTAL[:, maxAccIndex + 1:PCNoTOTAL]))
                            pcIndexNo = numpy.row_stack(
                                (pcIndexNo[0:maxAccIndex],
                                 pcIndexNo[maxAccIndex + 1:PCNoTOTAL]))

                        lowClassAccuracy = lowClassAccuracies[0][maxAccIndex]
                        highClassAccuracy = highClassAccuracies[0][maxAccIndex]

                        finalInstMisclass = instMisclass[maxAccIndex]

                        print("finalInstMisclass:")
                        print(finalInstMisclass)

                        #numpy function for row concatenation
                        if accIncrTracker.size != 0:
                            accIncrTracker = numpy.append(
                                accIncrTracker,
                                [(maxAccTracker[1] - maxAccTracker[0])])
                        else:
                            accIncrTracker = maxAccTracker[1] - maxAccTracker[0]

                        maxAccuracy = maxAcc

                        PCNoTOTAL = PCNoTOTAL - 1
                        #end not checked

                    accuracies = numpy.zeros((1, PCNoTOTAL))
                    lowClassAccuracies = numpy.zeros((1, PCNoTOTAL))
                    highClassAccuracies = numpy.zeros((1, PCNoTOTAL))
                    #instMisclass = numpy.zeros((1, PCNoTOTAL))
                    instMisclass = [0] * PCNoTOTAL

                    for i in range(0, PCNoTOTAL):
                        #numpy function for column concatenation
                        #print(scoreBestPCs.shape)
                        #print(scoreTotal.shape)
                        scoreCandidatePCs = 0
                        if scoreBestPCs != []:
                            scoreCandidatePCs = numpy.column_stack(
                                (scoreBestPCs, scoreTOTAL[:, i]))
                        else:
                            scoreCandidatePCs = numpy.reshape(
                                scoreTotal[:, i], (len(scoreTotal), 1))

                        preAccMatrix = numpy.zeros((len(scoreCandidatePCs), 3))
                        preInstOrder = numpy.zeros((len(scoreCandidatePCs), 1))

                        #x = 0     put in classifierTrainTest
                        #FIX: lines 280-285

                        #for j in range(0, foldNo):        put loop in classifierTrainTest
                        if cvStatus == 0:
                            USFS.classifierTrainTest(
                                scoreCandidatePCs, response_data,
                                real_art_data, cvPartition, classifier,
                                instanceIndex, preAccMatrix, preInstOrder)
                            real_artTEST = dict.get('real_artTEST')
                            instIndexTEST = dict.get('instIndexTEST')
                            trueClassLabel = dict.get('trueClassLabel')
                            predictedClassLabel = dict.get(
                                'predictedClassLabel')
                            #return all of idx[j] to idx[z2][z1]
                            idx[z2][z1][j] = dict.get('idx')

                        else:
                            dict = USFS.classifierTrainTest(
                                scoreCandidatePCs, response_data,
                                real_art_data, cvPartition, classifier,
                                instanceIndex, preAccMatrix, preInstOrder)
                            real_artTEST = dict.get('real_artTEST')
                            instIndexTEST = dict.get('instIndexTEST')
                            trueClassLabel = dict.get('trueClassLabel')
                            predictedClassLabel = dict.get(
                                'predictedClassLabel')

                            subAccMatrix = dict.get('subAccMatrix')
                            preAccMatrix = dict.get('preAccMatrix')
                            preInstOrder = dict.get('preInstOrder')

                            #Added these lines to classifierTrainTest
                            #subAccMatrix = numpy.column_stack(trueClassLabel, predictedClassLabel, real_artTEST)
                            #preAccMatrix[x:x + len(subAccMatrix[:, 0]) - 1, :] = subAccMatrix
                            #preInstOrder[x:x + len(instIndexTEST[:, 0]) - 1] = instIndexTEST

                            #x = x + (subAccMatrix[:, 0].size)

                        if realStatus == 1:
                            accMatrix = numpy.zeros((sum(preAccMatrix[:,
                                                                      2]), 2))
                            instOrder = numpy.zeros((sum(preAccMatrix[:,
                                                                      2]), 1))
                            j = 0
                            for k in range(len(preAccMatrix[:, 2])):
                                if preAccMatrix[k, 2] == 1:
                                    accMatrix[j, 0:2] = preAccMatrix[k, 0:2]
                                    instOrder[j] = preInstOrder[k]
                                    j = j + 1
                        else:
                            accMatrix = preAccMatrix[:, 0:2]
                            instOrder = preInstOrder
                        # FIX: line 313
                        dict2 = USFS.accuracyCalculation(
                            accMatrix, lowClass, instOrder)
                        accuracies[0][i] = dict2.get('accuracy')
                        lowClassAccuracies[0][i] = dict2.get(
                            'lowClassAccuracy')
                        highClassAccuracies[0][i] = dict2.get(
                            'highClassAccuracy')
                        instMisclass[i] = dict2.get('instMisclass')

                        # FIX: line 318

                    maxAccIndex = numpy.argmax(accuracies)
                    maxAcc = numpy.amax(accuracies)

                    if (maxAccTracker[0] == 0) and (maxAccTracker[1] == 100):
                        maxAccTracker = numpy.array([0, maxAcc])
                    else:
                        maxAccTracker[0] = maxAccTracker[1]
                        maxAccTracker[1] = maxAcc

                    if (PCNoTOTAL == 1) and (
                        (maxAccTracker[1] - maxAccTracker[0]) > 1):
                        scoreBestPCs = numpy.column_stack(
                            (scoreBestPCs, scoreTOTAL))
                        bestPCIndex = numpy.hstack((bestPCIndex, pcIndexNo))

                        scoreTOTAL = []
                        pcIndexNo = []

                        lowClassAccuracy = lowClassAccuracies
                        highClassAccuracy = highClassAccuracies

                        finalInstMisclass = instMisclass[maxAccIndex]

                        accIncrTracker = numpy.hstack(
                            (accIncrTracker,
                             maxAccTracker[1] - maxAccTracker[0]))
                        maxAccuracy = maxAcc
                        maxAccTracker = numpy.matrix['0, 0.5']

                    lt += 1

                print("Out of loop")

                print(finalInstMisclass)
                #order = numpy.argsort(finalInstMisclass[:, 0])
                #for i in range(0, len(order)):
                #    finalInstMisclass = finalInstMisclass[order[i], :]
                finalInstMisclass = finalInstMisclass[:, 1]

                # FIX: curly brackets vs parentheses? lines 359-364
                accuracyOverall[z2][z1] = maxAccuracy
                accuracyFirstClass[z2][z1] = lowClassAccuracy
                accuracySecondClass[z2][z1] = highClassAccuracy
                bestPCS[z2][z1] = bestPCIndex
                accIncr[z2][z1] = accIncrTracker
                # FIX: 3d array where you can replace a whole column
                shapeSubjMisclassified = subjMisclassified[z2].shape
                for c in range(0, shapeSubjMisclassified[0]):
                    subjMisclassified[z2][c][1 + z1] = finalInstMisclass[c]
                #################################################

        maxVal = numpy.zeros(classifNo)
        for i in range(0, classifNo):
            for j in range(0, iterationLength):
                bestPCsShape = bestPCS[i][j].shape
                if bestPCsShape[0] > maxVal[i]:
                    maxVal[i] = bestPCsShape[0]

        bestPCsummary = [0] * classifNo

        # FIX: curly brackets vs parentheses? lines 387-391
        for i in range(0, classifNo):
            bestPCsummary[i] = numpy.zeros(
                (3 + maxVal[i], iterationLength * 2))
            bestPCsummary[i][0, 0:iterationLength] = accuracyOverall[i]
            bestPCsummary[i][1, 0:iterationLength] = accuracyFirstClass[i]
            bestPCsummary[i][2, 0:iterationLength] = accuracySecondClass[i]

        for i in range(0, iterationLength):
            for j in range(0, classifNo):
                x = 3
                bestPCsShape = bestPCS[j][i].shape
                for k in range(0, bestPCsShape[0]):
                    bestPCsummary[j][x][i] = bestPCS[j][i][k]
                    bestPCsummary[j][x][i + iterationLength] = accIncr[j][i][k]
                    x += 1

        for i in range(0, classifNo):
            summary_string = 'SummaryBestPCS_' + classifierType[i] + '.csv'
            misclassified_string = 'MisclassifiedSubjects_' + classifierType[
                i] + '.csv'
            file3 = open(summary_string, 'wb')
            file4 = open(misclassified_string, 'wb')
            numpy.savetxt(file3, bestPCsummary[i], delimiter=',')
            numpy.savetxt(file4, subjMisclassified[i], delimiter=',')
            file3.close()
            file4.close()
Пример #23
0
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlim([0, 10])
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

## plotted pca along 2 components

## 1st method through matplotlib
from matplotlib.mlab import PCA as mlabPCA

mlab_pca = mlabPCA(data)

print(
    'PC axes in terms of the measurement axes scaled by the standard deviations:\n',
    mlab_pca.Wt)

plt.plot(mlab_pca.Y[:, 0],
         mlab_pca.Y[:, 1],
         'o',
         markersize=7,
         color='blue',
         alpha=0.5,
         label='class1')

plt.xlabel('x_values')
plt.ylabel('y_values')
Пример #24
0
    # explained_variance_ratio = explained_variance / numpy.sum(explained_variance)
    print("Explained Variance Ratio" + str(explained_variance_ratio))
    # print("RP Score"+ str(result.score(traindata, y= None)))
    # print("RP Score")
    # print pca.score(df)
    



print '!!!!!!!!!!!!'
# # Graphical Representation of n = 3
# Fit the PCA analysis
result = PCA(n_components=3).fit(df)
from matplotlib.mlab import PCA as mlabPCA

mlab_pca = mlabPCA(df)

print('PC axes in terms of the measurement axes scaled by the standard deviations:\n', mlab_pca.Wt)

plt.plot(mlab_pca.Y[0:20,0],mlab_pca.Y[0:20,1], 'o', markersize=7, color='blue', alpha=0.5, label='class1')
plt.plot(mlab_pca.Y[20:40,0], mlab_pca.Y[20:40,1], '^', markersize=7, color='red', alpha=0.5, label='class2')

plt.xlabel('x_values')
plt.ylabel('y_values')
plt.xlim([-4,4])
plt.ylim([-4,4])
plt.legend()
plt.title('Transformed samples with class labels from matplotlib.mlab.PCA()')

plt.show()
Пример #25
0
         'o',
         markersize=8,
         color='orange',
         alpha=0.5,
         label='class1')
plt.plot(c2[0, :],
         c2[1, :],
         'o',
         markersize=8,
         alpha=0.5,
         color='green',
         label='class2')
plt.show()

twoClass = np.concatenate((c1, c2), axis=1)
PCA_F = mlabPCA(twoClass.T)
plt.figure(2)
plt.plot(PCA_F.Y[0:1000, 0],
         'o',
         markersize=7,
         color='orange',
         alpha=0.5,
         label='class1')
plt.plot(PCA_F.Y[1000:2000, 0],
         'o',
         markersize=7,
         color='green',
         alpha=0.5,
         label='class2')
plt.show()
Пример #26
0
np.random.seed(123456)  # this can be avoid to use a smaller seed

mu_vec1 = np.array([0, 0, 0])
cov_mat1 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20).T
assert class1_sample.shape == (3, 20)

mu_vec2 = np.array([1, 1, 1])
cov_mat2 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20).T
assert class2_sample.shape == (3, 20)

all_samples = np.concatenate((class1_sample, class2_sample), axis=1)
assert all_samples.shape == (3, 40)

mlab_pca = mlabPCA(all_samples.T)
print('mlab_pca :\n', mlab_pca.Wt)

plt.plot(mlab_pca.Y[0:20, 0],
         mlab_pca.Y[0:20, 1],
         'o',
         markersize=7,
         color='blue',
         alpha=0.5,
         label='class1')
plt.plot(mlab_pca.Y[20:40, 0],
         mlab_pca.Y[20:40, 1],
         'o',
         markersize=7,
         color='red',
         alpha=0.5,
sm.qqplot(comb[4], line='45')

## Principal component analysis

#### Principal component analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. The number of principal components is less than or equal to the number of original variables. This transformation is defined in such a way that the first principal component has the largest possible variance (that is, accounts for as much of the variability in the data as possible), and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to (i.e., uncorrelated with) the preceding components. The principal components are orthogonal because they are the eigenvectors of the covariance matrix, which is symmetric. PCA is sensitive to the relative scaling of the original variables.

##### The main purposes of a principal component analysis are the analysis of data to identify patterns and finding patterns to reduce the dimensions of the dataset with minimal loss of information.

# In[292]:

from matplotlib.mlab import PCA as mlabPCA

# In[295]:

mlab_pca = mlabPCA(train)
mlab_pca

# In[296]:

mlab_pca.Y

# In[298]:

mlab_pca.Y.shape

# In[299]:

PCAY = DataFrame(mlab_pca.Y)

# In[300]:
Пример #28
0
plt.plot(transformed[0,0:20], transformed[1,0:20], 'o', markersize=7, color='blue', alpha=0.5, label='class1')
plt.plot(transformed[0,20:40], transformed[1,20:40], '^', markersize=7, color='red', alpha=0.5, label='class2')
plt.xlim([-4,4])
plt.ylim([-4,4])
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.legend()
plt.title('Transformed samples with class labels')


plt.show()


from matplotlib.mlab import PCA as mlabPCA

mlab_pca = mlabPCA(all_samples.T)

print('PC axes in terms of the measurement axes scaled by the standard deviations:\n', mlab_pca.Wt)
fig = plt.figure(figsize=(7,7))
ax = fig.add_subplot(414)
plt.plot(mlab_pca.Y[0:20,0],mlab_pca.Y[0:20,1], 'o', markersize=7, color='blue', alpha=0.5, label='class1')
plt.plot(mlab_pca.Y[20:40,0], mlab_pca.Y[20:40,1], '^', markersize=7, color='red', alpha=0.5, label='class2')

plt.xlabel('x_values')
plt.ylabel('y_values')
plt.xlim([-4,4])
plt.ylim([-4,4])
plt.legend()
plt.title('Transformed samples with class labels from matplotlib.mlab.PCA()')

plt.show()
Пример #29
0
plt.plot(cluster_range, silhouette_curve, label='Silhouette Curve')
plt.legend()
plt.show()

# now choose optimal clusters and then plot via pca
num_clusters = 17
km = MiniBatchKMeans(init='k-means++', n_clusters=num_clusters)
km.fit_predict(small_sample)
clusters = km.labels_.tolist()
inertia = km.inertia_
inertia_curve.append(round(inertia, 4))
cluster_range = range(cluster)[1:]
labels = km.labels_

mlab_pca = mlabPCA(small_sample)

clusters_array = np.array(clusters)
clusters_array_2 = clusters_array.reshape(10000, 1)
d = np.concatenate((mlab_pca.Y, clusters_array_2), axis=1)

fig = plt.figure(figsize=(15, 5))
ax1 = fig.add_subplot(131, projection='3d')  # row-col-num
for num in range(cluster):
    plt.plot(d[d[:, 25] == num][:, 0],
             d[d[:, 25] == num][:, 1],
             d[d[:, 25] == num][:, 2],
             'o',
             markersize=7,
             color=colors[num],
             alpha=0.5)  #, label = labels)
def plotPCA(data,title,showNow,labels):
    """Scatter-plot the first two principal components of ``data``.

    Args:
        data: Observations handed unchanged to ``mlabPCA``.
        title: Passed to ``plt.figure`` to select or create the figure.
        showNow: When truthy, ``plt.show()`` is called before returning.
        labels: One value per observation, used as the colour of its point;
            converted to a float array before plotting.

    Returns:
        None. The plot is drawn on the figure identified by ``title``.
    """
    plt.figure(title)
    mlab_pca = mlabPCA(data)
    # The builtin ``float`` replaces the ``np.float`` alias, which no longer
    # exists in current NumPy; ``np.asarray`` also accepts plain sequences.
    point_colours = np.asarray(labels, dtype=float)
    plt.scatter(mlab_pca.Y[:, 0], mlab_pca.Y[:, 1], c=point_colours, alpha=1)
    if showNow:
        plt.show()
Пример #31
0
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('live.csv',encoding='gb2312')
print(data.head(5))

# iloc selects by integer position [rows, columns]; columns 1..7 are the
# features used by both PCA methods below.
features = data.iloc[:, 1:8]

# Method 1: scikit-learn PCA, keeping two components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
# fit_transform fits the model and projects the data in one pass, so the
# model is no longer fitted twice on the same input.
newdata = pca.fit_transform(features)
print(pca.explained_variance_ratio_)  # share of variance per component
print(newdata)

plt.scatter(newdata[:, 0], newdata[:, 1])
plt.show()

# Standardization: subtract the mean and divide by the standard deviation.

# Method 2: matplotlib.mlab PCA on standardized data.
from matplotlib.mlab import PCA as mlabPCA

live_pcl = mlabPCA(features, standardize=True)
# One 'P<n>' row label per selected feature column (P1..P7 for this slice).
component_names = ['P%d' % n for n in range(1, len(features.columns) + 1)]
# Convert the weight matrix to a DataFrame with labelled rows and columns,
# then transpose so that each original feature becomes a row.
live_eigenvector = pd.DataFrame(live_pcl.Wt,
                                index=component_names,
                                columns=features.columns)
live_eigenvector = live_eigenvector.T
print(live_eigenvector)
Пример #32
0
plt.plot(cluster_range, silhouette_curve, label = 'Silhouette Curve')
plt.legend()
plt.show()

# now choose optimal clusters and then plot via pca
num_clusters = 17
km = MiniBatchKMeans(init='k-means++', n_clusters=num_clusters)
km.fit_predict(small_sample)
clusters = km.labels_.tolist()
inertia = km.inertia_
inertia_curve.append(round(inertia,4))
cluster_range = range(cluster)[1:]
labels = km.labels_

mlab_pca = mlabPCA(small_sample)

clusters_array = np.array(clusters)
clusters_array_2 = clusters_array.reshape(10000,1)
d = np.concatenate((mlab_pca.Y, clusters_array_2), axis=1)

fig = plt.figure(figsize=(15,5))
ax1 = fig.add_subplot(131, projection='3d')  # row-col-num
for num in range(cluster):
    plt.plot(d[d[:,25]==num][:,0],d[d[:,25]==num][:,1],d[d[:,25]==num][:,2],'o', markersize=7, color=colors[num], alpha=0.5)#, label = labels)
    # plt.zlabel('z_values')
plt.title('PCA and k-means clustering, n=10,000 drugs')
plt.xlim([-4,4])
plt.ylim([-4,4])
ax2 = fig.add_subplot(132)  # row-col-num
for num in range(cluster):
Пример #33
0
import numpy as np
from matplotlib.mlab import PCA as mlabPCA
import matplotlib.pyplot as plt
from load_data import read_data
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d

# Each row returned by read_data is split below into its first element (the
# class label) and the remaining elements (the feature values).
all_samples = read_data("data/train1.csv")

y_train = np.array([row[0] for row in all_samples])
X_train = np.array([row[1:] for row in all_samples])

data_array = X_train
mlab_pca = mlabPCA(data_array)

# Row indices of the samples labelled 0 and 1 respectively.
Class0 = [idx for idx, label in enumerate(y_train) if label == 0]
Class1 = [idx for idx, label in enumerate(y_train) if label == 1]

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')

# Draw both classes on the first three principal components; only the row
# selection, marker, colour and legend label differ between the two calls.
projected = mlab_pca.Y
for members, marker, colour, legend in ((Class0, 'o', 'blue', 'class1'),
                                        (Class1, '^', 'red', 'class2')):
    ax.plot(projected[members, 0],
            projected[members, 1],
            projected[members, 2],
            marker,
            markersize=8,
            color=colour,
            alpha=0.5,
            label=legend)

plt.show()
Пример #34
0
        from sklearn.decomposition import sparse_encode   
        dl = sparse_coding(reducedDimension, dataArray_normalized, 0.2, 1000, 0.0001)
        code = sparse_encode(dataArray_normalized, dl.components_)
        data_reduced = code
        print 'Reduced data:'
        print data_reduced
        print 'Dictionary:'
        print dl.components_    
        print 'iteration:', dl.n_iter_
    elif 'PCA' in args['dimReductionType']:
        ####################################
        #   Principal Component Analysis   #
        ####################################
        from matplotlib.mlab import PCA as mlabPCA
        print 'PCA:'
        myPCA = mlabPCA(dataArray)
        data_reduced = myPCA.Y[:,0:reducedDimension]# reduce to the specified dimension
        print 'Raw data:'
        print dataArray
        print 'Reduced data:'
        print data_reduced
    else:
        print 'Error: No Reduction Method Specified!!!' 
    ####################################
	#  End of Dimensionality Reduction #
    ####################################
    print 'data_reduced dimension:', data_reduced.shape
    writeCache(args['outputDir']+outputFilename, data_reduced)
    writeTimestamp(args['outputDir']+'timestamp', t)
    print 'Output file:', outputFilename 
    print 'Done'

## Principal component analysis

#### Principal component analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. The number of principal components is less than or equal to the number of original variables. This transformation is defined in such a way that the first principal component has the largest possible variance (that is, accounts for as much of the variability in the data as possible), and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to (i.e., uncorrelated with) the preceding components. The principal components are orthogonal because they are the eigenvectors of the covariance matrix, which is symmetric. PCA is sensitive to the relative scaling of the original variables.

##### The main purposes of a principal component analysis are the analysis of data to identify patterns and finding patterns to reduce the dimensions of the dataset with minimal loss of information.

# In[292]:

from matplotlib.mlab import PCA as mlabPCA


# In[295]:

mlab_pca = mlabPCA(train)
mlab_pca


# In[296]:

mlab_pca.Y


# In[298]:

mlab_pca.Y.shape


# In[299]:
Пример #36
0
import numpy as np
from matplotlib.mlab import PCA as mlabPCA
import matplotlib.pyplot as plt
from load_data import read_data
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d

all_samples = read_data("data/train1.csv")

y_train = np.array([x[0] for x in all_samples])
X_train = np.array([x[1:] for x in all_samples])

data_array = X_train
mlab_pca = mlabPCA(data_array)

Class0 = [i for i in range(len(y_train)) if y_train[i] == 0]
Class1 = [i for i in range(len(y_train)) if y_train[i] == 1]

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.plot(mlab_pca.Y[Class0, 0],
        mlab_pca.Y[Class0, 1],
        mlab_pca.Y[Class0, 2],
        'o',
        markersize=8,
        color='blue',
        alpha=0.5,
        label='class1')
ax.plot(mlab_pca.Y[Class1, 0],
        mlab_pca.Y[Class1, 1],
        mlab_pca.Y[Class1, 2],
Пример #37
0
    print("Component Number: " + str(each))
    print("Components" + str(result.components_))
    print("Explained Variance" + str(result.explained_variance_))
    print("Explained Variance Ration" + str(result.explained_variance_ratio_))
    print("PCA Score" + str(result.score(traindata, y=None)))

    t1 = time.clock()
    timetaken = str(t1 - t0)
    print("Computation Time" + timetaken)

#Graphical Representation of n = 3
# Fit the PCA analysis
result = PCA(n_components=3).fit(traindata)
from matplotlib.mlab import PCA as mlabPCA

mlab_pca = mlabPCA(traindata)

# print('PC axes in terms of the measurement axes scaled by the standard deviations:\n', mlab_pca.Wt)
print(mlab_pca.Y.shape)
plt.plot(mlab_pca.Y[0:50, 0],
         mlab_pca.Y[0:50, 1],
         'o',
         markersize=7,
         color='blue',
         alpha=0.5,
         label='class1')
plt.plot(mlab_pca.Y[50:100, 0],
         mlab_pca.Y[50:100, 1],
         '^',
         markersize=7,
         color='red',
Пример #38
0
    def DimReduction(self, varToKeep, response_data, rawFeatureList):
        """Run PCA on every feature file and stack the retained scores.

        Each entry of ``rawFeatureList`` is z-scored and decomposed with
        ``mlabPCA(..., standardize=False)``. For every file the leading
        components are kept up to and including the first one at which the
        running sum of ``100 * fracs`` exceeds ``varToKeep`` (all components
        are kept if the sum never exceeds it).

        Side effects:
            * changes the working directory to
              ``Go.currentPath + Go.dataFiles + Go.outputFolder``;
            * replaces ``Go.featureList`` (z-scored inputs) and
              ``Go.featureListArray`` (left as an empty object array);
            * writes ``CoeffMatrix<file>`` and ``ScoreMatrix<file>`` for each
              name in ``Go.files``, plus ``PCNumCum.csv`` and
              ``PCScoreTotal.csv``, into the new working directory;
            * prints the number of retained components for each file.

        Args:
            varToKeep: Threshold compared against the cumulative
                ``100 * fracs`` values.
            response_data: Only its length is used; it sets the number of
                rows of the stacked score matrix.
            rawFeatureList: Indexable collection holding one feature matrix
                per file, ``Go.fileNum`` entries in total.

        Returns:
            dict with two keys: ``'VarianceIncluded'``, a string listing the
            cumulative variance reached for every file in order, and
            ``'scoreTotal'``, the retained score columns of all files placed
            side by side.
        """
        response_rowNum = len(response_data)
        Go.featureList = numpy.empty(Go.fileNum, dtype=numpy.ndarray)
        scoreList = [0] * Go.fileNum
        PCNoList = numpy.empty(Go.fileNum, dtype=int)
        coeffList = [0] * Go.fileNum
        os.chdir(Go.currentPath + Go.dataFiles + Go.outputFolder)
        Go.featureListArray = numpy.empty(Go.fileNum, dtype=numpy.ndarray)
        for i in range(Go.fileNum):
            Go.featureList[i] = stats.zscore(rawFeatureList[i])

        # Cumulative variance reached for each file, gathered inside the loop
        # so that every file is reported rather than only the last one.
        varianceKept = []
        for featNum in range(Go.fileNum):
            PCAobject = mlabPCA(Go.featureList[featNum], standardize=False)
            explained = 100 * PCAobject.fracs
            # Original author's note: the last column of coeff (and of score)
            # came out with flipped signs relative to their reference values.
            coeff = PCAobject.Wt.T
            score = PCAobject.Y

            # k ends on the index of the first component at which the running
            # total exceeds varToKeep, or on the last index if it never does.
            cumulative = 0
            k = 0
            for k, fraction in enumerate(explained):
                cumulative = cumulative + fraction
                if cumulative > varToKeep:
                    break
            varianceKept.append(str(cumulative))

            scoreList[featNum] = score[:, 0:k + 1]
            coeffList[featNum] = coeff[:, 0:k + 1]
            PCNoList[featNum] = k + 1

            # numpy.savetxt opens and closes the target itself when handed a
            # file name, so no separate file handle is opened here.
            numpy.savetxt('CoeffMatrix' + Go.files[featNum],
                          coeffList[featNum],
                          delimiter=",")
            numpy.savetxt('ScoreMatrix' + Go.files[featNum],
                          scoreList[featNum],
                          delimiter=",")

        PCNumTOTAL = sum(PCNoList)
        PCNumCum = numpy.cumsum(PCNoList)
        numpy.savetxt('PCNumCum.csv', PCNumCum, delimiter=",")

        scoreTotal = numpy.zeros((response_rowNum, PCNumTOTAL))

        # Copy each file's retained score columns into the next free columns.
        x = 0
        for i in range(0, Go.fileNum):
            numColScoreList = len(scoreList[i][0])
            print(numColScoreList)
            scoreTotal[:, x:x + numColScoreList] = scoreList[i]
            x += numColScoreList

        numpy.savetxt('PCScoreTotal.csv', scoreTotal, delimiter=",")

        VarianceIncluded = "Variance Included is: " + ", ".join(varianceKept)

        return {'VarianceIncluded': VarianceIncluded, 'scoreTotal': scoreTotal}