Example #1
def main():
    # get dataset
    data = pd.read_pickle("cluster.pkl")
    data_np = data.values

    # get mean
    mean = PCA.calc_mean(data_np)
    U = PCA.getU("PCA_eigen_cluster.pkl")
    # get error for data space
    error = []
    featureSpace = []
    prevError = sys.maxint
    reconstructError = 0.0
    k = 0
    # find smallest feature space to reduce data set
    for k in range(10):
        print "k: " + str(k)
        prevError = reconstructError
        newSpace, eigen_vectors = PCA.reduce(data_np, k, U.values, mean)
        reconstructError = PCA.reconstruction_error(newSpace, data_np, eigen_vectors, mean, k)
        print "reconstr error: " + str(reconstructError)
        error.append(reconstructError)
        featureSpace.append(k)
    print "Smallest feature space size: " + str(k)
    plt.plot(featureSpace, error, marker=".")
    plt.ylabel("Reconstruction Error")
    plt.xlabel("Size of Reduced Feature Space")
    plt.title("Size of Reduced Feature Space vs Reconstruction Error")
    plt.savefig("Error for PCA Cluster")
Example #2
def main():
    global group_num
    # read k (number of clusters) from the command line
    k = int(sys.argv[1])
    print 1
    print 2
    train = pd.read_pickle("cluster.pkl")
    reduced_train = PCA.reduce(train.values, 50,
                               PCA.getU("PCA_eigen_cluster.pkl").values,
                               PCA.calc_mean(train.values))
    print 3
    cluster_center, cluster_idx = cluster(reduced_train, k)
    print 4
    print cluster_center
    print cluster_center.shape
    print cluster_idx
    print 5
    articles = train.index.values
    groupings = {}
    for i in range(k):
        group_num = i
        b = np.apply_along_axis(isInGroup, 0, cluster_idx)
        groupings[i] = articles[b]
    print(groupings)
    for key in groupings:
        print groupings[key].shape
Example #3
    def confirm_PCA(self, event):

        self.fig_PCA.clear()
        self.checkedPCAStrings = self.PCA_selection.GetCheckedStrings()
        self.pca_color = self.color_on_pca.GetValue()
        self.pca_shape = self.shape_on_pca.GetValue()

        self.key_headers_PCA = [
            self.x_axis_selection.GetValue(),
            self.y_axis_selection.GetValue(), self.checkedPCAStrings,
            self.pca_color,
            self.size_slider_PCA.GetValue(), self.pca_shape,
            self.label_points_pca.GetValue()
        ]
        if self.data_been_filtered:
            self.fig_PCA = pca.pca_(self.dataFiltered, self.key_headers_PCA,
                                    self.fig_PCA, self.CLR_check.GetValue(),
                                    self.arrows_check.GetValue(),
                                    self.samples_check.GetValue(),
                                    self.colordict, self.shapedict)
        else:

            self.fig_PCA = pca.pca_(self.data, self.key_headers_PCA,
                                    self.fig_PCA, self.CLR_check.GetValue(),
                                    self.arrows_check.GetValue(),
                                    self.samples_check.GetValue(),
                                    self.colordict, self.shapedict)
        self.PCA_plot = self.fig_PCA
        self.canvas2.draw()
        self.confirm_btn_PCA.SetLabelText("Update Graph")
        self.PCA_button.Enable(True)
Example #4
def eigenface(trainData, testData, dataVariety):
    # standardize train data
    dropTrainData = trainData.drop("variety", axis=1)
    trainMean = dropTrainData.sum()
    trainMean = trainMean.values.reshape([dropTrainData.shape[1], 1])
    trainMean = trainMean / dropTrainData.shape[0]
    newtrainData = PCA.normalize(trainData, trainMean)

    # calculate xT * x and its eigenvector
    normTrainData = newtrainData.drop("variety", axis=1)
    normTrainData = np.array(normTrainData)
    X = np.transpose(normTrainData)

    tempMat = np.zeros([X.shape[1], X.shape[1]])
    np.matmul(np.transpose(X), X, tempMat)
    eigValX, eigVecX = np.linalg.eigh(tempMat)

    # calculate X * eigenvector
    newEigVecX = np.zeros([X.shape[0], eigVecX.shape[1]])
    newEigVecX = np.matmul(X, eigVecX)

    # normalize eigenvector
    newEigVecX = np.transpose(newEigVecX)
    length = np.linalg.norm(newEigVecX, axis=1)
    for i in range(newEigVecX.shape[0]):
        newEigVecX[i] /= length[i]
    normEigVec = np.transpose(newEigVecX)


    # calculate A
    L = 20
    maxEigIdx = np.argsort(-eigValX)
    A = []
    for i in range(L):
        A.append(normEigVec[:, maxEigIdx[i]])
    A = np.array(A)
    A = np.transpose(A)

    newtestData = PCA.normalize(testData, trainMean)

    # projection of train data
    projTrainFrame = PCA.project(A, newtrainData)

    # projection of test data
    projTestFrame = PCA.project(A, newtestData)

    # # classify test data by likelihood
    # g1, testIdx1, success1, confusion_mat1 = Likelihood.likelihood(projTrainFrame, projTestFrame, dataVariety)
    # Header.calAccuracy(success1, projTestFrame)
    # Header.ROC_AUC(projTestFrame, dataVariety, g1, testIdx1)
    # Header.drawConfusionMat(confusion_mat1, dataVariety)

    # classify test data by bayes
    names = []
    for i in range(projTestFrame.shape[1] - 1):
        names.append('0')
    names.append('variety')
    g2, testIdx2, success2, confusion_mat2 = Bayes.bayes(projTrainFrame, projTestFrame, dataVariety, names)
    Header.calAccuracy(success2, projTestFrame)
    Header.drawConfusionMat(confusion_mat2, dataVariety)
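This is the standard eigenface shortcut: with far fewer samples than pixels, the code diagonalises the small matrix X^T X and maps its eigenvectors back through X, since (X^T X)v = lambda*v implies (X X^T)(Xv) = lambda*(Xv). A small self-contained numpy check of that identity (not part of the example's PCA module):

import numpy as np

# Verify that eigenvectors of the small matrix X^T X, pushed through X and
# normalised, are eigenvectors of the large matrix X X^T with the same eigenvalues.
rng = np.random.default_rng(0)
X = rng.standard_normal((784, 20))                # features x samples, few samples

vals_small, vecs_small = np.linalg.eigh(X.T @ X)  # 20 x 20 eigenproblem
big_vecs = X @ vecs_small                         # candidate eigenvectors of X X^T
big_vecs /= np.linalg.norm(big_vecs, axis=0)      # normalise each column

residual = (X @ X.T) @ big_vecs - big_vecs * vals_small
print(np.max(np.abs(residual)))                   # ~0 up to floating-point error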
Example #5
def main(runIndex=None):
    print("Starting Main.main()")

    # if the required directory structure doesn't exist, create it
    makeDirectoryStructure(address)

    # now start the GMM process
    Load.main(address, filename_raw_data, runIndex, subsample_uniform,\
              subsample_random, subsample_inTime, grid, conc, \
              fraction_train, inTime_start, inTime_finish,\
              fraction_nan_samples, fraction_nan_depths, cov_type,\
              run_bic=False)

    # loads data, selects train, cleans, centres/standardises, prints
    PCA.create(address, runIndex, n_dimen, use_fPCA)
    GMM.create(address, runIndex, n_comp, cov_type)
    PCA.apply(address, runIndex)
    GMM.apply(address, runIndex, n_comp)

    # reconstruction (back into depth space)
    Reconstruct.gmm_reconstruct(address, runIndex, n_comp)
    Reconstruct.full_reconstruct(address, runIndex)
    Reconstruct.train_reconstruct(address, runIndex)

    # calculate properties
    mainProperties(address, runIndex, n_comp)
Example #6
    def pca(self, X, dim=25):
        """
        进行PCA降维
        :param X: 图片
        :param dim: 将维后图片维度
        """

        pca = PCA(X)
        output = pca.reduction(dim=25)

        return output
Example #7
def pcaOnMnist(training, dimension=700):
  principalComponents = PCA.pca(training, dimension)
  low, same = PCA.reduce(principalComponents, training)

  image2DInitial = vectorToImage(training[0], (28,28))
  print same[0].shape
  image2D = vectorToImage(same[0], (28,28))

  plt.imshow(image2DInitial, cmap=plt.cm.gray)
  plt.show()
  plt.imshow(image2D, cmap=plt.cm.gray)
  plt.show()
  print "done"
Example #8
def prepareFMNISTData(scale=0, PCA_threshold=-1, Whitening=0, PCA_p=None):
    mndata = MNIST('fashion_data')
    imagesTrain, labelsTrain = mndata.load_training()
    imagesTest, labelsTest = mndata.load_testing()

    X_test = np.array(imagesTest)
    y_test = np.array(labelsTest)

    n = len(imagesTrain)
    np.random.seed(RANDOM_SEED)
    indices = np.random.permutation(n)

    trainingIndex = indices[:int(4 * n / 5)]
    validationIndex = indices[int(4 * n / 5):]

    X_train = np.array(imagesTrain)[trainingIndex]
    y_train = np.array(labelsTrain)[trainingIndex]

    X_val = np.array(imagesTrain)[validationIndex]
    y_val = np.array(labelsTrain)[validationIndex]

    if (PCA_threshold != -1):

        [Z_train, p, Xr, U, W] = PCA(X_train, PCA_threshold)
        if PCA_p is not None: p = PCA_p
        [Z_test, Xr] = project(X_test, U, p)
        [Z_val, Xr] = project(X_val, U, p)
        X_train = Z_train[:, :p]
        X_val = Z_val[:, :p]
        X_test = Z_test[:, :p]
        print("PCA_Threshold = " + str(PCA_threshold) + ", P = " + str(p))

    if (scale == 1):
        mean = np.mean(X_train, axis=0)
        X_train = X_train - mean
        X_test = X_test - mean
        X_val = X_val - mean

        variance = np.var(X_train, axis=0)
        X_train = X_train / np.sqrt(variance)
        X_test = X_test / np.sqrt(variance)
        X_val = X_val / np.sqrt(variance)

    if (Whitening == 1):
        [Z, p, X3, U, W] = PCA(X_train, 1.0)
        X_train = whiteningTransform(X_train, W, U)
        X_test = whiteningTransform(X_test, W, U)
        X_val = whiteningTransform(X_val, W, U)

    return (X_train, y_train, X_val, y_val, X_test, y_test)
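The PCA(X_train, PCA_threshold) call apparently returns, among other things, the number of components p needed to reach an explained-variance threshold, and project maps new data into that basis. A hedged, self-contained sketch of the same idea in plain numpy; the names and return values are assumptions, not the project's actual API:

import numpy as np

def pca_threshold(X, threshold):
    # Centre the data and use the SVD of the centred matrix.
    mean = X.mean(axis=0)
    Xc = X - mean
    _, s, Vt = np.linalg.svd(Xc, full_matrices=False)
    ratio = np.cumsum(s ** 2) / np.sum(s ** 2)
    # Smallest p whose cumulative explained variance reaches the threshold.
    p = int(min(np.searchsorted(ratio, threshold) + 1, len(s)))
    Z = Xc @ Vt.T                     # scores in the principal-component basis
    return Z, p, Vt, mean

def project_like(X, Vt, mean, p):
    # Project new data onto the first p principal directions.
    return (X - mean) @ Vt[:p].T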
Example #9
def main():
    train = pd.read_pickle("cluster.pkl")
    reduced_data = PCA.reduce(train.values, 50,
                              PCA.getU("PCA_eigen_cluster.pkl").values,
                              PCA.calc_mean(train.values))
    heterogeneity_k_means = []
    heterogeneity_spectral = []
    ks = range(1, 51)
    spectral_laplacian = spectral.setup(train.values)
    for k in ks:
        print "k: " + str(k)
        bestSSD_k_means = sys.maxint
        bestSSD_spectral = sys.maxint
        spectral_eigen = spectral.computeEigen(spectral_laplacian, k)
        # do clustering 5 times for each k and keep the best SSD
        for i in range(5):
            print "i: " + str(i)
            print "k_means"
            cluster_center_k_means, cluster_idx_k_means = k_means.cluster(
                reduced_data, k)
            ssd_k_means = SSD(reduced_data, cluster_center_k_means,
                              cluster_idx_k_means)
            if ssd_k_means < bestSSD_k_means:
                bestSSD_k_means = ssd_k_means
            print "Spectral"
            cluster_center_spectral, cluster_idx_spectral = spectral.cluster(
                spectral_eigen, k)
            ssd_spectral = SSD(spectral_eigen, cluster_center_spectral,
                               cluster_idx_spectral)
            if ssd_spectral < bestSSD_spectral:
                bestSSD_spectral = ssd_spectral
        # append best ssd
        heterogeneity_k_means.append(bestSSD_k_means)
        heterogeneity_spectral.append(bestSSD_spectral)
    plt.figure(1)
    plt.plot(ks, heterogeneity_k_means, marker=".")
    plt.ylabel("Heterogeneity")
    plt.xlabel("k")
    plt.title("k vs Heterogeneity for k means")
    plt.xticks(np.arange(0, max(ks), 2.0))
    plt.savefig("heterogeneity_k_means_cluster.png")
    plt.figure(2)
    plt.plot(ks, heterogeneity_spectral, marker=".")
    plt.ylabel("Heterogeneity")
    plt.xlabel("k")
    plt.title("k vs Heterogeneity for spectral")
    plt.xticks(np.arange(0, max(ks), 2.0))
    plt.savefig("heterogeneity_spectral_cluster.png")
Example #10
def loadProjectPCA():
    saveFileName = askopenfilename(initialdir = "/",title = "Select file",filetypes = (("Phenotype Files","*"),("all files","*.*")))
    saveFileObject = open(saveFileName)

    i = 0
    PCADataRead = saveFileObject.readline()
    PCADataRead = PCADataRead.strip()

    #reads through the text file and takes out the save data
    while i < 5:
        if(i == 0):
            PCAPhenoRead = saveFileObject.readline()
            PCAPhenoRead = PCAPhenoRead.rstrip('\n')
        if(i == 1):
            columnPCAEvec1read = saveFileObject.readline()
            columnPCAEvec1read = columnPCAEvec1read.strip()
        if(i == 2):
            columnPCAEvec2read = saveFileObject.readline()
            columnPCAEvec2read = columnPCAEvec2read.strip()
        if(i == 3):
            columnPCAEvec3read = saveFileObject.readline()
            columnPCAEvec3read = columnPCAEvec3read.strip()
        if(i == 4):
            columnPCAPhenoRead = saveFileObject.readline()
            columnPCAPhenoRead = columnPCAPhenoRead.strip()
        i = i + 1

    #Creates a new plot when loaded
    PCAPlotterLoad = PCA.PCAPlotter()
    PCAPlotterLoad.readFile1(PCAPhenoRead)
    PCAPlotterLoad.readFile2(PCADataRead)
    PCAPlotterLoad.connectFilesAddColour(int(columnPCAEvec1read), int(columnPCAEvec2read), int(columnPCAEvec3read), int(columnPCAPhenoRead))
    PCAPlotterLoad.plotGraph()
Example #11
    def prepare_data(self, test_data_perc=0.2):
        self.data = shuffle(self.data)
        self.data_np_arr1 = self.data.values
        self.features = np.shape(self.data_np_arr1)[1] - 1

        if self.pca_decompose:
            self.PCAObj = PCA.PCADecompose(self.num_feature_to_decompose)
            d_x = self.data_np_arr1[:, 0:self.features]
            d_y = self.data_np_arr1[:, self.features:]
            data_new = self.PCAObj.transform_data(d_x, d_y)
            self.data_np_arr = data_new
            self.features = np.shape(self.data_np_arr)[1] - 1
        else:
            self.data_np_arr = self.data_np_arr1

        train, test = train_test_split(self.data_np_arr,
                                       test_size=test_data_perc)
        self.X_train = train[:, 0:self.features]
        self.Y_train = train[:, self.features:]
        validation, test = train_test_split(test, test_size=0.5)
        self.X_validation = validation[:, 0:self.features]
        self.Y_validation = validation[:, self.features:]
        self.X_test = test[:, 0:self.features]
        self.Y_test = test[:, self.features:]

        print("X Train shape ", np.shape(self.X_train))
        print("Y Train shape ", np.shape(self.Y_train))
        print("X Validation shape ", np.shape(self.X_validation))
        print("Y Validation shape ", np.shape(self.Y_validation))
        print("X Test shape ", np.shape(self.X_test))
        print("Y Test shape ", np.shape(self.Y_test))
Example #12
    def treat_data(self, data):

        data, name = data.drop(
            ['participant'], axis=1).as_matrix(), data['participant'].tolist()
        data = PCA.PCA(data)  # PCA it

        return data, name
Example #13
def callPCA(dataset, attrNum, k):
    X, y = g.splitXandY(dataset, attrNum, len(dataset))
    print(k)
    finalData, reconMat = PCA.pca(X, k)

    # PCA.plotBestFit(finalData, reconMat, y)
    return np.hstack((finalData, y)), np.hstack((reconMat, y))
Example #14
def read(PCA_v = True, covariances = None, begin=0, end=10):
  prev = None
  X    = None
  Y    = None

  for i in range(begin,end):
      sample  = dirList_2[i][-10:]
      mat_string = dirList_2[i] + sample + '.mat'
      arousal_string = dirList_2[i] + sample
      print("Working on sample nr: ",  i, )
      
      x = sio.loadmat(mat_string)['val']

      if (PCA_v):
        ann = wfdb.rdann(arousal_string, 'arousal')
        prev = PCA.get_matrices(x,ann,prev)
      
      else:
        arousal_string += '-arousal.mat'     
        f = h5py.File(arousal_string, 'r')
        y = f['data']['arousals'][:]
        X,Y = extract_features(x,np.transpose(y),covariances,X,Y)
  print("-----------------------")

  if(PCA_v):
    return prev
  return X,Y      
Example #15
def construct_mnist():
    # number of principal components
    K = 1
    # handwritten digit to select
    num = 9
    # number of samples
    N = 100
    print('read from MNIST_test.txt...')
    data = np.loadtxt('dataset/MNIST_test.txt', delimiter=',')
    # split labels and features
    Y = data[:, 0]
    X = data[:, 1:]
    ###### single digit ######
    # get all indices of the chosen handwritten digit
    indices = np.argwhere(Y == num)
    # take the first N samples of that digit
    X_n = X[indices][:N]
    # show the original images
    slice_imgs(X_n, 'original')

    # PCA: reduce to K components and reconstruct the features
    X_n_k, re_X_n = PCA(np.asarray(X_n).reshape((N, 784)), K)

    # show the reconstructed images
    slice_imgs(np.real(re_X_n), 'reconstruct')

    # signal-to-noise ratio of each image
    print('SNR of each picture...')
    print([compute_SNR(X_n[i], re_X_n[i]) for i in range(N)])
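compute_SNR is the script's own helper; a minimal sketch of a per-image signal-to-noise ratio in dB, treating the reconstruction residual as noise (an assumption about what the original computes):

import numpy as np

def compute_SNR(original, reconstructed):
    # SNR in dB: signal power divided by the power of the reconstruction error.
    original = np.asarray(original, dtype=float).ravel()
    reconstructed = np.asarray(reconstructed, dtype=float).ravel()
    noise = original - reconstructed
    return 10.0 * np.log10(np.sum(original ** 2) / np.sum(noise ** 2))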
Example #16
def preprocess(feature_abstract_method):
    # X_raw = raw_data.iloc[:, 1:]
    # y_raw = raw_data['label']
    # X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2)
    # X_train.to_csv('x_train.csv')
    # X_test.to_csv('x_test.csv')
    # y_train.to_csv('y_train.csv')
    # y_test.to_csv('y_test.csv')
    X_train = pd.read_csv('x_train.csv', index_col=0)
    X_test = pd.read_csv('x_test.csv', index_col=0)
    y_train = pd.read_csv('y_train.csv', index_col=0, header=None)
    y_test = pd.read_csv('y_test.csv', index_col=0, header=None)
    if (feature_abstract_method == 'LBP'):
        X_train = LBP.lbp_extract(X_train)
        X_test = LBP.lbp_extract(X_test)
    elif (feature_abstract_method == 'PCA'):
        X_train, X_test = PCA.PCA_extract(X_train, X_test)
    elif (feature_abstract_method == 'skeleton'):
        X_train = SKELETON.skeleton_extract(X_train)
        X_test = SKELETON.skeleton_extract(X_test)
    elif (feature_abstract_method == 'grid'):
        X_train = GRID.grid_extract(X_train)
        X_test = GRID.grid_extract(X_test)
    elif (feature_abstract_method == 'hog'):
        X_train = HOG.hog_extract(X_train)
        X_test = HOG.hog_extract(X_test)
    return X_train, X_test, y_train, y_test
def get_feature_mattrix():
    meal_files = [
        'MealNoMealData/mealData1.csv', 'MealNoMealData/mealData2.csv',
        'MealNoMealData/mealData3.csv', 'MealNoMealData/mealData4.csv',
        'MealNoMealData/mealData5.csv',
    ]


    meal_data = parse_and_interpolate(meal_files)


    data = meal_data[0]
    fft_features = get_fft_features(data)
    entropy_feature = get_entropy(data)
    moving_avg_features = moving_avg(data)
    normal_skew_feature = normal_skew(data)

    for index in range(1, len(meal_data)):
        data = meal_data[index]

        fft_features = np.concatenate((fft_features, get_fft_features(data)), axis=0)
        moving_avg_features = np.concatenate((moving_avg_features, moving_avg(data)), axis=0)
        entropy_feature = np.concatenate((entropy_feature, get_entropy(data)), axis=0)
        normal_skew_feature = np.concatenate((normal_skew_feature, normal_skew(data)), axis=0)

    feature_mattrix = np.concatenate((moving_avg_features, entropy_feature, fft_features, normal_skew_feature), axis=1)
    np.set_printoptions(suppress=True)

    PCA = p.cal_PCA()
    feature_mattrix = PCA.performPCA(feature_mattrix)

    return feature_mattrix, PCA
Example #18
def getPC(coords, outFileName='ligBox.pdb'):
    size = coords.size
    shape = coords.shape
    if shape != (1, 3):
        if size != 0:
            eigenVectors, eigenValues = PCA.princomp(coords.T,
                                                     numpc=3,
                                                     getEigenValues=True)
            com = coords.mean(axis=0)
            projection = numpy.dot(coords - com, eigenVectors)
            signs = numpy.sign(numpy.sign(projection).sum(axis=0))
            signs2 = numpy.sign(
                projection[numpy.abs(projection).argmax(axis=0)].diagonal())
            signs[signs == 0] = signs2[signs == 0]
            eigenVectors = eigenVectors * signs
            vectors = com + eigenVectors.T * numpy.atleast_2d(
                numpy.sqrt(eigenValues)).T
        elif size == 0:
            com = numpy.zeros((3))
            vectors = numpy.zeros((3, 3))
    else:
        com = coords.flatten()
        vectors = numpy.zeros((3, 3))


# pdbBoxWriter(com, vectors, outFileName)
    return com, vectors
Example #19
def draw_2d():
    x2 = PCA(data_set.x, 2)

    plt.figure()
    plt.scatter(x2[0, :50],
                x2[1, :50],
                marker='x',
                color='m',
                s=30,
                label='Iris-setosa')
    plt.scatter(x2[0, 50:100],
                x2[1, 50:100],
                marker='+',
                color='c',
                s=50,
                label='Iris-versicolor')
    plt.scatter(x2[0, 100:150],
                x2[1, 100:150],
                marker='o',
                color='r',
                s=15,
                label='Iris-virginica')
    plt.legend()
    plt.title('PCA of IRIS k = 2')
    plt.show()
def small_data_test():
    test_1 = PCA.pca_function([[1, 0, 1, 1], [0, 1, 2, 0], [1, 1, 2, 0], [0, 1, 2, 1]], 2)
    expected = [[0.8333, -0.5, -0.1666, -0.1666], [0.0, 0.0, 0.7071, -0.7071]]
    # compare the computed projection against the expected result, with rounding tolerance
    if np.allclose(test_1, expected, atol=1e-3):
        return "All OK"
    else:
        return "something went wrong"
Example #21
def main():
    data = pd.read_csv('/Users/bytedance/Desktop/AI/data/wine.data.csv')
    label = data["0"].to_numpy()
    del data["0"]

    data = data / data.max(axis=0)  # normalize
    data = data.to_numpy()

    # PCA
    K = 3
    for thresh in [0.9, 0.8, 0.7, 0.6, 0.5]:
        new_data, _, _ = PCA.PCA(data.T, 2, True, thresh)

        ndim = new_data.shape[1]
        print(
            f"======== kmeans, K = {K}, ndim = {ndim}, thresh = {thresh} ========="
        )

        if ndim == 2:
            plt.figure(1)
            plt.scatter(new_data[:, 0], new_data[:, 1], s=50)

        S, RI, predicted_label = Kmeans.test_kmeans(new_data, label, K)
        df_data = pd.DataFrame(new_data)
        df_label = pd.DataFrame(predicted_label)
        result_df = pd.concat([df_label, df_data], axis=1)
        result_df.to_csv(f"./result_ndim{ndim}_K{K}.csv")
Example #22
File: EXTRAS.py Project: zerozzl/MLStudy
def plotTestSet3(filepath):
    n = 1000  # number of points to create
    xcord0 = []; ycord0 = []
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    markers = []
    colors = []
    fw = open(filepath, 'w')
    for i in range(n):
        groupNum = int(3 * numpy.random.uniform())
        [r0, r1] = numpy.random.standard_normal(2)
        if groupNum == 0:
            x = r0 + 16.0
            y = 1.0 * r1 + x
            xcord0.append(x)
            ycord0.append(y)
        elif groupNum == 1:
            x = r0 + 8.0
            y = 1.0 * r1 + x
            xcord1.append(x)
            ycord1.append(y)
        elif groupNum == 2:
            x = r0 + 0.0
            y = 1.0 * r1 + x
            xcord2.append(x)
            ycord2.append(y)
        fw.write("%f\t%f\t%d\n" % (x, y, groupNum))
    
    fw.close()
    fig = plt.figure()
    ax = fig.add_subplot(211)
    ax.scatter(xcord0, ycord0, marker='^', s=90)
    ax.scatter(xcord1, ycord1, marker='o', s=50, c='red')
    ax.scatter(xcord2, ycord2, marker='v', s=50, c='yellow')
    ax = fig.add_subplot(212)
    myDat = PCA.loadDataSet(filepath)
    lowDDat, reconDat = PCA.pca(myDat[:, 0:2], 1)
    label0Mat = lowDDat[numpy.nonzero(myDat[:, 2] == 0)[0], :2][0]  # get the items with label 0
    label1Mat = lowDDat[numpy.nonzero(myDat[:, 2] == 1)[0], :2][0]  # get the items with label 1
    label2Mat = lowDDat[numpy.nonzero(myDat[:, 2] == 2)[0], :2][0]  # get the items with label 2
    # ax.scatter(label0Mat[:,0],label0Mat[:,1], marker='^', s=90)
    # ax.scatter(label1Mat[:,0],label1Mat[:,1], marker='o', s=50,  c='red')
    # ax.scatter(label2Mat[:,0],label2Mat[:,1], marker='v', s=50,  c='yellow')
    ax.scatter(label0Mat[:, 0], numpy.zeros(numpy.shape(label0Mat)[0]), marker='^', s=90)
    ax.scatter(label1Mat[:, 0], numpy.zeros(numpy.shape(label1Mat)[0]), marker='o', s=50, c='red')
    ax.scatter(label2Mat[:, 0], numpy.zeros(numpy.shape(label2Mat)[0]), marker='v', s=50, c='yellow')
    plt.show()
Example #23
def predict(data, components):
    pca, train_features, train_results, test_features, test_results, values = PCA.transform(
        components)
    clf = svm.SVC(kernel="rbf", gamma='auto', probability=True)
    PCA_data = pca.transform(data)
    clf.fit(train_features, train_results)
    outcome = clf.predict_proba(PCA_data)
    return (outcome, test_features, test_results, values)
Example #24
def predict(data, components):
    pca, train_features, train_targets, test_features, test_results, values = PCA.transform(
        components)
    model = GaussianNB()
    # Train the model using the training sets
    model.fit(train_features, train_targets)
    PCA_data = pca.transform(data)
    predicted = model.predict_proba(PCA_data)
    return (predicted, test_features, test_results, values)
Example #25
def main():
    global group_num
    k = int(sys.argv[1])
    train = pd.read_pickle("tfidf_small.pkl")
    reduced_data = PCA.reduce(train.values, 50, PCA.getU("PCA_eigen_cluster.pkl").values, PCA.calc_mean(train.values))
    laplacian = setup(train.values)
    eigen_vectors = computeEigen(laplacian, k)
    cluster_center, cluster_idx = cluster(eigen_vectors, k)
    # display the data:
    articles = train.index.values
    groupings = {}
    for i in range(k):
        group_num = i
        b = np.apply_along_axis(isInGroup, 0, cluster_idx)
        groupings[i] = articles[b]
    print(groupings)
    for key in groupings:
        print groupings[key].shape
Example #26
def main(run=None):
    print("Starting Main.main()")  
    
    # Now start the GMM process
    Load.main(address, dir_raw_data, run, subsample_uniform, subsample_random,\
               subsample_inTime, grid, conc, fraction_train, inTime_start,\
               inTime_finish, fraction_nan_samples, fraction_nan_depths, dtype)
               
    #Load.main(address, filename_raw_data, run, subsample_uniform, subsample_random,\
        # Loads data, selects Train, cleans, centres/standardises, prints
    
    PCA.create(address, run, n_dimen)     # Uses Train to create PCA, prints results, stores object
    GMM.create(address, run, n_comp)      # Uses Train to create GMM, prints results, stores object
   
    PCA.apply(address, run)               # Applies PCA to test dataset     
    GMM.apply(address, run, n_comp)       # Applies GMM to test dataset
    
    # Reconstruction
    Reconstruct.gmm_reconstruct(address, run, n_comp)  # Reconstructs the results in original space
    Reconstruct.full_reconstruct(address, run)
    Reconstruct.train_reconstruct(address, run)

    # new stuff DD 27/08/18, after seeing updates on DJ github
    #mainProperties(address, runIndex, n_comp)

    
    # Plotting -- first commented out DD
    #Plot.plotMapCircular(address, address_fronts, run, n_comp)
    
    #Plot.plotPosterior(address, address_fronts, run, n_comp, plotFronts=True)
    Plot.plotPostZonal(address, run, n_comp, dtype, plotFronts=False) ## zonal frequencies
    #Plot.plotPosterior(address, run, n_comp, dtype, plotFronts=False) ## works but data overlaps spatially...

    Plot.plotProfileClass(address, run, n_comp, dtype, 'uncentred')
    Plot.plotProfileClass(address, run, n_comp, dtype, 'depth')

    Plot.plotGaussiansIndividual(address, run, n_comp, dtype, 'reduced')#uncentred')#'depth')#reduced')
#    Plot.plotGaussiansIndividual(address, run, n_comp, 'depth') # ERROR NOT WORKING PROPERLY
#    Plot.plotGaussiansIndividual(address, run, n_comp, 'uncentred') # ERROR NOT WORKING PROPERLY
    
    #Plot.plotProfile(address, run, dtype, 'original') # these run just fine but are huge and unhelpful
    Plot.plotProfile(address, run, dtype, 'uncentred')
    
    Plot.plotWeights(address, run, dtype)
Example #27
    def confirm_scatter(self, event):
        self.fig_scatter.clear()
        self.scatter_color = self.color_on_scatter.GetValue()
        self.scatter_shape = self.shape_on_scatter.GetValue()

        size = self.size_slider_scatter.GetValue()
        self.key_headers_scatter = [
            self.scatter_color,
            self.x_name_scatter.GetValue(),
            self.y_name_scatter.GetValue(),
            self.z_name_scatter.GetValue(),
            self.x1_name_scatter.GetValue(),
            self.y1_name_scatter.GetValue(),
            self.z1_name_scatter.GetValue(), size, self.scatter_shape,
            self.label_points_scatter.GetValue()
        ]
        limits = [
            self.xLowLim.GetValue(),
            self.xUpLim.GetValue(),
            self.yLowLim.GetValue(),
            self.yUpLim.GetValue()
        ]
        log_scales = [
            self.scatter_log_x.GetValue(),
            self.scatter_log_y.GetValue()
        ]
        if self.data_been_filtered:
            if not len(self.x_name_scatter.GetValue()) == 0 and not len(
                    self.y_name_scatter.GetValue()) == 0:
                self.fig_scatter = pca.blank_scatter_plot(
                    self.dataFiltered, self.key_headers_scatter, limits,
                    self.fig_scatter, log_scales, self.colordict,
                    self.shapedict)
        else:
            if not len(self.x_name_scatter.GetValue()) == 0 and not len(
                    self.y_name_scatter.GetValue()) == 0:
                self.fig_scatter = pca.blank_scatter_plot(
                    self.data, self.key_headers_scatter, limits,
                    self.fig_scatter, log_scales, self.colordict,
                    self.shapedict)
        self.scatter_plot = self.fig_scatter
        self.canvas3.draw()
        self.confirm_btn_scatter.SetLabelText("Update Graph")
        self.scatter_button.Enable(True)
def main():
    percentages = dict()
    for PC in range(1, 274):
        PCA.init(PC)
        img = cv2.imread("c1.jpg", cv2.IMREAD_GRAYSCALE)
        kp1, des1 = get_descriptors(img)

        img2 = cv2.imread("c2.jpg", cv2.IMREAD_GRAYSCALE)
        kp2, des2 = get_descriptors(img2)

        # Matching between descriptors
        bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        matches = sorted(bf.match(des1, des2),
                         key=lambda match: match.distance)
        # # Plot keypoints
        # img4 = cv2.drawKeypoints(img, kp1, outImage=None)
        # img5 = cv2.drawKeypoints(img2, kp2, outImage=None)
        # f, axarr = plt.subplots(1, 2)
        # axarr[0].imshow(img4)
        # axarr[1].imshow(img5)
        # # plt.show()
        # # Plot matches
        # img3 = cv2.drawMatches(img, kp1, img2, kp2, matches, flags=2, outImg=None)
        # plt.imshow(img3)
        # # plt.show()

        # Calculate score
        score = 0
        for match in matches:
            score += match.distance
        score_threshold = 33
        k = 100 - (score / len(matches))
        if score / len(matches) < score_threshold:
            print("PC " + str(PC) + ": Matches with " + str(k) + "%")
            percentages.update({PC: (k, "Yes")})
        else:
            print("PC " + str(PC) + ": No Match with " + str(k) + "%")
            percentages.update({PC: (k, "No")})

    pickle_out = open("percentages.pickle", "wb")
    pickle.dump(percentages, pickle_out)
    pickle_out.close()

    print('It took', time.time() - start, 'seconds.')
Example #29
def test_PCA():
    X = np.empty((100, 2))
    X[:, 0] = np.random.uniform(0., 100., size=100)
    X[:, 1] = 0.75 * X[:, 0] + 3. + np.random.normal(0, 10., size=100)
    pca = PCA(n_components=2)
    pca.fit(X)
    print(pca.components_)

    # dimensionality reduction: keep 1 component
    pca = PCA(n_components=1)
    pca.fit(X)
    X_reduction = pca.transform(X)
    print(X_reduction.shape)
    X_restore = pca.inverse_transform(X_reduction)
    print(X_restore.shape)

    plt.scatter(X[:, 0], X[:, 1], color='b')
    plt.scatter(X_restore[:, 0], X_restore[:, 1], color='r', alpha=0.5)
    plt.show()
Example #30
def RunTrainLDA(infile, pcaFile, ldaFile):

    import cPickle

    fp = open(infile, "r")
    dataset = cPickle.load(fp)
    subjID = cPickle.load(fp)
    fp.close()

    pca = PCA(dataset)
    pca_proj = pca.compute()

    np.save(pcaFile, pca_proj)

    lda_proj = []
    lda = LDA(dataset, subjID, pca_proj)
    projData = lda.projectData()
    lda_proj = lda.train(projData)

    np.save(ldaFile, lda_proj)
Example #31
def pca_and_call(features=all_features,
                 fn=using_distance_to_original,
                 dim=2,
                 k=-1):
    data = np.array([f[1] for f in features])
    # Note: this warps the variable data
    data_rescaled = PCA.PCA(data, dim)
    features = [(features[i][0], data_rescaled[i])
                for i in range(len(features))]
    if k > 0:
        return fn(features, k)
    return fn(features)
Example #32
def pcaOnMnist(training, dimension=700):
  mean, principalComponents = PCA.pca(training, dimension)
  low, same = PCA.reduce(principalComponents, training, mean, noSame=False)

  print "low[0].shape"
  print low[0].shape

  image2DInitial = vectorToImage(training[0], (28,28))
  print same[0].shape
  image2D = vectorToImage(same[0], (28,28))

  image2DLow = vectorToImage(low[0], (20,20))
  plt.imshow(image2DLow, cmap=plt.cm.gray)
  plt.show()


  plt.imshow(image2DInitial, cmap=plt.cm.gray)
  plt.show()
  plt.imshow(image2D, cmap=plt.cm.gray)
  plt.show()
  print "done"
  return low
Example #33
def pcaSklearn(training, dimension=700):
  pca = PCA(n_components=dimension)
  pca.fit(training)
  low = pca.transform(training)
  same = pca.inverse_transform(low)

  print "low[0].shape"
  print low[0].shape

  image2DInitial = vectorToImage(training[0], (28,28))
  print same[0].shape
  image2D = vectorToImage(same[0], (28,28))

  image2DLow = vectorToImage(low[0], (20,20))
  plt.imshow(image2DLow, cmap=plt.cm.gray)
  plt.show()


  plt.imshow(image2DInitial, cmap=plt.cm.gray)
  plt.show()
  plt.imshow(image2D, cmap=plt.cm.gray)
  plt.show()
  print "done"
  return low
Example #34
def do ( Obs_ij, run_dir ) :

    # PCA
    N_PCs, V_nj, U_in = PCA.do_PCA( Obs_ij, run_dir ) 

    print '# ---------------------------------------------------'
    print '# U_in'
    # print samples
    for ii in xrange( len( U_in ) ):
        for nn in xrange( len( U_in.T ) ):
            print U_in[ii][nn],
        print ''
    print ''

    # shrink wrap
    A_mn = shrinkwrap.do_shrinkwrap ( U_in, N_PCs, run_dir )
Example #35
def PlotXference_AVG():
    global EyeData
    global Events
    # Change these into arrays and prelocate size
    inference = []
    noference = []
    for idx in range(0, len(Events)):
        #       inference.append(FindSlices(EyeData[idx], Events[idx], 'Inference', trialTypes))
        #        noference.append(FindSlices(EyeData[idx], Events[idx], 'Noference', trialTypes))
        inference.append(FindSlices(EyeData[idx], Events[idx], "Inference", "typeB", 1))
        noference.append(FindSlices(EyeData[idx], Events[idx], "Noference", "typeA", 1))

    fig = plt.figure()
    fig.suptitle("Gaze X position")
    ax = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)

    for trial in inference:
        ax.plot(trial)
    ax.set_ylim(0, 2000)

    ax.set_ylabel("X coordinate of gaze position")
    ax.set_xlabel("Inference trials \n x time course in ms")
    for trial in noference:
        ax2.plot(trial)
    ax2.set_ylim(0, 2000)
    ax2.set_xlabel("No inference trials \n x time course in ms")

    ticks = ax.get_xticks() * 16
    ax.set_xticklabels(ticks.astype(int))
    ax2.set_xticklabels(ticks.astype(int))

    inf_cat = [1 for i in range(1, len(inference) + 1)]
    nof_cat = [0 for i in range(1, len(noference) + 1)]
    known_cat = np.hstack((np.array(inf_cat), np.array(nof_cat)))
    ferences = np.vstack((inference, noference))
    PlotAverage_X(np.array(inference), np.array(noference))
    #

    components = PCA.myPCA(ferences, known_cat)
    components = components * 1000
    LOG_REG.logReg(known_cat, components)
    # components_tmp = components *1000
    np.savetxt("eda_pcaResults.csv", np.hstack((known_cat.reshape(len(known_cat), 1), components)), delimiter=",")
Example #36
File: EXTRAS.py Project: zerozzl/MLStudy
def plotSecomPCA(filepath):
    dataMat = PCA.replaceNanWithMean(filepath)
    # below is a quick hack copied from pca.pca()
    meanVals = numpy.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # remove mean
    covMat = numpy.cov(meanRemoved, rowvar=0)
    eigVals, eigVects = numpy.linalg.eig(numpy.mat(covMat))
    eigValInd = numpy.argsort(eigVals)  # sort, sort goes smallest to largest
    eigValInd = eigValInd[::-1]  # reverse
    sortedEigVals = eigVals[eigValInd]
    total = sum(sortedEigVals)
    varPercentage = sortedEigVals / total * 100
    
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(range(1, 21), varPercentage[:20], marker='^')
    plt.xlabel('Principal Component Number')
    plt.ylabel('Percentage of Variance')
    plt.show()
Example #37
def getPC(coords, outFileName='ligBox.pdb'):
    size = coords.size
    shape = coords.shape
    if shape != (1, 3):
        if size != 0:
            eigenVectors, eigenValues = PCA.princomp(coords.T, numpc=3, getEigenValues=True)
            com = coords.mean(axis=0)
            projection = numpy.dot(coords - com, eigenVectors)
            signs = numpy.sign(numpy.sign(projection).sum(axis=0))
            signs2 = numpy.sign(projection[numpy.abs(projection).argmax(axis=0)].diagonal())
            signs[signs == 0] = signs2[signs == 0]
            eigenVectors = eigenVectors * signs
            vectors = com + eigenVectors.T * numpy.atleast_2d(numpy.sqrt(eigenValues)).T
        elif size == 0:
            com = numpy.zeros((3))
            vectors = numpy.zeros((3, 3))
    else:
        com = coords.flatten()
        vectors = numpy.zeros((3, 3))
    # pdbBoxWriter(com, vectors, outFileName)
    return com, vectors
import numpy as np
import sys
import PCA
import shrinkwrap

INFILE = 'data/raddata_2_norm'

#===================================================
if __name__ == "__main__":

    # input data
    Obs_ij = np.loadtxt(INFILE)
    n_slice = len(Obs_ij)

    # PCA
    N_PCs, V_nj, U_in = PCA.do_PCA( Obs_ij ) 

#    print 'Principal Components (V_lj) : '
#    print V_nj
#    print ''
#    print 'Coefficients (U_il) : '
#    print U_in
#    print ''
    print '# ---------------------------------------------------'
    print '# U_in'
    # print samples
    for ii in xrange( len( U_in ) ):
        for nn in xrange( len( U_in.T ) ):
            print U_in[ii][nn],
        print ''
    print ''
        print '\nERROR : Unknown slice type\n'
        sys.exit()

    # Determine initial values for fitting parameters
    if KNOWN_ANSWER :

        print 'Initial values from known answers'
        X0_albd_kj = np.loadtxt( ALBDFILE ).T
        X0_area_lk = np.loadtxt( AREAFILE )

    else:

        # PCA
        print 'Performing PCA...'
        n_pc, V_nj, U_in, M_j = PCA.do_PCA( Obs_ij, E_cutoff=1e-2, run_dir=run_dir )
        n_type = n_pc + 1

        # shrinkwrap
        print 'Performing shrink-wrapping...'
        # N ( = n_PC ): number of principle components
        # M ( = n_PC + 1 ) : number of vertices
        A_mn, P_im   = shrinkwrap.do_shrinkwrap( U_in, n_pc, run_dir=run_dir )
        X0_albd_kj   = np.dot( A_mn, V_nj )
        X0_albd_kj   = X0_albd_kj + M_j
        if ( SLICE_TYPE=='time' ) :
            X0_area_lk   = P_im
        else :
            X0_area_lk = np.ones( n_slice*n_type ).reshape([n_slice, n_type])/(n_type*1.0)

    # Save initial conditions
if __name__ == "__main__":


    # Load input data
    Obs_ij = np.loadtxt( INFILE_DIR + INFILE )
    Time_i  = np.arange( len( Obs_ij ) ) / ( 1.0 * len( Obs_ij ) )
    n_band = len( Obs_ij.T )

    # Initialization of Kernel
    print 'Decomposition into time slices...'
    n_slice = len( Time_i )
    Kernel_il = np.identity( n_slice )

    # PCA
    print 'Performing PCA...'
    n_pc, V_nj, U_in, M_j = PCA.do_PCA( Obs_ij, E_cutoff=1e-2, output=False, run_dir=OUTFILE_DIR )
    V_nj[0] = -1. * V_nj[0]
    U_in.T[0] = -1. * U_in.T[0]
    V_nj[1] = -1. * V_nj[1]
    U_in.T[1] = -1. * U_in.T[1]
    n_type = n_pc + 1
    if n_type != 3 :
        print 'ERROR: This code is only applicable for 3 surface types!'
        sys.exit()

    U_iq = np.c_[ U_in, np.ones( len( U_in ) ) ]

    PC1_limit = [XMIN,XMAX] # manually set for now
    PC2_limit = [YMIN,YMAX] # manually set for now

    points_kn_list     = []
Example #41
    data=np.vstack((x,y))

    #mean, eigenvectors = cv2.PCACompute(npc, np.mean(npc, axis=0).reshape(1,-1))
    #mlab_pca = mlabPCA(data.T)
    sklearn_pca = PCA(n_components=2)
    incPCA.append(sklearn_pca.fit_transform(data.T))'''


#print(incPCA[0].components_)

#join all examples of the first incisor into one matrix where each row concatenates the landmark coordinate vectors (x, y) of one example; different examples in different rows, 14x80
data1=np.append(Persons[0].Incisors[0].normXY[:,0],Persons[0].Incisors[0].normXY[:,1])
for p in range(1,14):
    data1=np.vstack((data1,np.append(Persons[p].Incisors[0].normXY[:,0],Persons[p].Incisors[0].normXY[:,1])))

eigenvalues, eigenvectors, mu=pca.pcaD(data1,3)


tEVectors=np.array((eigenvectors[0:40,:],eigenvectors[40:80,:])).T

ty=np.array((mu[0:40],mu[40:80])).T

'''
#join all examples of the first incisor into a single x,y matrix, 560x2
x=np.array([])
y=np.array([])
for p in range(0,14):
    a=Persons[p].Incisors[0].normXY
    x=np.append(x,(a[:,0]))
    y=np.append(y,a[:,1])
data=np.vstack((x,y)).T
Example #42

#Part One: Load Example Dataset
print 'One: ======== Load Example Dataset1 ... '
plt.plot(X[:,0],X[:,1],'bo')
plt.axis(xmin=0.5,xmax=6.5,ymin=2,ymax=8)
plt.title('Example Dataset1')


#Part Two: Principal Component Analysis
print 'Two: ================ Running PCA on example dataset...'
result=FN.featureNormalize(X)

X_norm=result[0]
mu=result[1]
res=PCA.pca(X_norm)
U=res[0]
S=res[1]
S=np.eye(S.shape[0])*S


print 'Top eigenvector: '
print 'U[:,0] = %f %f ' % (U[0,0],U[1,0])
print '(You should expect to see -0.707107, -0.707107)'

tmp1=mu+1.5*np.dot(S[0,0],U[:,0].transpose())
tmp2=mu+1.5*np.dot(S[1,1],U[:,1].transpose())

DL.drawLine(mu,tmp1,color='k',linewidth=2)
DL.drawLine(mu,tmp2,color='b',linewidth=2)
plt.show()
Example #43
File: 3dManifold.py Project: philipz1/ML
		fig.savefig(file_name)
	if display == True:
		plt.show()
	plt.clf()

def graph2d(data, display = True, file_name = None, verbose = True):
	fig, ax = plt.subplots()

	for code in np.unique(data[:,2]):
		x, y = zip(*data[data[:,2] == code][:,0:2])
		ax.scatter(x, y, c = color_convert(code), marker = 'o')

	if file_name != None:
		fig.savefig(file_name)
	if display == True:
		plt.show()
	plt.clf()

'''
Examples, in order, 3d plot of data, PCA, Isomap, LLE, LapEig
'''
# graph3d(np.column_stack((npdata, color_code)))
graph2d(np.column_stack((PCA.pca(npdata, dim = 2), color_code)), False, 'PCA')
graph2d(np.column_stack((Isomap.isomap(npdata, load = 'C.npy'), color_code)), False, 'Isomap')
graph2d(np.column_stack((LLE.lle(npdata), color_code)), False, 'LLE')
graph2d(np.column_stack((LaplacianEigenmap.le(npdata), color_code)), False, 'LaplacianEigenmap')

#Just a sanity check
# from sklearn import manifold
# x = manifold.SpectralEmbedding().fit_transform(X= npdata)
# graph2d(np.column_stack((x, color_code)))
Example #44
import PCA
from numpy import *

def loadData (fileAddress) :
	"""
	"""
	file = open (fileAddress).readlines ()
	data = []
	for line in file :
		data.append (map (float , line.strip().split()))
	return mat (data)

if __name__ == '__main__' :
	dataSet = loadData ('testSet.txt')
	lowDimData , newData = PCA.pca (dataSet , 1)
	PCA.showPca (dataSet , lowDimData , newData)
Example #45

if 0:
    #
    # PCAT magic: Lifting the following from GMM.py in PCAT
    #
    import PCA, GMM

    score_matrix, principal_components, means, stds, eigenvalues = \
            PCA.PCA(catalogue, components_number=10)

    principal_components_number=10

    reduced_score_matrix = score_matrix[:,:principal_components_number]

    mat, tmp, tmp1 = PCA.matrix_whiten(reduced_score_matrix, std=True)

    #labels = GMM.gaussian_mixture(mat,upper_bound=5)
    labels = GMM.gaussian_mixture(reduced_score_matrix,upper_bound=5)

    colored_clusters = GMM.color_clusters( score_matrix, labels )

    GMM.print_cluster_info(colored_clusters)


#sys.exit()
#
# PCA
#
#H = np.matrix(waveform_catalogue)
H = np.matrix(catalogue)
Example #46
	'robberies', 'robbbPerPop',
	'assaults', 'assaultPerPop',
	'burglaries', 'burglPerPop',
	'larcenies', 'larcPerPop',
#	'autoTheft', 'autoTheftPerPop',
	'arsons', 'arsonsPerPop',
	'violentPerPop',
	'nonViolPerPop',
]






from DataSet import *
from PCA import *

dataset = DataSet(data, names, drop_columns=drop_columns, fix_missing=FixMissing.DROPATTRIBUTES, rescale=Rescale.NORMALIZE)
print(dataset.X)

print(dataset.X.iloc[5,10])

pca = PCA(dataset)
pca.plot_rho()
pca.show()
plt.show()

print("\n\nstd:",   dataset.X.std())
print("\n\nmean:",  dataset.X.mean())
print("\n\nrange:", dataset.X.max()-dataset.X.min())
Example #47
import PCA
import vectorizeFiles as VF
import getFileNames as gf
import matplotlib.pyplot as plot
import numpy as np


# from feature_extractor import FeatureExtractor


# fe = FeatureExtractor(1)
# featurized = fe.featurizeFiles('../data')
# classNames, repubAndDemMatrix, labels = featurized[:3]
[repubAndDemMatrix,vectorizerRepubDem,labels]=VF.extractWordCounts(True,True,False)
k = 3
files=gf.getFileNames()
transformed = PCA.getPCAMat(repubAndDemMatrix, k)
repub=np.array([list(x) for i,x in enumerate(transformed) if labels[i]==1])
dem=np.array([list(x) for i,x in enumerate(transformed) if labels[i]==0])
plot.figure()
plot.scatter(repub[:,0],repub[:,1],c='r',marker='x')
plot.scatter(dem[:,0],dem[:,1],c='b',marker='x')
##plot.annotate(s=files[0],xy=transformed[0])
plot.savefig('results/images/VFPCA.png')
# plot.savefig('results/images/PCA.png')

'''
transformedWords=PCA.getPCAMat(repubAndDemMatrix.T, k)
vocab=vectorizerRepubDem.vocabulary_
indicesOfInterest=[]
f=open('wordsInterest.txt','r')
wordsOfInterest=[line.split()[0] for line in f]
Example #48
import PCA
import EXTRAS
import numpy

dataMat = PCA.loadDataSet("E:/TestDatas/MachineLearningInAction/Ch13/testSet.txt")
lowDMat, reconMat = PCA.pca(dataMat, 1)
PCA.plotPCA(dataMat, reconMat)

"""
dataMat = PCA.replaceNanWithMean("E:/TestDatas/MachineLearningInAction/Ch13/secom.data")
meanVals = numpy.mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals
covMat = numpy.cov(meanRemoved, rowvar=0)
eigVals, eigVects = numpy.linalg.eig(numpy.mat(covMat))
print eigVals
"""

# EXTRAS.plotTestSet("E:/TestDatas/MachineLearningInAction/Ch13/testSet.txt")
# EXTRAS.plotTestSet3("E:/TestDatas/MachineLearningInAction/Ch13/testSet3.txt")
# EXTRAS.plotSecomPCA("E:/TestDatas/MachineLearningInAction/Ch13/secom.data")
Example #49
File: main.py Project: Wummer/MLExam
SGMean = PCA.MLmean(SGNorm)
SGCov = PCA.MLcov(SGNorm,SGMean)
eigw,eigv = np.linalg.eig(SGCov)


""" Python doesn't return an ordered list of eigenvalues/eigenvectors 
	so we join them and sort them in descending order.
	Then we substract the 2 highest eigenvectors/principal components """
SGVectors = []
for i in range(len(eigw)):
	SGVectors.append((eigw[i],eigv[:,i]))
SGVectors = sorted(SGVectors, reverse=True, key=lambda tup: tup[0])
SGPC = [SGVectors[0][1],SGVectors[1][1]]

#Projection via dot product
new_SGX,new_SGY = PCA.transform(SGNorm,SGPC)

# Plotting the eigenspectrum
plt.plot(range(1,len(eigw)+1),eigw,'r-')
plt.xlabel('Eigenvector number')
plt.ylabel('Eigenvalue')
plt.title('Eigenspectrum')
plt.show()

#Plotting the projection onto the first 2 Principal Components
plt.plot(new_SGX,new_SGY,"x")
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title("The SGdata projected onto Principal Components")
plt.show()
    # Load input data
    Obs_ij = np.loadtxt( INFILE_DIR + INFILE )
    Time_i  = np.arange( len( Obs_ij ) ) / ( 1.0 * len( Obs_ij ) )
    n_band = len( Obs_ij.T )

    # Initialization of Kernel
    print 'Decomposition into time slices...'
    n_slice = len( Time_i )
    Kernel_il = geometry.kernel( Time_i, n_slice, N_SIDE, GEOM )
    print 'Kernel_il', Kernel_il
    Kernel_il[ np.where( Kernel_il < 1e-3 ) ] = 0.
    print 'Kernel_il', Kernel_il

    # PCA
    print 'Performing PCA...'
    n_pc, V_nj, U_in, M_j = PCA.do_PCA( Obs_ij, E_cutoff=1e-2, output=True )

#    V_nj[0] = -1. * V_nj[0]
#    U_in.T[0] = -1. * U_in.T[0]
#    V_nj[1] = -1. * V_nj[1]
#    U_in.T[1] = -1. * U_in.T[1]

    n_type = n_pc + 1
    if n_type != 3 :
        print 'ERROR: This code is only applicable for 3 surface types!'
        sys.exit()

    U_iq = np.c_[ U_in, np.ones( len( U_in ) ) ]

    PC1_limit = [-0.4, 0.2] # manually set for now
    PC2_limit = [-0.1, 0.4] # manually set for now
Example #51
# Ignore the new feature as it messes up PCA
data_dict = pickle.load(open("data/own_data_dict.pkl", "r"))

features_list = getallFeatures(data_dict)
data = featureFormat(data_dict, features_list, sort_keys = True)

# Scale features:
mins = np.min(data, axis=0)
maxs = np.max(data, axis=0)
data = (data-mins)/(maxs-mins)

labels, features = targetFeatureSplit(data)

features_train, features_test, labels_train, labels_test = \
    stratifiedShuffleSplit(features, labels)

### Do some PCA
pca = PCA.doPCA(features_train, n = 4)
transformed_train = pca.transform(features_train)

# Do some hyperparam validation:
best_svc, svc_grid_scores = ClassifySVM.gridsearch(
    transformed_train, labels_train
)

svmfit = ClassifySVM.train(transformed_train, labels_train, best_svc)

test_classifier(svmfit, data)

dump_classifier_and_data(svmfit, data_dict, features_list)