Example #1
    def pca_visualize(self, save_dir):
        assert(save_dir.endswith('/'))
        util.make_sure_path_exists(self.DATA_VIZ_DIR + save_dir)
        save_name = self.name.replace(' ', '_').lower()

        errors = {'train':[], 'val':[], 'test':[]}
        xdata = [self.get_train_x(), self.get_val_x(), self.get_test_x()]

        cum_variances = []
        indiv_variances = []
        x = range(1, min(self.get_num_features(), 101)) # only do first 100 feats
        for i in x:
            model = pca.PCA(n_components=i)  # keep the first i components
            model.fit(self.get_train_x())

            for key, xset in zip(errors.keys(), xdata):
                recon = model.inverse_transform(model.transform(xset))
                errors[key].append(util.mse(xset, recon))

            indiv_variances.append(model.explained_variance_ratio_[-1])
            cum_variances.append(np.sum(model.explained_variance_ratio_))

        f, axarr = plt.subplots(2, sharex=True, figsize=(16, 9.6))
        axarr[0].plot(x, errors['train'], 'b:', label=u'Train Reconstruction MSE')
        axarr[0].plot(x, errors['val'], 'g:', label=u'Val Reconstruction MSE')
        axarr[0].plot(x, errors['test'], 'r:', label=u'Test Reconstruction MSE')
        axarr[1].plot(x, indiv_variances, 'b--', label=u'Variance Explained by Each Component')
        axarr[1].plot(x, 1 - np.array(cum_variances), 'b:', label=u'1 Minus Cumulative Explained Variance')
        axarr[1].set_xlabel(u'Dimensionality of PCA Representation')
        axarr[0].set_title(u'Analysis of PCA over Error and Variance')
        axarr[0].set_ylabel(u'MSE')
        axarr[1].set_ylabel(u'Proportion of Explained Variance')
        axarr[0].legend()
        axarr[1].legend()
        f.savefig(self.DATA_VIZ_DIR + save_dir + save_name + '_pca.png', bbox_inches='tight')
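The pattern above can be checked on synthetic data. A minimal standalone sketch (random data; none of the names below come from the class above): as the number of retained components k grows, the reconstruction MSE from inverse_transform falls while the cumulative explained variance rises.

# Illustrative sketch, assuming only numpy and scikit-learn.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 20))
for k in (1, 5, 10, 20):
    model = PCA(n_components=k).fit(X)
    recon = model.inverse_transform(model.transform(X))
    mse = np.mean((X - recon) ** 2)
    print(k, mse, model.explained_variance_ratio_.sum())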
Example #2
 def plot_mixture_pca(self, matrix_dir: str):
     """PCA of mixture samples and the tissues used to generate them"""
     os.listdir(matrix_dir)  # unused return value; raises early if matrix_dir does not exist
     f, ax = plt.subplots(5, 2, figsize=(8, 4 * 5))
     ax = ax.flatten()
     for i, matrix in enumerate(os.listdir(matrix_dir)):
         t1, t2 = os.path.splitext(matrix)[0].split("-")
         mix_df = pd.read_hdf(os.path.join(matrix_dir, matrix))
         mix_df["tissue"] = "Mixture"
         sub = self.df[self.df.tissue.isin([t1, t2])]
         pca_df = pd.concat([sub, mix_df]).dropna(axis=1)
         embedding = pca.PCA(n_components=2).fit_transform(
             pca_df[self.genes])
         embedding = pd.DataFrame(embedding)
         embedding.columns = ["PCA1", "PCA2"]
         embedding["tissue"] = list(pca_df["tissue"])
         sns.scatterplot(
             data=embedding,
             x="PCA1",
             y="PCA2",
             hue="tissue",
             style="tissue",
             ax=ax[i],
         )
         ax[i].set_title(f"{t1}-{t2}")
     plt.tight_layout()
     return ax
Example #3
def plot_pca(dat,
             colour_subgroups,
             p=None,
             components=(0, 1),
             marker_subgroups=None,
             ax=None,
             colour_map=None,
             marker_map=None,
             **kwargs):
    if p is None:
        p = pca.PCA()
        pca_data = p.fit_transform(dat.transpose())
    else:
        pca_data = p.transform(dat.transpose())
    variance_explained = p.explained_variance_ratio_ * 100.

    ax = scatter.scatter_with_colour_and_markers(
        pca_data[:, components],
        colour_subgroups=colour_subgroups,
        colour_map=colour_map,
        marker_subgroups=marker_subgroups,
        marker_map=marker_map,
        ax=ax,
        **kwargs)

    ax.set_xlabel("PCA component %s (%.1f%%)" %
                  (components[0] + 1, variance_explained[components[0]]))
    ax.set_ylabel("PCA component %s (%.1f%%)" %
                  (components[1] + 1, variance_explained[components[1]]))

    return p, ax
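The helper above labels each axis with the percentage of variance explained by that component. A self-contained sketch of just that labelling step on random data (scatter.scatter_with_colour_and_markers is not assumed here):

# Minimal sketch of the axis-labelling idea in plot_pca.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(1).normal(size=(100, 6))
p = PCA()
scores = p.fit_transform(X)
var_pct = p.explained_variance_ratio_ * 100.

fig, ax = plt.subplots()
ax.scatter(scores[:, 0], scores[:, 1])
ax.set_xlabel("PCA component 1 (%.1f%%)" % var_pct[0])
ax.set_ylabel("PCA component 2 (%.1f%%)" % var_pct[1])
plt.show()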
Example #4
def PCA_2d_example():
    '''Load the data and plot it'''
    data = spio.loadmat('data/data.mat')
    X = data['X']
    plt = plot_data_2d(X,'bo')
    plt.axis('square')
    plt.title('original data')
    plt.show()
    '''Normalize the data and plot it'''
    scaler = StandardScaler()
    scaler.fit(X)
    x_train = scaler.transform(X)
    
    plot_data_2d(x_train, 'bo')
    plt.axis('square')
    plt.title('scaled data')
    plt.show()
    
    '''Fit the data'''
    K = 1  # target (reduced) dimensionality
    model = pca.PCA(n_components=K).fit(x_train)   # fit; n_components sets the reduced dimensionality
    Z = model.transform(x_train)    # transform performs the actual reduction

    '''Recover the data and plot it'''
    Ureduce = model.components_     # principal directions used for the reduction
    x_rec = np.dot(Z, Ureduce)      # reconstruct the data
    
    plot_data_2d(x_rec,'bo')
    plt.plot()
    plt.axis('square')
    plt.title('recover data')
    plt.show()
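The reconstruction step np.dot(Z, model.components_) works here because the data was standardized first (zero mean); in general, sklearn's inverse_transform also adds back model.mean_. A quick sketch verifying that equivalence on random data (illustrative only):

# Sketch: Z @ components_ + mean_ matches PCA.inverse_transform(Z) when whiten=False.
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(2).normal(size=(50, 3))
model = PCA(n_components=2).fit(X)
Z = model.transform(X)
manual = np.dot(Z, model.components_) + model.mean_
assert np.allclose(manual, model.inverse_transform(Z))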
Example #5
def main():
    iris = load_iris()
    data = scale(iris.data)
    n_samples, n_features = data.shape
    reduced_data = pca.PCA(n_components=2).fit_transform(data)
    c, centroids = mykmeans(reduced_data, 3, 50)
    plotResult(c, centroids, reduced_data, nNumCluster=3)
Example #6
    def svm_proba(train_xx6, train_y, test_xx6):
        selects = []
        for j, y in enumerate(train_y):
            if y == 1:
                selects.extend(j - e for e in [-2, -1, 0, 1, 2])
        train_xx6 = train_xx6[selects]
        train_xx6 = train_xx6[:, :, 40:80]
        test_xx6 = test_xx6[:, :, 40:80]
        train_y = train_y[selects]
        train_y[train_y != 1] = 0

        # s = train_xx6.shape
        # train_xx6 = train_xx6.reshape((s[0], s[1] * s[2]))
        # s = test_xx6.shape
        # test_xx6 = test_xx6.reshape((s[0], s[1] * s[2]))

        # return test_xx6

        clf = make_pipeline(
            Vectorizer(), StandardScaler(), pca.PCA(n_components=.95),
            svm.SVC(gamma='scale',
                    kernel='rbf',
                    class_weight={
                        0: 1,
                        1: 2
                    },
                    probability=True))
        clf.fit(train_xx6, train_y)
        return clf.predict_proba(test_xx6)
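A simpler, self-contained version of the same pipeline pattern, fit on a bundled toy dataset (the mne Vectorizer step and the epoch slicing above are omitted; the dataset and split are illustrative):

# Illustrative pipeline: scale -> PCA keeping ~95% of the variance -> probabilistic SVM.
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = load_breast_cancer(return_X_y=True)
clf = make_pipeline(StandardScaler(),
                    PCA(n_components=0.95),
                    SVC(kernel='rbf', gamma='scale', probability=True))
clf.fit(X[:400], y[:400])
print(clf.predict_proba(X[400:405]))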
Example #7
def PCA_Data():
    train_data = pd.read_csv('data/zhengqi_train.txt', sep='\t')
    train_data = np.array(train_data)
    value = train_data[:, 0:-1]
    pcas = pca.PCA(n_components=0.95)  # 0.95
    pcas.fit(value)
    pca_value = pcas.transform(value)
    return pcas, len(pca_value[0])
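When n_components is a float in (0, 1), PCA keeps the smallest number of components whose cumulative explained variance reaches that fraction; after fitting, that count is available as n_components_, which is what len(pca_value[0]) recovers above. A quick check on random data (illustrative):

# Sketch: a float n_components keeps just enough components for that variance share.
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(3).normal(size=(300, 40))
p = PCA(n_components=0.95).fit(X)
print(p.n_components_, p.explained_variance_ratio_.sum())  # kept count, and a value >= 0.95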
Example #8
def do_pca(k=20):
    vec = []
    for word in words.index:
        vec.append(find_word_vec(word))
    pca_model = pca.PCA(n_components=k).fit(vec)
    Z = pca_model.transform(vec)
    Z = pd.DataFrame(Z, index=words.index)
    return Z
Example #9
def loadDataSet():
    '''Load the iris data and project it onto two PCA components.'''
    data = dataSet.load_iris()
    dataset = data.data
    target = data.target
    PCA = pca.PCA(n_components=2)
    dataset = PCA.fit_transform(dataset)
    return np.mat(dataset), np.mat(target)
Example #10
def read_data():
    train_data = pd.read_csv('data/zhengqi_train.txt', sep='\t')
    train_data = np.array(train_data)
    value = train_data[:, 0:-1]
    target = train_data[:, -1:]
    pcas = pca.PCA(n_components=0.95)  # 0.95
    pcas.fit(value)
    X_pca = pcas.transform(value)
    return X_pca, target
Example #11
def visulizeCentroid(centroid):
    PCA = pca.PCA(n_components=2).fit(centroid)
    embedding = PCA.transform(centroid)
    for cls in range(num_classes):
        plt.scatter(embedding[cls][0],
                    embedding[cls][1],
                    marker='x',
                    color=colors[cls],
                    s=12)
    plt.savefig('icnn_centroid.png')
Example #12
def decom_pca(data,dim):
    # scaler = StandardScaler()
    # scaler.fit(data)
    # data = scaler.transform(data)

    model = pca.PCA(n_components=dim).fit(data)
    data_trans = model.transform(data)

    print(data_trans.shape)
    return data_trans
Example #13
    def __init__(self, provider, num_principle_components):
        """
        This class wraps a data provider and performs dimensionality reduction using PCA.
        """
        self._provider = provider
        self._pca_transform = pca.PCA(n_components=num_principle_components, svd_solver="full")
        self._pca_transform.fit(provider.get_training_data())

        self._X = self._pca_transform.transform(provider.get_training_data())

        return
Example #14
def fit_PCA(C, n_components=5, **kwargs):
    mdl = skpca.PCA(n_components=n_components, **kwargs)
    M = mdl.fit_transform(C)

    resDict = {
        'model': mdl,
        'train_data': C,
        'trans_data': M,
    }

    return pyutil.util_obj(**resDict)
Example #15
    def feature_extraction(self):
        mat = list()
        for t in self.articles:
            mat.append(t.tfidf)

        # Feature extraction using PCA
        if self.PCA_feature_extraction:
            pca1 = pca.PCA(n_components=len(mat))
            pca1.fit(mat)
            mat = pca1.components_
            mat = [x[0:10] for x in mat]

        return mat
Example #16
def nReadAll():
    from sklearn.decomposition import pca
    train_df = pd.read_csv('../Data/zhengqi_train.txt', sep='\t')
    train_x, train_y = train_df.iloc[:, 0:38], train_df.iloc[:, [38]]
    test_x = pd.read_csv('../Data/zhengqi_test.txt', sep='\t')
    x = train_x.append(test_x)

    # PCA preprocessing
    pca = pca.PCA(n_components=0.95)  # 0.95
    pca.fit(x)
    x_pca = pca.transform(x)
    ntrain_x, ntest_x, ntrain_y = x_pca[:2888, :], x_pca[2888:, :], train_y

    return ntrain_x, ntest_x, ntrain_y
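Two details worth noting in this example: the name pca is rebound from the module to a fitted PCA instance, and DataFrame.append is deprecated in recent pandas (removed in 2.0). A hedged variant of the preprocessing that avoids both, keeping the file paths and the 2888-row split from the original:

# Variant sketch: import the PCA class directly and use pd.concat instead of append.
import pandas as pd
from sklearn.decomposition import PCA

train_df = pd.read_csv('../Data/zhengqi_train.txt', sep='\t')
test_x = pd.read_csv('../Data/zhengqi_test.txt', sep='\t')
train_x, train_y = train_df.iloc[:, 0:38], train_df.iloc[:, [38]]
x = pd.concat([train_x, test_x], ignore_index=True)

reducer = PCA(n_components=0.95).fit(x)
x_pca = reducer.transform(x)
ntrain_x, ntest_x, ntrain_y = x_pca[:2888, :], x_pca[2888:, :], train_y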
Example #17
def get_hog_features():
    data = sio.loadmat(
        '/home/hardik/Desktop/MTech_Project/Data/HOG_Feature_Data/natural_movies_hog_features.mat'
    )
    features = data['hog_features']
    labels = data['hog_labels'].flatten()
    labels = sess.run(tf.one_hot(labels, depth=4))
    print(labels.shape)

    # Normalizing The Data
    from sklearn.preprocessing import Normalizer
    normalized_features = Normalizer().fit(features).transform(features)

    # Dimension Reduction
    pcaModel = pca.PCA(n_components=50)
    pca_features = pcaModel.fit_transform(normalized_features)

    return {'features': pca_features, 'labels': labels}
Example #18
def pre_xdata(xdata, k=1):
    scaler = StandardScaler()
    print('xdata', xdata.shape)
    xdata = np.squeeze(xdata)
    print('xdata', xdata.shape)
    scaler.fit(xdata)
    x_train = scaler.transform(xdata)
    model = pca.PCA(n_components=k).fit(x_train)   # fit; n_components sets the reduced dimensionality
#     print('model', model)
    Z = model.transform(x_train)    # transform performs the actual reduction
    print('Z', Z.shape)
#     Ureduce = model.components_     # principal directions used for the reduction
#     print('Ureduce', Ureduce.shape)
#     x_rec = np.dot(Z,Ureduce)       # reconstruct the data
#     print('x_rec', x_rec.shape)
    Z = Z[:, np.newaxis,:]
    print('Z', Z.shape)
    return Z
Example #19
    def PCA_face_example(self):
        '''Load the data and display it'''
        image_data = spio.loadmat('data_faces.mat')
        X = image_data['X']
        self.display_imageData(X[0:100, :])  # display the first 100 images
        '''Normalize the data'''
        scaler = StandardScaler()
        scaler.fit(X)
        x_train = scaler.transform(X)
        '''Fit the model'''
        K = 100
        model = pca.PCA(n_components=K).fit(x_train)
        Z = model.transform(x_train)
        Ureduce = model.components_

        self.display_imageData(Ureduce[0:36, :])  # visualize some of the principal directions (U)
        x_rec = np.dot(Z, Ureduce)

        self.display_imageData(x_rec[0:100, :])  # display the recovered data
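The rows of model.components_ are the principal directions (here, "eigenfaces"), which is why slices of Ureduce can be displayed as images. A self-contained sketch of the same reduce-and-reconstruct flow on sklearn's bundled digits images, since data_faces.mat is not assumed to be available:

# Sketch on load_digits instead of data_faces.mat; shapes are printed rather than displayed.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = load_digits().data                       # (1797, 64) flattened 8x8 images
x_train = StandardScaler().fit_transform(X)
model = PCA(n_components=16).fit(x_train)
Z = model.transform(x_train)                 # reduced representation
x_rec = np.dot(Z, model.components_)         # reconstruction in standardized space
print(Z.shape, x_rec.shape)                  # (1797, 16) (1797, 64)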
Example #20
    def plot_pca_nearby_tissues(self, matrix_path: str, tissues):
        st_df = pd.read_hdf(matrix_path)
        label = os.path.splitext(os.path.basename(matrix_path))[0]
        st_df["tissue"] = "Mixture"
        sub = self.df[self.df.tissue.isin(tissues)]
        pca_df = pd.concat([st_df, sub]).dropna(axis=1)
        embedding = pca.PCA(n_components=2).fit_transform(pca_df[self.genes])
        embedding = pd.DataFrame(embedding)
        embedding.columns = ["PCA1", "PCA2"]
        embedding["tissue"] = list(pca_df["tissue"])

        f, ax = plt.subplots(figsize=(8, 8))
        sns.scatterplot(data=embedding,
                        x="PCA1",
                        y="PCA2",
                        hue="tissue",
                        style="tissue")
        plt.title(f"PCA of {label} Mixtures and Nearby Tissues")
        return ax
Example #21
    def visualize_data(self, save_dir):
        assert(save_dir.endswith('/'))
        util.make_sure_path_exists(self.DATA_VIZ_DIR + save_dir)

        compressor = pca.PCA(n_components=2)
        compressor.fit(self.get_train_x())
        xtr = compressor.transform(self.get_train_x())

        minx1 = min(xtr[:,0])
        maxx1 = max(xtr[:,0])
        minx2 = min(xtr[:,1])
        maxx2 = max(xtr[:,1])

        # give 10% leeway for display
        prop = 0.10
        axes = [minx1 - (prop * abs(minx1)), maxx1 + (prop * maxx1),
                minx2 - (prop * abs(minx2)), maxx2 + (prop * maxx2)]

        for x, y, set_name in self.iter_all_data():
            self._visualize(compressor.transform(x), y, set_name, axes, save_dir)
Example #22
 def whitting(self, X, model=0):
     """
     PCA-based whitening, nearly the same as in the original code:
     use sklearn's PCA to reduce any dimensionality larger than the input channel count.
     :param X: the received audio matrix, shape = [f, t, n_channels]
     :param model: option selecting sklearn PCA whitening (0) or explicit whitening (1)
     :return: data reduced to the channel count from the JSON config, shape = [f, t, num_source]
     """
     if not model:
         local_pca = pca.PCA(n_components=self.n_source,
                             svd_solver="full",
                             whiten=True)
         res = []
         angle = np.angle(X)
         X = np.abs(X)
         for i in range(X.shape[1]):
             res.append(local_pca.fit_transform(X[:, i, :]))
         res = np.asarray(res).astype(complex)
         res = np.transpose(res, [1, 0, 2])
         res *= np.exp(1j * angle)
         return res
     elif model == 1:
         dnum = self.n_source
         [I, J, M] = X.shape
         Y = np.zeros([I, J, self.n_source], dtype=complex)
         for i in range(I):
             Xi = np.squeeze(X[i, :, :]).T
             V = np.matmul(Xi, Xi.T.conj()) / J
             [D, P] = np.linalg.eig(V)
             idx = np.argsort(D)
             D = D[idx]
             P = P[:, idx]
             D = np.diag(D)
             D2 = D[M - dnum:M, M - dnum:M]
             P2 = P[:, M - dnum:M]
             D2_ = np.linalg.pinv(np.sqrt(D2))
             Y[i, :, :] = np.matmul(np.matmul(D2_, P2.conj().T), Xi).T
         return Y
     else:
         raise ValueError("model should be 0 or 1")
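With whiten=True, the first branch relies on each returned component having (approximately) unit variance and being uncorrelated with the others. A quick check on random data (illustrative):

# Sketch: whiten=True rescales each kept component to roughly unit variance.
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(4).normal(size=(500, 8)) * np.arange(1, 9)
Z = PCA(n_components=3, svd_solver="full", whiten=True).fit_transform(X)
print(Z.var(axis=0))  # all close to 1.0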
Example #23
 def train_and_save_svm_model(self, paths, _labels, model_filename):
     svm_clf = SVC(C=1e3)
     vectors = []
     labels = []
     for i, (path, label) in enumerate(zip(paths, _labels)):
         if i % 10 == 1:
             print('SVM Train Parsing:', i)
         _, faces = self.read_image(path)
         if len(faces) == 0:
             print('Face Align Failed:', path)
             continue
         _, landmarks = faces[0]
         vectors.append(self.get_distance_vector(landmarks))
         labels.append(label)
     pca_model = None
     if USE_PCA:
         pca_model = pca.PCA(n_components=PCA_DIMENSIONS)
         pca_model.fit(vectors)
         vectors = pca_model.transform(vectors)
     svm_clf.fit(vectors, labels)
     with open(model_filename, 'wb') as model:
         pickle.dump({"svm_clf": svm_clf, "pca_model": pca_model}, model)
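At prediction time the stored pca_model must be applied before svm_clf, mirroring the order used during training. A hypothetical loading counterpart (the function name is not from the original; model_filename and the pickled keys are):

# Hypothetical counterpart to train_and_save_svm_model: load and apply both models.
import pickle

def load_and_predict(model_filename, feature_vectors):
    with open(model_filename, 'rb') as f:
        saved = pickle.load(f)
    svm_clf, pca_model = saved["svm_clf"], saved["pca_model"]
    if pca_model is not None:
        feature_vectors = pca_model.transform(feature_vectors)
    return svm_clf.predict(feature_vectors)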
Example #24
    def plot_pca_nearby_tissues(self, background_path: str, tissues,
                                tumor_tissue):
        df = pd.read_hdf(background_path)
        tumor = self.tumor
        tumor = tumor[tumor.tissue == tumor_tissue]
        tumor["tissue"] = f"{tumor_tissue}-Tumor"

        sub = df[df.tissue.isin(tissues)]
        pca_df = pd.concat([tumor, sub]).dropna(axis=1)
        embedding = pca.PCA(n_components=2).fit_transform(pca_df[self.genes])
        embedding = pd.DataFrame(embedding)
        embedding.columns = ["PCA1", "PCA2"]
        embedding["tissue"] = list(pca_df["tissue"])

        f, ax = plt.subplots(figsize=(8, 8))
        sns.scatterplot(data=embedding,
                        x="PCA1",
                        y="PCA2",
                        hue="tissue",
                        style="tissue")
        plt.title(f"PCA of {tumor_tissue} and Nearby GTEx Tissues")
        return ax
Example #25
    # cell_line[cell_line.index.str.contains('p62')] = 'ICb1299'
    # meta.insert(0, 'cell_line', cell_line)

    anno = loader.load_illumina_methylationepic_annotation()

    me_data = obj.data.dropna()
    me_data = process.m_from_beta(me_data)

    # reduce anno and data down to common probes
    common_probes = anno.index.intersection(me_data.index)

    anno = anno.loc[common_probes]

    # plot PCA

    p = pca.PCA()
    pca_dat = p.fit_transform(me_data.transpose())

    fig = plt.figure()
    ax = fig.add_subplot(111)

    marker_groups = meta.cell_line
    marker_dict = dict([
        ('3021', 'o'),
        ('ICb1299', 's'),
    ])

    colour_groups = meta.batch.copy()
    colour_dict = dict([
        ('2017-09-19', '#7fc97f'),
        ('2018-01-12', '#beaed4'),
Example #26
if __name__ == '__main__':

    def generate_data(num_samples=100):
        # The desired mean values of the sample.
        mu = np.array([0.0, 0.0, 0.0])

        # The desired covariance matrix.
        r = np.array([[1.50, 1.25, 1.25], [1.25, 1.50, 1.25],
                      [1.25, 1.25, 1.50]])

        # Generate the random samples.
        y = np.random.multivariate_normal(mu, r, size=num_samples)

        x = y[:, 0:2].T
        y = y[:, 2].T

        return x, y

    x, y = generate_data()

    pca = PrincipalComponentAnalysis(2)
    print(x.shape)
    z = pca.fit_transform(x)
    pca.plot_2d(x)
    plt.scatter(z[0], z[1])

    from sklearn.decomposition import pca
    z = pca.PCA(2).fit_transform(x.T)
    plt.scatter(z[:, 0], z[:, 1])
    plt.show()
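The x.T in the sklearn call is needed because the custom PrincipalComponentAnalysis here works on data shaped (features, samples), while sklearn's PCA expects (samples, features). A short check of that convention on random data (illustrative):

# Sketch: sklearn PCA expects rows = samples, so (features, samples) data is transposed first.
import numpy as np
from sklearn.decomposition import PCA

x = np.random.RandomState(5).normal(size=(2, 100))   # 2 features, 100 samples
z = PCA(n_components=2).fit_transform(x.T)            # transpose to (100, 2)
print(z.shape)                                        # (100, 2)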
Example #27
train_data_missing = (df.isnull().sum() / len(df)) * 100
print(train_data_missing)

# correlation between features and the target: heatmap
corrmat = df.corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=.8, annot=True)
plt.show()

# process the training data: separate features and labels
X = df.values[:, 0:-1]
Y = df.values[:, -1]
X1_test = df_test.values

# PCA preprocessing
pca = pca.PCA(n_components=0.95)
pca.fit(X)
X_pca = pca.transform(X)
X1_pca = pca.transform(X1_test)
'''Model training'''
# split into training and test sets, then train a gradient boosting regressor
X_train, X_test, Y_train, Y_test = train_test_split(X_pca,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=40)

myGBR = GradientBoostingRegressor(alpha=0.9,
                                  criterion='friedman_mse',
                                  init=None,
                                  learning_rate=0.03,
                                  loss='huber',
Example #28
                                                                     g_ang)
                path = path1
                if not os.path.exists(path):
                    continue
                if g_ang == '090':
                    img = cv2.imread(path, 0)
                else:
                    img = cv2.imread(path1, 0)
                img = cv2.resize(img, (64, 64))
                img = img.flatten().astype(np.float32)
                X.append(img)
                y.append(p - 63)
                # y.append(p)
        X = np.asarray(X)
        y = np.asarray(y).astype(np.int32)
        pca_model = pca.PCA(n_components=int(min(X.shape) * 0.2), whiten=False)
        # print(int(min(X.shape)*0.20))
        pca_model.fit(X)
        X = pca_model.transform(X)

        lda_model = LinearDiscriminantAnalysis(n_components=45)
        lda_model.fit(X, y)
        X = lda_model.transform(X)
        nbrs = KNeighborsClassifier(n_neighbors=1,
                                    p=2,
                                    weights='distance',
                                    metric='euclidean')
        nbrs.fit(X, y)

        testX = []
        testy = []
Example #29
    w1 = 1 / (c1 / (c0 + c1 + c2))
    w2 = 1 / (c2 / (c0 + c1 + c2))

    clf = RandomForestClassifier(
        n_estimators=1000,
        min_samples_leaf=10,
        n_jobs=-1,
        class_weight={
            0: w0,
            1: w1,
            2: w2
        },
        #oob_score=True,
    )

    reducedFeatureVector = pca.PCA(n_components=64)
    reducedFeatureVector.fit(X.values[learningStartIdx:learningEndIdx])

    clf = clf.fit(
        reducedFeatureVector.transform(
            X.values[learningStartIdx:learningEndIdx]),
        Y.values[learningStartIdx:learningEndIdx])
    Y_Pred = clf.predict(
        reducedFeatureVector.transform(
            X.values[learningStartIdx:learningEndIdx]))
    print("Training Performance")
    print(
        classification_report(Y.values[learningStartIdx:learningEndIdx],
                              Y_Pred,
                              labels=[0, 1, 2],
                              target_names=['down', 'flat', 'up']))
Example #30
        X.append(X1_Data_Norm[i, j, :])
        Y.append(Y1[[i], [j]])
X = np.array(X)
Y = np.array(Y)
X = np.nan_to_num(X)

#np.save("new_result2/Gabor_norm.npy",X)

print('PCA')
scaler = StandardScaler()
scaler.fit(X)
print(np.shape(X))
x_train = scaler.transform(X)

K = 250
model = pca.PCA(n_components=K).fit(x_train)
Z = model.transform(x_train)
print(np.shape(Z))
np.save('new_result2/Gabor_norm_pca.npy', Z)
print(np.sum(Y))

kf = KFold(n_splits=10)

li = []
for i in range(8700):
    li.append(i)
random.shuffle(li)
final_result = []
for train_n, test_n in kf.split(Z):
    X_train1 = []
    X_test1 = []