def task(args):
    import pandas
    data_set, = args
    logging.info("dataset = %s", data_set)
    # read the data sets
    logging.info("Reading data...")
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % data.shape)

    x = data.to_numpy(dtype=float)  # DataFrame.as_matrix() was removed in pandas 1.0

    if normalize_data:
        # these shouldn't affect the results
        x -= np.mean(x)
        x /= np.std(x)
        x -= np.mean(x, axis=0)

    logging.info("Running PCA...")
    pca = sk_PCA()
    pca.fit(x)

    logging.info("Writing results...")
    res_dir = 'res/pca-explained-variance'
    res_filename = "%s/%s.txt" % (res_dir, data_set)
    ensure_dir_exists(res_dir)
    np.savetxt(res_filename, pca.explained_variance_ratio_)
Example #2
def correct_tvn(df, embeds_cols, verbose=False):
    if verbose:
        print('Do TVN')

    dmso = df.loc[(df['compound'] == 'DMSO'), embeds_cols].to_numpy(copy=True)
    p = sk_PCA(n_components=len(embeds_cols), whiten=True).fit(dmso)
    df.loc[:, embeds_cols] = p.transform(df.loc[:, embeds_cols])
    return df
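A minimal usage sketch for correct_tvn (the embedding column names and random data below are hypothetical, for illustration only; sk_PCA is assumed to be sklearn.decomposition.PCA):

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as sk_PCA

# hypothetical embeddings: 40 wells, 3 embedding dimensions, half of them DMSO controls
rng = np.random.default_rng(0)
embeds_cols = ['emb_0', 'emb_1', 'emb_2']
df = pd.DataFrame(rng.normal(size=(40, 3)), columns=embeds_cols)
df['compound'] = ['DMSO'] * 20 + ['drugA'] * 20

# the whitening transform is fit on the DMSO control rows only, then applied to every row
df = correct_tvn(df, embeds_cols, verbose=True)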
Example #3
def assign_clusters(df_well, embeds_cols, min_cluster_size=10, min_samples=3):
    pca_image = sk_PCA(n_components=number_of_components_95(df_well, embeds_cols)).fit_transform(df_well[embeds_cols])
    tsne_image = sk_TSNE(metric='cosine', n_jobs=1).fit_transform(pca_image)
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size, metric='manhattan', min_samples=min_samples).fit(tsne_image)

    # tsne_image = sk_TSNE(metric='cosine', n_jobs=1).fit_transform(df_well[embeds_cols])
    # clusterer = HDBSCAN(min_cluster_size=min_cluster_size, metric='manhattan', min_samples=min_samples).fit(tsne_image)
    return clusterer.labels_, clusterer.labels_.max(), tsne_image
Example #4
    def PCA(self, X, cols, n_components):
        X = X.to_pandas()
        pca = sk_PCA(n_components=n_components)
        pca.fit(X[cols])
        X = pca.transform(X[cols])
        X = pd.DataFrame(X)
        print(
            f'PCA number of used components: {len(pca.explained_variance_ratio_)}'
        )
        features = list(np.arange(len(pca.explained_variance_ratio_)))

        return cudf.from_pandas(X), features
Example #5
def number_of_components_95(df, embeds_cols):
    # PCA on all embeddings
    pca = sk_PCA().fit(df[embeds_cols])

    # Find the number of dimensions to explain 95% of variance
    i = 0
    s = 0
    for j in range(len(pca.explained_variance_ratio_)):
        s += pca.explained_variance_ratio_[j]
        if s > 0.95:
            # j + 1 components are needed to exceed 95%
            i = j + 1
            break
    # There should be at least 8 dimensions
    return max(i, 8)
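An equivalent, loop-free way to compute the same count (a sketch, not part of the original code; it assumes scikit-learn's PCA as used above):

import numpy as np
from sklearn.decomposition import PCA as sk_PCA

def number_of_components_95_vectorized(df, embeds_cols, min_components=8):
    # cumulative explained variance; the first position where it exceeds 0.95
    # gives the number of components needed
    ratios = sk_PCA().fit(df[embeds_cols]).explained_variance_ratio_
    n = int(np.searchsorted(np.cumsum(ratios), 0.95, side='right') + 1)
    return max(n, min_components)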
Example #6
    def learn(self, x, y, log_file_prefix=None, callbacks=[]):
        # define optimizer and loss function
        self.encoder.compile(optimizer=self.optimizer, loss=self.loss)

        if (self.init_pca):
            # initialize weights of the first and last layer using PCA
            from sklearn.decomposition import PCA as sk_PCA
            #weights = self.encoder.layers[1].get_weights()
            weights = self.first_dense_enc_layer.get_weights()
            dim = weights[1].size
            w = sk_PCA(n_components=dim).fit(x).components_
            weights[0][:, :] = w.T
            weights[1][:] = -np.mean(np.dot(x, w.T), axis=0)
            #self.encoder.layers[1].set_weights(weights)
            self.first_dense_enc_layer.set_weights(weights)

        # optionally add callbacks
        keras_callbacks = []
        if log_file_prefix:
            #keras_callbacks.append(keras.callbacks.CSVLogger(log_file_prefix + ".log"))
            keras_callbacks.append(WeightLogger(self.encoder, log_file_prefix))
            keras_callbacks.append(LossLogger(log_file_prefix))
        for callback in callbacks:

            class CB(keras.callbacks.Callback):
                # bind the current callback via a default argument to avoid the
                # late-binding closure bug (every CB would otherwise call the
                # last callback in the list)
                def on_train_begin(self, logs={}, _cb=callback):
                    _cb()

                def on_epoch_end(self, epoch, logs={}, _cb=callback):
                    _cb()

            keras_callbacks.append(CB())

        # train
        self.encoder.fit(x,
                         y,
                         nb_epoch=self.n_epochs,
                         batch_size=self.batch_size,
                         shuffle=True,
                         callbacks=keras_callbacks,
                         verbose=2)
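As a sanity check on the PCA weight initialization used in learn() above, the following sketch (standalone, with made-up data) verifies that a dense layer whose kernel is components_.T and whose bias is -mean(x @ components_.T) computes exactly the centered PCA projection, i.e. the same values as pca.transform(x):

import numpy as np
from sklearn.decomposition import PCA as sk_PCA

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 20))
pca = sk_PCA(n_components=5).fit(x)
w = pca.components_                       # shape (5, 20)
kernel = w.T                              # dense-layer weights, shape (20, 5)
bias = -np.mean(np.dot(x, w.T), axis=0)   # dense-layer bias, shape (5,)
print(np.allclose(np.dot(x, kernel) + bias, pca.transform(x)))  # True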
Example #7
def task(args):
  data_type, seed, (algName, _, makeAlg) = args
  logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed, algName)
  # read the data sets
  logging.info("Reading data...")
  y_train, x_train, y_test, x_test = dataReader.main("%s_%d" % (data_type, seed))
  data_dim = x_train.shape[1]
  logging.info(" * training set: %d x %d" % x_train.shape)
  logging.info(" * testing set: %d x %d" % x_test.shape)
  # init rng  
  np.random.seed(seed)

  # NOTE: the test set is overwritten with the training set, so the progress
  # metric below is computed on the training data
  x_test = x_train

  logging.info("Running and evaluating the algorithm...")
  
  # init the algorithm
  alg = makeAlg(data_dim, repr_dim)
  
  # create output dir if does not exist
  ensure_dir_exists('res')

  from sklearn.decomposition import PCA as sk_PCA
  pca = sk_PCA(n_components=repr_dim)
  pca.fit(x_train)
  y_train = pca.transform(x_train)
  y_test = pca.transform(x_test)

  # define the progress saving function
  progress_filename = 'res/progress-enc-mse-%s-%d-%s.txt' % (data_type, seed, algName)
  progress_file = open(progress_filename, 'w', encoding='utf-8')
  def save_progress():
    y_test_pred = alg.encode(x_test)
    rel_mse = relative_mean_squared_error(y_test, y_test_pred)
    progress_file.write("%g\n" % rel_mse)
    progress_file.flush()  # the file is never explicitly closed, so flush after each write
  
  # fit
  alg.learn(x_train, y_train,
            log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
            callbacks=[save_progress])
Example #8
    def learn(self,
              x,
              y=None,
              secondary_y=None,
              validation_split=0.0,
              validation_data=None,
              secondary_validation_data=None,
              log_file_prefix=None,
              per_epoch_callback_funs=[],
              callbacks=[]):

        # define optimizer and loss function
        if secondary_y is None:
            loss = self.params.loss
            loss_weights = None
        else:
            assert self.params.secondary_dims is not None
            loss = [self.params.loss, self.params.secondary_loss]
            loss_weights = [
                1 - self.params.secondary_loss_weight,
                self.params.secondary_loss_weight
            ]
        self.autoencoder.compile(optimizer=self.params.optimizer,
                                 loss=loss,
                                 loss_weights=loss_weights)

        if (self.params.init_pca):
            # initialize weights of the first and last layer using PCA
            from sklearn.decomposition import PCA as sk_PCA
            #weights = self.autoencoder.layers[1].get_weights()
            weights = self.first_dense_enc_layer.get_weights()
            dim = weights[1].size
            w = sk_PCA(n_components=dim).fit(x).components_
            weights[0][:, :] = w.T
            weights[1][:] = -np.mean(np.dot(x, w.T), axis=0)
            #self.autoencoder.layers[1].set_weights(weights)
            self.first_dense_enc_layer.set_weights(weights)
            #weights = self.autoencoder.layers[-1].get_weights()
            weights = self.last_dense_dec_layer.get_weights()
            weights[0][:, :] = w
            weights[1][:] = np.mean(x, axis=0)
            #self.autoencoder.layers[-1].set_weights(weights)
            self.last_dense_dec_layer.set_weights(weights)

        # possible validation data
        if validation_data is not None and y is None:
            if secondary_validation_data is not None:
                validation_data = (validation_data, [
                    validation_data, secondary_validation_data
                ])
            else:
                validation_data = (validation_data, validation_data)

        validation = (validation_data is not None or validation_split > 0)

        # by default predict the data itself
        if y is None:
            y = x

        if secondary_y is not None:
            y = [y, secondary_y]

        # optionally add callbacks
        keras_callbacks = []
        # 'built-in' callbacks
        if log_file_prefix:
            #keras_callbacks.append(keras.callbacks.CSVLogger(log_file_prefix + ".log"))
            if self.params.log_weights:
                keras_callbacks.append(
                    WeightLogger(self.autoencoder, log_file_prefix))
            if self.params.log_weights_diff_norm is not None:
                keras_callbacks.append(
                    WeightDiffStatLogger(self.autoencoder, log_file_prefix,
                                         self.params.log_weights_diff_norm))
            if self.params.log_loss:
                keras_callbacks.append(
                    LossLogger(log_file_prefix,
                               per_patch=self.log_loss_per_patch))
                if secondary_y is not None:
                    keras_callbacks.append(
                        LossLogger(log_file_prefix,
                                   loss='decoded_loss',
                                   per_patch=self.log_loss_per_patch))
                    keras_callbacks.append(
                        LossLogger(log_file_prefix,
                                   loss='secondary_loss',
                                   per_patch=self.log_loss_per_patch))
            if self.params.log_loss and validation:
                keras_callbacks.append(
                    LossLogger(log_file_prefix, loss='val_loss'))
                if secondary_y is not None:
                    keras_callbacks.append(
                        LossLogger(log_file_prefix, loss='val_decoded_loss'))
                    keras_callbacks.append(
                        LossLogger(log_file_prefix, loss='val_secondary_loss'))
        # externally defined keras callback objects
        keras_callbacks.extend(callbacks)
        # externally defined callback functions
        for callback in per_epoch_callback_funs:

            class CB(keras.callbacks.Callback):
                # bind the current callback via a default argument to avoid the
                # late-binding closure bug (every CB would otherwise call the
                # last callback in the list)
                def on_train_begin(self, logs={}, _cb=callback):
                    _cb()

                def on_epoch_end(self, epoch, logs={}, _cb=callback):
                    _cb()

            keras_callbacks.append(CB())

        if self.params.early_stopping:
            if self.params.early_stopping == True:
                monitor = ('val_loss' if validation else 'loss')
            else:
                monitor = self.params.early_stopping
            keras_callbacks.append(
                keras.callbacks.EarlyStopping(
                    monitor=monitor,
                    patience=self.params.early_stopping_patience))

        # optional pre train
        if self.params.pre_train is not None:
            method, params = self.params.pre_train
            if method == "pca":
                # fit encoder and decoder separately to PCA output
                pre_n_epochs, = params
                from sklearn.decomposition import PCA as sk_PCA
                y = sk_PCA(
                    n_components=self.params.output_dim).fit_transform(x)
                pretrain_combined = True
                if pretrain_combined:
                    # (pre)train both encoder and decoder at the same time
                    encoder_and_decoder = Model(
                        input=[self.input, self.encoded_input],
                        output=[self.encoded, self.decoded])
                    encoder_and_decoder.compile(
                        optimizer=self.params.optimizer, loss=self.params.loss)
                    pretrain_keras_callbacks = keras_callbacks.copy()
                    if log_file_prefix:
                        pretrain_keras_callbacks.append(
                            LossLogger(log_file_prefix, loss='encoded_loss'))
                        pretrain_keras_callbacks.append(
                            LossLogger(log_file_prefix, loss='decoded_loss'))
                    encoder_and_decoder.fit([x, y], [y, x],
                                            nb_epoch=pre_n_epochs,
                                            batch_size=self.params.batch_size,
                                            shuffle=True,
                                            callbacks=pretrain_keras_callbacks,
                                            verbose=2)
                else:
                    self.encoder.compile(optimizer=self.params.optimizer,
                                         loss=self.params.loss)
                    self.encoder.fit(x,
                                     y,
                                     nb_epoch=pre_n_epochs,
                                     batch_size=self.params.batch_size,
                                     shuffle=True,
                                     callbacks=keras_callbacks,
                                     verbose=2)
                    self.decoder.compile(optimizer=self.params.optimizer,
                                         loss=self.params.loss)
                    self.decoder.fit(y,
                                     x,
                                     nb_epoch=pre_n_epochs,
                                     batch_size=self.params.batch_size,
                                     shuffle=True,
                                     callbacks=keras_callbacks,
                                     verbose=2)
            else:
                raise ValueError("Invalid pre train method '%s'" % method)

        # train
        self.autoencoder.fit(x,
                             y,
                             nb_epoch=self.params.n_epochs,
                             batch_size=self.params.batch_size,
                             shuffle=True,
                             validation_split=validation_split,
                             validation_data=validation_data,
                             callbacks=keras_callbacks,
                             verbose=2)
Example #9
    def init(self, input_dim, output_dim):
        self.pca = sk_PCA(n_components=output_dim)
        return self
Example #10
def preprocess_features(npdata, n_components=16, method='PCA', n_jobs=1):
    """Preprocess an array of features.
    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    # Apply PCA-whitening with Faiss
    if method == 'PCA':
        mat = faiss.PCAMatrix(ndim, n_components, eigen_power=-0.5)
        mat.train(npdata)
        assert mat.is_trained
        npdata = mat.apply_py(npdata)
    # Apply UMAP for dimensionality reduction
    elif method == 'UMAP':
        fit = UMAP(n_components=n_components, metric='cosine')
        npdata = np.ascontiguousarray(fit.fit_transform(npdata))
    # Apply T-SNE for dimensionality reduction
    elif method == 'TSNE':
        if n_components > 3:
            X = sk_PCA().fit_transform(npdata)
            PCAinit = X[:, :n_components] / np.std(X[:, 0]) * 0.0001
            fit = TSNE(n_components=n_components, init=PCAinit, n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(npdata),
                                          dtype='float32')
        else:
            fit = sk_TSNE(n_components=n_components,
                          metric='cosine',
                          n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(npdata))
    # Apply adaptive T-SNE for dimensionality reduction
    elif method == 'AdaptiveTSNE':
        pca = sk_PCA().fit(npdata)

        # Find the number of components that explain 95% of the variance
        i = 0
        s = 0
        for j in range(len(pca.explained_variance_ratio_)):
            s += pca.explained_variance_ratio_[j]
            if s > 0.95:
                # j + 1 components are needed to exceed 95%
                i = j + 1
                break
        # Use at least 8 components
        i = max(i, 8)

        # Fit and transform the data with the number of components that explain 95% of the variance
        pca95_well = sk_PCA(n_components=i).fit_transform(npdata)

        # Do a similarity measure with TSNE on the pca data
        if n_components > 3:
            PCAinit = pca95_well[:, :n_components] / np.std(
                pca95_well[:, 0]) * 0.0001
            fit = TSNE(n_components=n_components, init=PCAinit, n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(pca95_well))
        else:
            fit = sk_TSNE(n_components=n_components,
                          metric='cosine',
                          n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(pca95_well))

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]
    return npdata
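A minimal usage sketch for preprocess_features (random data; only the default 'PCA' branch is exercised, which requires the faiss package):

import numpy as np

# hypothetical features: 1000 samples with 64 dimensions
features = np.random.rand(1000, 64).astype('float32')
reduced = preprocess_features(features, n_components=16, method='PCA')
print(reduced.shape)                          # (1000, 16)
print(np.linalg.norm(reduced, axis=1)[:3])    # rows are L2-normalized, all ~1.0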
Example #11
def main():
    data_size_fix_dim = [
        (100, 2),
        (500, 2),
        (1000, 2),
        (5000, 2),
        (10000, 2),
        (15000, 2),
        (50000, 2),
        (100000, 2),
        (150000, 2),
        (1000000, 2),
    ]
    data_size_fix_num = [
        (10000, 5),
        (10000, 10),
        (10000, 30),
        (10000, 50),
        (10000, 100),
        (10000, 150),
        (10000, 300),
        (10000, 500),
        # (10000, 1000),
        # (10000, 1500),
    ]
    """fix dim"""
    result = []
    for data_size in data_size_fix_dim:
        print(data_size)
        fake_data = np.random.rand(*data_size)

        res = {'sk_pca': [], 'my_pca': []}

        for _ in range(10):
            sk_pca = sk_PCA(n_components=fake_data.shape[1])
            sk_res = calc_process_time({'X': fake_data}, sk_pca.fit)
            res['sk_pca'].append(sk_res)

            my_pca = my_PCA(dim=data_size[1])
            mine_res = calc_process_time({'X': fake_data}, my_pca.fit)
            res['my_pca'].append(mine_res)

        result.append(res)

    aves = {'sk_pca': [], 'my_pca': []}
    for res in result:
        print('-' * 10)
        for k, v in res.items():
            print(k, sum(v) / len(v))
            aves[k].append(sum(v) / len(v))
    print('-' * 10)

    x_ax = np.array(data_size_fix_dim)[:, 0]
    plt.plot(x_ax, aves['sk_pca'], marker="o", label='sklearn')
    plt.plot(x_ax, aves['my_pca'], marker="o", label='mine')
    plt.legend(loc=2)
    timestamp = datetime.now().strftime(TIME_TEMPLATE)
    plt.savefig('./result/bench-res-fix-dim{}.png'.format(timestamp))
    plt.clf()
    plt.close()
    """fix data num"""
    result = []
    for data_size in data_size_fix_num:
        print(data_size)
        fake_data = np.random.rand(*data_size)

        res = {'sk_pca': [], 'my_pca': []}

        for _ in range(10):
            sk_pca = sk_PCA(n_components=fake_data.shape[1])
            sk_res = calc_process_time({'X': fake_data}, sk_pca.fit)
            res['sk_pca'].append(sk_res)

            my_pca = my_PCA(dim=data_size[1])
            mine_res = calc_process_time({'X': fake_data}, my_pca.fit)
            res['my_pca'].append(mine_res)

        result.append(res)

    aves = {'sk_pca': [], 'my_pca': []}
    for res in result:
        print('-' * 10)
        for k, v in res.items():
            print(k, sum(v) / len(v))
            aves[k].append(sum(v) / len(v))
    print('-' * 10)

    x_ax = np.array(data_size_fix_num)[:, 1]
    plt.plot(x_ax, aves['sk_pca'], marker="o", label='sklearn')
    plt.plot(x_ax, aves['my_pca'], marker="o", label='mine')
    plt.legend(loc=2)
    timestamp = datetime.now().strftime(TIME_TEMPLATE)
    plt.savefig('./result/bench-res-fix-num{}.png'.format(timestamp))
    plt.clf()
    plt.close()
Example #12
        a = np.dot(mean_x.T, mean_x)
        # numpy.linalg.eig() computes the eigenvalues and eigenvectors of a square matrix:
        #   w, v = numpy.linalg.eig(a)
        #   w: a vector of eigenvalues. Note: the eigenvalues are not returned in any
        #      particular order and may be complex.
        #   v: a matrix whose columns are the (normalized) eigenvectors; the i-th column
        #      v[:, i] corresponds to the i-th eigenvalue w[i].
        w, v = np.linalg.eig(a)
        z = np.dot(mean_x, v)
        return z


if __name__ == '__main__':
    np.random.seed(12)
    np.set_printoptions(precision=6, suppress=True, linewidth=120)
    data = np.random.random((6, 5))
    python_PCA = PCA(n_components=5)
    sk_PCA_reduction = sk_PCA(n_components=0.95)
    sk_PCA_all = sk_PCA(n_components=5)
    x = np.array(data)
    sk_reduction_out = sk_PCA_reduction.fit_transform(x)
    sk_all_out = sk_PCA_all.fit_transform(x)

    python_svd_out = python_PCA.fit_transform(x)
    python_eig_out = python_PCA.eig_transform(x)

    print("sklearn_reduction")
    print(sk_reduction_out)
    print("sklearn_all")
    print(sk_all_out)
    print("vanilla_svd")
    print(python_svd_out)
    print("python_eig")
    print(python_eig_out)
Example #13
def f_PCA(n_comp=30, reuse=False):
    # LOADING TRAIN SET, Y IS DOWNLOADED FOR PLOTTING PURPOSE
    X_train = np.load('Data_files/X_train_processed.npy', mmap_mode='r')
    Y_train = np.load('Data_files/Y_train.npy', mmap_mode='r')
    if reuse:
        with open('Algos/PCA_folder/PCA.pkl', 'rb') as filehandler:
            pca = pickle.load(filehandler)

    else:
        # CREATES PCA WITH n_comp COMPONENTS TO KEEP
        # AND WHITENS THE DATA (NORMALIZATION)
        # NOTE THAT PCA AUTOMATICALLY CENTERS THE DATA
        pca = sk_PCA(n_components=n_comp, whiten=True)
        pca.fit(X_train)
    # EXTRACTING THE VARIANCES AND THE EIGENSPECTRA FOR VISUALIZATION
    variances = pca.explained_variance_ratio_
    eig_vec = pca.components_

    # LOADING THE WAVELENGTHS FOR PLOTTING
    wavelengths = np.load('Data_files/wavelengths.npy', mmap_mode='r')
    fig = plt.figure()
    # MEANINGFUL EIGENVECTORS/COMPONENTS TO PLOT, SELECTED WITH 1-BASED INDICES
    indexs_list = [8, 11, 20, 26, 30]
    for i in indexs_list:
        plt.plot(wavelengths, eig_vec[i - 1], label='e' + str(i))
    plt.legend(fontsize=18)
    plt.grid()
    plt.xlabel('Wavelengths', fontsize=20)
    plt.ylabel('Intensity', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    # plt.savefig('Images/PCA_first_components.eps',
    #             format="eps", bbox_inches='tight')
    plt.show()
    del wavelengths

    # DICTIONARY CREATION TO ASSOCIATE TARGET TO COLOR
    dic = {}
    dic['0'] = 'b'
    dic['1'] = 'g'
    dic['2'] = 'r'
    dic['3'] = 'k'
    dic['4'] = 'c'
    coords = pca.transform(X_train)
    del X_train
    labels = ['QSO _', 'QSO BAL', 'GALAXY LRG', 'GALAXY ELG', 'STAR']

    # THE FOLLOWING FIGURES PLOT THE DISTRIBUTION OF LABELS
    # ON THE SELECTED COMPONENTS
    fig = plt.figure()
    for i in np.unique(Y_train):
        plt.plot(coords[Y_train == i][:, 19],
                 coords[Y_train == i][:, 25],
                 dic[str(int(i))] + '.',
                 label=labels[int(i)],
                 markersize=7)
    plt.legend(fontsize=18)
    plt.grid()
    plt.xlabel('e20', fontsize=20)
    plt.ylabel('e26', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.savefig('Images/PCA_distribution2D_12.eps',
                format="eps",
                bbox_inches='tight')
    plt.show()

    fig = plt.figure()
    for i in np.unique(Y_train):
        plt.plot(coords[Y_train == i][:, 19],
                 coords[Y_train == i][:, 29],
                 dic[str(int(i))] + '.',
                 label=labels[int(i)],
                 markersize=7)
    plt.legend(fontsize=18)
    plt.grid()
    plt.xlabel('e20', fontsize=20)
    plt.ylabel('e30', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.savefig('Images/PCA_distribution2D_13.eps',
                format="eps",
                bbox_inches='tight')
    plt.show()

    fig = plt.figure()
    for i in np.unique(Y_train):
        plt.plot(coords[Y_train == i][:, 25],
                 coords[Y_train == i][:, 29],
                 dic[str(int(i))] + '.',
                 label=labels[int(i)],
                 markersize=7)
    plt.legend(fontsize=18)
    plt.grid()
    plt.xlabel('e26', fontsize=20)
    plt.ylabel('e30', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.savefig('Images/PCA_distribution2D_23.eps',
                format="eps",
                bbox_inches='tight')
    plt.show()

    # SAVES THE PCA OBJECT
    with open('Algos/PCA_folder/PCA.pkl', 'wb') as filehandler:
        pickle.dump(pca, filehandler)

    return pca, variances, eig_vec