# Example 1
class UoINMF(BaseEstimator, TransformerMixin):
    """Union of Intersections Nonnegative Matrix Factorization (UoI-NMF).

    Bases are learned with NMF on bootstrap resamples of the data for a
    range of ranks, clustered with DBSCAN, and each cluster is collapsed
    into one consensus basis. Data are then projected on the consensus
    bases with a non-negative regression.
    """

    def __init__(self, n_bootstraps=10,
                 random_state=None, cons_meth=None,
                 ranks=None, nmf=None, dbscan=None, nnreg=None):
        """
        Union of Intersections Nonnegative Matrix Factorization

        Parameters
        ----------
        n_bootstraps : int
            number of bootstraps to use for model selection
        random_state : int, np.random.RandomState or None, default None
            seed or random state used to draw the bootstrap samples.
            If None, the global ``np.random`` module is used.
        cons_meth : callable or None, default None
            method for computing consensus bases after clustering. It is
            called as ``cons_meth(bases, axis=0)``. By default uses
            np.median
        ranks : int or list or None, default None
            the range of k to use. If *ranks* is an int,
            range(2, ranks+1) will be used. If not specified,
            range(2, X.shape[1]+1) is used at fit time.
        nmf : estimator instance or None, default None
            the NMF object to use. Note: this object must accept
            *n_components* through ``set_params``. By default an ``NMF``
            with Kullback-Leibler loss and multiplicative updates is used.
        dbscan : estimator instance or None, default None
            DBSCAN object to use. By default use ``DBSCAN`` with
            ``min_samples`` equal to half the number of bootstraps
            (rounded up).
        nnreg : estimator instance or None, default None
            Non-negative regressor to use; it must provide ``fit`` and
            expose ``coef_``. Default uses scipy.optimize.nnls
        """
        self.__initialize(
            n_bootstraps=n_bootstraps,
            ranks=ranks,
            nmf=nmf,
            dbscan=dbscan,
            random_state=random_state,
            nnreg=nnreg,
            cons_meth=cons_meth,
        )

    def set_params(self, **kwargs):
        """Re-initialize the estimator from a full set of parameters.

        All constructor parameters must be given as keyword arguments
        (a missing one raises ``KeyError``). Any previously fitted state
        is discarded. Returns the estimator itself.
        """
        self.__initialize(**kwargs)
        return self

    def __initialize(self, **kwargs):
        """Validate the parameters, store them and reset fitted state."""
        n_bootstraps = kwargs['n_bootstraps']
        ranks = kwargs['ranks']
        nmf = kwargs['nmf']
        dbscan = kwargs['dbscan']
        random_state = kwargs['random_state']
        nnreg = kwargs['nnreg']
        cons_meth = kwargs['cons_meth']

        self.n_bootstraps = n_bootstraps

        if ranks is None:
            # resolved in fit(), once the number of features is known
            self.ranks = None
        elif isinstance(ranks, (int, np.integer)):
            self.ranks = list(range(2, ranks + 1))
        elif isinstance(ranks, (list, tuple, range, np.ndarray)):
            self.ranks = tuple(ranks)
        else:
            raise ValueError('specify a max value or an array-like for k')

        if nmf is not None:
            if isinstance(nmf, type):
                raise ValueError('nmf must be an instance, not a class')
            self.nmf = nmf
        else:
            self.nmf = NMF(beta_loss='kullback-leibler', solver='mu',
                           max_iter=400, init='random')

        if dbscan is not None:
            if isinstance(dbscan, type):
                raise ValueError('dbscan must be an instance, not a class')
            self.dbscan = dbscan
        else:
            # min_samples must be an integer; rounding n_bootstraps / 2 up
            # gives the same core-point threshold as the fractional value.
            self.dbscan = DBSCAN(
                min_samples=int(np.ceil(self.n_bootstraps / 2)))

        self.random_state = random_state
        if random_state is None:
            self._rand = np.random
        elif isinstance(random_state, (int, np.integer)):
            self._rand = np.random.RandomState(random_state)
        elif isinstance(random_state, np.random.RandomState):
            self._rand = random_state
        else:
            raise ValueError(
                'random_state must be None, an int or a RandomState')

        if cons_meth is None:
            # the method for computing consensus H bases after clustering
            self.cons_meth = np.median
        else:
            self.cons_meth = cons_meth

        if nnreg is None:
            self.nnreg = lambda A, b: spo.nnls(A, b)[0]
        elif callable(getattr(nnreg, 'fit', None)):
            self.nnreg = lambda A, B: nnreg.fit(A, B).coef_
        else:
            raise ValueError("unrecognized regressor")

        # fitted state; populated by fit() / transform()
        self.components_ = None
        self.bases_samples_ = None
        self.bases_samples_labels_ = None
        self.boostraps_ = None
        self.reconstruction_err_ = None

    def fit(self, X, y=None):
        """
        Perform first phase of UoI NMF decomposition.

        Compute H matrix.

        Iterate across a range of k (as specified with the *ranks* argument).

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            non-negative data matrix
        y : ignored

        Returns
        -------
        self
        """
        check_non_negative(X, 'UoINMF')
        n, p = X.shape
        ranks = self.ranks
        if ranks is None:
            # same convention as an integer *ranks* equal to n_features
            ranks = tuple(range(2, p + 1))

        k_tot = sum(ranks)
        n_H_samples = k_tot * self.n_bootstraps
        H_samples = np.zeros((n_H_samples, p), dtype=np.float64)
        # one row of sample indices (drawn with replacement) per bootstrap
        rep_idx = self._rand.randint(n, size=(self.n_bootstraps, n))
        for i in range(self.n_bootstraps):
            # compute NMF bases for k across bootstrap replicates
            H_i = i * k_tot
            sample = X[rep_idx[i]]
            for k in ranks:
                # concatenate k by p
                H_samples[H_i:H_i + k] = (
                    self.nmf.set_params(n_components=k)
                    .fit(sample).components_)
                H_i += k
        # remove zero bases
        H_samples = H_samples[np.sum(H_samples, axis=1) != 0.0]
        # normalize each basis (row) by its 2-norm
        H_samples = normalize(H_samples)

        # cluster all bases
        labels = np.asarray(self.dbscan.fit_predict(H_samples))

        # compute consensus bases from clusters; label -1 marks bases that
        # were not assigned to any cluster and is skipped
        cluster_ids = np.unique(labels[labels != -1])
        H_cons = np.zeros((len(cluster_ids), p), dtype=np.float64)
        for row, cluster_id in enumerate(cluster_ids):
            H_cons[row, :] = self.cons_meth(
                H_samples[labels == cluster_id], axis=0)
        # TODO: check if rows containing NaNs need to be removed
        # normalize by 2-norm
        H_cons = normalize(H_cons)

        self.components_ = H_cons
        self.bases_samples_ = H_samples
        self.bases_samples_labels_ = labels
        self.boostraps_ = rep_idx
        self.reconstruction_err_ = None
        return self

    def transform(self, X, reconstruction_err=True):
        """
        Transform the data X according to the fitted UoI-NMF model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        reconstruction_err : boolean
            True to compute reconstruction error, False otherwise.
            default True.

        Returns
        -------
        W : array, shape (n_samples, n_components)
            Transformed data.

        Raises
        ------
        ValueError
            if the model is not fit or X has the wrong number of features
        """
        if self.components_ is None:
            raise ValueError('UoINMF not fit')
        if X.shape[1] != self.components_.shape[1]:
            raise ValueError(
                'incompatible shape: cannot reconstruct with %s and %s'
                % (X.shape, self.components_.shape))
        H_t = self.components_.T
        # regression coefficients are real-valued: do not truncate them
        # when X has an integer dtype
        out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) \
            else np.float64
        ret = np.zeros((X.shape[0], self.components_.shape[0]),
                       dtype=out_dtype)
        for i in range(X.shape[0]):
            ret[i] = self.nnreg(H_t, X[i])
        if reconstruction_err:
            self.reconstruction_err_ = np.linalg.norm(
                X - self.inverse_transform(ret))
        return ret

    def fit_transform(self, X, y=None, reconstruction_err=True):
        """
        Fit the UoI-NMF model on X, then transform X with it

        Parameters
        ----------
        X : array-like; shape (n_samples, n_features)
        y : ignored
        reconstruction_err : bool
            True to compute reconstruction error, False otherwise.
            default True.

        Returns
        -------
        W : array-like; shape (n_samples, n_components)
            Transformed data.
        """
        self.fit(X)
        return self.transform(X, reconstruction_err=reconstruction_err)

    def inverse_transform(self, W):
        """
        Transform data back to its original space.

        Parameters
        ----------
        W : array-like; shape (n_samples, n_components)
            Transformed data matrix.

        Returns
        -------
        X : array-like; shape (n_samples, n_features)
            Data matrix of original shape.

        Raises
        ------
        ValueError
            if the model is not fit or W has the wrong number of components
        """
        if self.components_ is None:
            raise ValueError('UoINMF not fit')
        if W.shape[1] != self.components_.shape[0]:
            raise ValueError(
                'incompatible shape: cannot multiply %s with %s'
                % (W.shape, self.components_.shape))
        return np.matmul(W, self.components_)
# Example 2
# Dataset name under which the reduced features are stored, per method.
_REDUCED_DATASET_NAMES = {
    'PCA': 'pca',
    't-SNE': 'tsne',
    'TruncatedSVD': 'tsvd',
    'LDA': 'lda',
    'NMF': 'nmf',
}


def _load_features(path):
    """Read the 'feats', 'labels' and 'img_id' datasets of an .h5 file."""
    with h5py.File(path, 'r') as h5_file:
        features = h5_file['feats'][()]
        labels = np.squeeze(h5_file['labels'][()])
        img_ids = h5_file['img_id'][()]
    return features, labels, img_ids


def _save_reduced(path, dataset_name, reduced, labels, img_ids):
    """Write reduced features, labels and image ids to a new .h5 file."""
    with h5py.File(path, "w") as h5_file:
        h5_file.create_dataset('img_id', data=img_ids[:], dtype="S40")
        h5_file.create_dataset('labels', data=labels.T, compression="gzip")
        h5_file.create_dataset(dataset_name, data=reduced.T,
                               compression="gzip")


def _scatter_groups(ax, data, labels, names, colors, n_dims, first_color=0):
    """Scatter the rows of *data* one label value at a time.

    Group ``i`` (rows where ``labels == i``) is drawn with
    ``colors[first_color + i]`` and legend label ``names[i]``. For 1-D
    data every point is drawn at height 1.
    """
    for i in range(len(names)):
        mask = labels == i
        coords = [data[mask, d] for d in range(n_dims)]
        if n_dims == 1:
            coords.append(np.ones(coords[0].shape))
        ax.scatter(*coords, c=colors[first_color + i], label=names[i])


def dimensionality_reduction(TrainFeatures, TestFeatures, Method, params):
    """Reduce the dimensionality of a training and a test features matrix,
    each stored in an .h5 file.

    Five different methods are available.

    Parameters:
        - TrainFeatures: string
            Path of the .h5 file of the training features.
            It contains at least the following datasets:
              - 'feats':  array-like, shape (n_samples, n_features)
              - 'labels': array-like, shape (n_samples, )
              - 'img_id': array-like, shape (n_samples, )
        - TestFeatures: string
            Path of the .h5 file of the test features.
            It contains at least the same datasets.
        - Method: string
            Possible values are:
              - 'PCA': Principal component analysis
              - 't-SNE': t-distributed Stochastic Neighbor Embedding
              - 'TruncatedSVD': Truncated SVD
              - 'NMF': Non-Negative Matrix Factorization
              - 'LDA': Linear Discriminant Analysis
        - params: dict
            Parameters passed to ``set_params`` of the selected estimator
            (see the documentation of that estimator for the accepted
            keys). The key 'n_components' is required. The dictionary
            given by the caller is not modified.
            For t-SNE an additional key is needed: params['reduce'], with
            possible values 'TruncatedSVD', 'PCA', 'None'. It is highly
            recommended to first reduce the number of dimensions to a
            reasonable amount (e.g. 50) with another method (PCA for
            dense data, TruncatedSVD for sparse data) if the number of
            features is very high. This suppresses some noise and speeds
            up the computation of pairwise distances between samples.
              - params['reduce']='TruncatedSVD' : Truncated SVD --> t-SNE
              - params['reduce']='PCA' : PCA --> t-SNE
              - params['reduce']='None' : t-SNE directly

    Returns:
        - X_train: array-like, shape (n_samples, n_components)
        - X_test:  array-like, shape (n_samples, n_components)
        - ax: the axes of the last scatter plot drawn (if n_components<=3)
          or None (if n_components>3)

    Furthermore, two new .h5 files containing 3 datasets each (reduced
    features, labels and img_id) are written in the folder
    Results/ReducedFeatures, and if n_components is <= 3 a scatter plot is
    saved in the folder Results/Plots (an animated .gif for 3 components).

    Raises:
        TypeError: if Method is not one of the supported methods.

    Example usage:
        import FeaturesReduction as fr
        import matplotlib.pyplot as plt

        params = {'n_components': 2}
        X_train, X_test, ax = fr.dimensionality_reduction(
            'train.h5', 'test.h5', 'PCA', params)
        plt.show()
    """
    if Method not in _REDUCED_DATASET_NAMES:
        raise TypeError(
            "Invalid method: possible methods are 'PCA', 't-SNE', 'TruncatedSVD', 'NMF' and 'LDA'"
        )

    # Work on a copy: keys are added and removed below
    params = dict(params)
    n_comp = params['n_components']

    # Load training and test features files
    train_features, train_labels, train_img_ids = _load_features(
        TrainFeatures)
    test_features, test_labels, test_img_ids = _load_features(TestFeatures)

    # Get categories of the training set from features ids
    categories = mf.get_categories(train_img_ids)

    if Method != 'NMF':
        # Standardize features by removing the mean and scaling to unit
        # variance (skipped for NMF, which needs non-negative input)
        scaler = StandardScaler().fit(train_features)
        train_features = scaler.transform(train_features)
        test_features = scaler.transform(test_features)

    if Method == 'LDA':
        lda = LDA()
        lda.set_params(**params)
        # LDA is supervised: it is fitted with the training labels
        X_train = lda.fit_transform(train_features, train_labels)
        X_test = lda.transform(test_features)

    elif Method == 't-SNE':
        red = params.pop('reduce')
        params['verbose'] = True

        # Optional pre-reduction to K dimensions before t-SNE
        K = 50 if n_comp < 50 else n_comp * 2
        pre_reducer = None
        if red == 'TruncatedSVD':
            pre_reducer = TruncatedSVD(n_components=K)
        elif red == 'PCA':
            pre_reducer = PCA(n_components=K)
        if pre_reducer is not None:
            train_features = pre_reducer.fit_transform(train_features)
            test_features = pre_reducer.transform(test_features)

        tsne = TSNE()
        tsne.set_params(**params)
        # The model is fitted on training and test set together, then the
        # embedding is split back in the two sets
        n_train = train_features.shape[0]
        features = np.concatenate((train_features, test_features), axis=0)
        X = tsne.fit_transform(features)
        X_train = X[:n_train, :]
        X_test = X[n_train:, :]

    else:
        if Method == 'NMF':
            params['verbose'] = True
        estimators = {'PCA': PCA, 'NMF': NMF, 'TruncatedSVD': TruncatedSVD}
        model = estimators[Method]()
        model.set_params(**params)
        # Fit on the training features only, then apply to both sets
        X_train = model.fit_transform(train_features)
        X_test = model.transform(test_features)

    # Create folder in which save reduced features
    mf.folders_creator('Results', ['ReducedFeatures'])

    # Store the reduced training and test sets in one .h5 file each
    dataset_name = _REDUCED_DATASET_NAMES[Method]
    prefix = Method + str(n_comp) + '_'
    train_stem = os.path.basename(TrainFeatures).split('.')[0]
    test_stem = os.path.basename(TestFeatures).split('.')[0]
    reduced_dir = os.path.join('Results', 'ReducedFeatures')
    _save_reduced(os.path.join(reduced_dir, prefix + train_stem + '.h5'),
                  dataset_name, X_train, train_labels, train_img_ids)
    _save_reduced(os.path.join(reduced_dir, prefix + test_stem + '.h5'),
                  dataset_name, X_test, test_labels, test_img_ids)

    ax = None
    if n_comp < 4:
        # Get folders list of the test set from features ids
        test_folders = mf.get_categories(test_img_ids)
        n_folders_test = len(test_folders)
        # Make some names for the plot legend
        tf = ['Test' + str(i) for i in range(n_folders_test)]

        # Define a list of colors in hexadecimal format
        if len(categories) + n_folders_test < 9:
            colors = [
                '#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#00FFFF',
                '#808080', '#FF00FF', '#000000'
            ]
        else:
            n = 250
            max_value = 255**3
            interval = int(max_value / n)
            colors = [
                '#' + hex(i)[2:].zfill(6)
                for i in range(0, max_value, interval)
            ]
            colors = colors[:int((n + 1) / 10 * 9)]

        # Create a folder to save images
        mf.folders_creator('Results', ['Plots'])
        plots_dir = os.path.join('Results', 'Plots')

        # Name of the image: method, n_components and the training file
        # name without its last '_'-separated part
        name = '_'.join((prefix + train_stem).split('_')[:-1])

        # Test groups take the colors following those of the categories
        n_cat = len(categories)

        if n_comp == 1:
            # Plot 1D Data with different colors
            fig, ax = plt.subplots()
            _scatter_groups(ax, X_train, train_labels, categories, colors, 1)
            _scatter_groups(ax, X_test, test_labels, tf, colors, 1, n_cat)
            # Save image in .png format
            plt.savefig(os.path.join(plots_dir, name + '.png'))

        elif n_comp == 2:
            # Plot 2D Data with different colors
            fig, ax = plt.subplots()
            _scatter_groups(ax, X_train, train_labels, categories, colors, 2)
            _scatter_groups(ax, X_test, test_labels, tf, colors, 2, n_cat)
            # Save image in .png format
            plt.savefig(os.path.join(plots_dir, name + '.png'))

            # Remove outliers
            keep_train = np.logical_not(mf.is_outlier(X_train, thresh=3.5))
            keep_test = np.logical_not(mf.is_outlier(X_test, thresh=3.5))
            X_train2 = X_train[keep_train, :]
            X_test2 = X_test[keep_test, :]

            if (X_train2.shape[0] != X_train.shape[0]
                    or X_test2.shape[0] != X_test.shape[0]):
                train_labels2 = train_labels[keep_train]
                test_labels2 = test_labels[keep_test]

                # Plot 2D Data without outliers with different colors
                fig, ax = plt.subplots()
                _scatter_groups(ax, X_train2, train_labels2, categories,
                                colors, 2)
                _scatter_groups(ax, X_test2, test_labels2, tf, colors, 2,
                                n_cat)
                # Save image in .png format
                # NOTE(review): file name suffix reconstructed - confirm
                plt.savefig(
                    os.path.join(plots_dir, name + '_no_outliers.png'))

        elif n_comp == 3:
            mf.folders_creator(plots_dir, ['tmp'])
            tmp_dir = os.path.join(plots_dir, 'tmp')
            # Plot 3-D Data with different colors
            ax = plt.subplot(111, projection='3d')
            _scatter_groups(ax, X_train, train_labels, categories, colors, 3)
            _scatter_groups(ax, X_test, test_labels, tf, colors, 3, n_cat)
            ax.legend(loc='upper left',
                      bbox_to_anchor=(0, 0))

            # Rotate for 360 degrees and save a frame every 10 degrees
            for angle in range(0, 360, 10):
                ax.view_init(30, angle)
                plt.savefig(
                    os.path.join(tmp_dir, name + str(angle) + '.png'))
            # Save as a .gif image, then drop the temporary frames
            mf.imagesfolder_to_gif(os.path.join(plots_dir, name + '.gif'),
                                   tmp_dir, 0.2)
            shutil.rmtree(tmp_dir)

    return X_train, X_test, ax