Example #1
    def __init__(self, n_neighbors=5, radius=1.0, fast=False, number_of_hash_functions=400,
                 max_bin_size=50, minimal_blocks_in_common=1, shingle_size=4, excess_factor=5,
                 similarity=False, number_of_cores=None, chunk_size=None, prune_inverse_index=-1,
                 prune_inverse_index_after_instance=-1.0, remove_hash_function_with_less_entries_as=-1,
                 block_size=5, shingle=0, store_value_with_least_sigificant_bit=0,
                 gpu_hashing=0, speed_optimized=None, accuracy_optimized=None):  # cpu_gpu_load_balancing=0,
        self._minHash = MinHash(n_neighbors=n_neighbors, radius=radius,
                fast=fast, number_of_hash_functions=number_of_hash_functions,
                max_bin_size=max_bin_size, minimal_blocks_in_common=minimal_blocks_in_common,
                shingle_size=shingle_size, excess_factor=excess_factor,
                similarity=similarity, number_of_cores=number_of_cores,
                chunk_size=chunk_size, prune_inverse_index=prune_inverse_index,
                prune_inverse_index_after_instance=prune_inverse_index_after_instance,
                remove_hash_function_with_less_entries_as=remove_hash_function_with_less_entries_as,
                block_size=block_size, shingle=shingle,
                store_value_with_least_sigificant_bit=store_value_with_least_sigificant_bit,
                cpu_gpu_load_balancing=0, gpu_hashing=gpu_hashing,
                speed_optimized=speed_optimized, accuracy_optimized=accuracy_optimized)
Example #3
from collections import Counter

from numpy import asarray
from sklearn.metrics import accuracy_score

# NOTE: the MinHash backend class must be importable from this package; its
# exact import path depends on the package layout and is not shown here.


class MinHashClassifier:
    """Classifier implementing the k-nearest neighbors vote on sparse data sets.
        Based on a dimension reduction with minimum hash functions.
        
        Parameters
        ----------
        n_neighbors : int, optional (default = 5)
            Number of neighbors to use by default for :meth:`kneighbors` queries.
        fast : {True, False}, optional (default = False)
            - True:     will only use an inverse index to compute a k_neighbor query.
            - False:    an inverse index is used to preselect instances, and these are used to get
                        the original data from the data set to answer a k_neighbor query. The
                        original data is stored in the memory.
        number_of_hash_functions : int, optional (default = 400)
            Number of hash functions to use for computing the inverse index.
        max_bin_size : int, optional (default = 50)
            The maximal number of collisions for one hash value of one hash function. If a value of a hash
            function has more collisions, it is ignored.
        minimal_blocks_in_common : int, optional (default = 1)
            The minimal number of hash collisions two instances need to have in common to be recognised;
            anything less is ignored.
        shingle_size : int, optional (default = 4)
            Reduction factor for the signature size.
            E.g. number_of_hash_functions=400 and shingle_size=4 --> Size of the signature will be 100
        excess_factor : int, optional (default = 5)
            Factor to internally return more neighbors than requested via n_neighbors. It is useful to increase the
            precision of the :meth:`algorithm=exact` version of the implementation.
            E.g.: n_neighbors = 5, excess_factor = 5. Internally, n_neighbors*excess_factor = 25 neighbors are returned.
            The reduced data set for sklearn.NearestNeighbors is then of size 25 instead of 5.
        number_of_cores : int, optional
            Number of cores to use for OpenMP. If your system does not support OpenMP, this value
            has no effect. If it does and the value is not set, the maximum number of cores is used.
        chunk_size : int, optional
            Number of elements one CPU core should work on. If set to "0", the default behaviour of OpenMP
            is used; e.g. for an 8-core CPU, the chunk_size is set to 8: every core gets 8 elements,
            processes them, and gets another 8 until everything is done. If set to "-1", all cores get the
            same amount of data at once; e.g. with an 8-core CPU and 128 elements to process, every core
            gets 16 elements at once.
        
        Notes
        -----

        The documentation is copied from scikit-learn and was only extended in a few places. All examples are available there.
        Original documentation is available at: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
        
        Sources:
        Basic algorithm:
        http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm

        Idea behind implementation:
        https://en.wikipedia.org/wiki/Locality-sensitive_hashing

        The implementation uses scikit-learn:
        http://scikit-learn.org/dev/index.html
        http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier

        Algorithm based on:
        Heyne, S., Costa, F., Rose, D., & Backofen, R. (2012).
        GraphClust: alignment-free structural clustering of local RNA secondary structures.
        Bioinformatics, 28(12), i224-i232.
        http://bioinformatics.oxfordjournals.org/content/28/12/i224.full.pdf+html"""

    def __init__(self, n_neighbors=5, radius=1.0, fast=False, number_of_hash_functions=400,
                 max_bin_size=50, minimal_blocks_in_common=1, shingle_size=4, excess_factor=5,
                 similarity=False, number_of_cores=None, chunk_size=None, prune_inverse_index=-1,
                 prune_inverse_index_after_instance=-1.0, remove_hash_function_with_less_entries_as=-1,
                 block_size=5, shingle=0, store_value_with_least_sigificant_bit=0,
                 gpu_hashing=0, speed_optimized=None, accuracy_optimized=None):  # cpu_gpu_load_balancing=0,
        self._minHash = MinHash(n_neighbors=n_neighbors, radius=radius,
                fast=fast, number_of_hash_functions=number_of_hash_functions,
                max_bin_size=max_bin_size, minimal_blocks_in_common=minimal_blocks_in_common,
                shingle_size=shingle_size, excess_factor=excess_factor,
                similarity=similarity, number_of_cores=number_of_cores,
                chunk_size=chunk_size, prune_inverse_index=prune_inverse_index,
                prune_inverse_index_after_instance=prune_inverse_index_after_instance,
                remove_hash_function_with_less_entries_as=remove_hash_function_with_less_entries_as,
                block_size=block_size, shingle=shingle,
                store_value_with_least_sigificant_bit=store_value_with_least_sigificant_bit,
                cpu_gpu_load_balancing=0, gpu_hashing=gpu_hashing,
                speed_optimized=speed_optimized, accuracy_optimized=accuracy_optimized)
    
    def __del__(self):
        del self._minHash
    
    def fit(self, X, y):
        """Fit the model using X as training data.

            Parameters
            ----------
            X : {array-like, sparse matrix}
                Training data, shape = [n_samples, n_features]
            y : {array-like, sparse matrix}
                Target values of shape = [n_samples] or [n_samples, n_outputs]"""
        self._minHash.fit(X, y)
       
    def partial_fit(self, X, y):
        """Extend the model by X as additional training data.

            Parameters
            ----------
            X : {array-like, sparse matrix}
                Training data, shape = [n_samples, n_features]
            y : {array-like, sparse matrix}
                Target values of shape = [n_samples] or [n_samples, n_outputs]"""
        self._minHash.partial_fit(X, y)

    def kneighbors(self, X=None, n_neighbors=None, return_distance=True, fast=None):
        """Find the K-neighbors of a point.

            Returns indices of and distances to the neighbors of each point.

            Parameters
            ----------
            X : array-like, last dimension same as that of fit data, optional
                The query point or points.
                If not provided, neighbors of each indexed point are returned.
                In this case, the query point is not considered its own neighbor.
            n_neighbors : int
                Number of neighbors to get (default is the value
                passed to the constructor).
            return_distance : boolean, optional. Defaults to True.
                If False, distances will not be returned
            fast : {True, False}, optional (default = False)
                - True:     will only use an inverse index to compute a k_neighbor query.
                - False:    an inverse index is used to preselect instances, and these are used to get
                            the original data from the data set to answer a k_neighbor query. The
                            original data is stored in the memory.
                If not defined, the default value given to the constructor is used.
            Returns
            -------
            dist : array, shape = [n_samples, n_neighbors]
                Array representing the distances to points, only present if
                return_distance=True
            ind : array, shape = [n_samples, n_neighbors]
                Indices of the nearest points in the population matrix."""
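        # Delegates to the MinHash backend, mirroring sklearn's
        # NearestNeighbors.kneighbors interface; a typical call (names
        # hypothetical) is: dist, ind = clf.kneighbors(X_query, n_neighbors=3)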
        
        return self._minHash.kneighbors(X=X, n_neighbors=n_neighbors, return_distance=return_distance, fast=fast)


    def kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity', fast=None):
        """Computes the (weighted) graph of k-Neighbors for points in X
            Parameters
            ----------
            X : array-like, last dimension same as that of fit data, optional
                The query point or points.
                If not provided, neighbors of each indexed point are returned.
                In this case, the query point is not considered its own neighbor.
            n_neighbors : int
                Number of neighbors for each sample.
                (default is value passed to the constructor).
            mode : {'connectivity', 'distance'}, optional
                Type of returned matrix: 'connectivity' will return the
                connectivity matrix with ones and zeros, in 'distance' the
                edges are Euclidean distance between points.
            fast : {True, False}, optional (default = False)
                - True:     will only use an inverse index to compute a k_neighbor query.
                - False:    an inverse index is used to preselect instances, and these are used to get
                            the original data from the data set to answer a k_neighbor query. The
                            original data is stored in the memory.
                If not defined, the default value given to the constructor is used.
            Returns
            -------
            A : sparse matrix in CSR format, shape = [n_samples, n_samples_fit]
                n_samples_fit is the number of samples in the fitted data
                A[i, j] is assigned the weight of edge that connects i to j."""
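        # Mirrors sklearn's NearestNeighbors.kneighbors_graph: with
        # mode='connectivity' each row of the returned CSR matrix contains
        # ones for the chosen neighbors; with mode='distance' it contains
        # the corresponding distances instead.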
        return self._minHash.kneighbors_graph(X=X, n_neighbors=n_neighbors, mode=mode, fast=fast)


    def predict(self, X, n_neighbors=None, fast=None, similarity=None):
        """Predict the class labels for the provided data.

            Parameters
            ----------
            X : array of shape [n_samples, n_features]
                A 2-D array representing the test points.

            Returns
            -------
            y : array of shape [n_samples] or [n_samples, n_outputs]
                Class labels for each data sample.
        """
        neighbors = self._minHash.kneighbors(X=X, n_neighbors=n_neighbors,
                                                return_distance=False,
                                                fast=fast, similarity=similarity)
        
        result_classification = []
        for instance in neighbors:
            y_value = []
            for instance_ in instance:
                # collect the class labels of all valid neighbors;
                # -1 entries mark queries for which no neighbor was found
                if instance_ != -1:
                    y_value.append(self._minHash._getY()[instance_])
            if len(y_value) > 0:
                # majority vote: take the most frequent class label
                result_classification.append(Counter(y_value).most_common(1)[0][0])
            else:
                result_classification.append(-1)
        return asarray(result_classification)



    def predict_proba(self, X, n_neighbors=None, fast=None, similarity=None):
        """Return probability estimates for the test data X.
            Parameters
            ----------
            X : array, shape = (n_samples, n_features)
                A 2-D array representing the test points.
            Returns
            -------
            p : array of shape = [n_samples, n_classes], or a list of n_outputs
                of such arrays if n_outputs > 1.
                The class probabilities of the input samples. Classes are ordered
                by lexicographic order.
        """
        neighbors = self._minHash.kneighbors(X=X, n_neighbors=n_neighbors,
                                                return_distance=False,
                                                fast=fast, similarity=similarity)
        # class labels are assumed to be integers in [0, number_of_classes)
        number_of_classes = len(set(self._minHash._getY()))
        result_classification = []
        for instance in neighbors:
            y_value = []
            for instance_ in instance:
                # collect the class labels of all valid neighbors;
                # -1 entries mark queries for which no neighbor was found
                if instance_ != -1:
                    y_value.append(self._minHash._getY()[instance_])
            if len(y_value) > 0:
                y_proba = [0.0] * number_of_classes
                # count the frequency of every class among the neighbors
                sorted_classes = Counter(y_value)
                total_class_count = sum(sorted_classes.values())
                # probability = class frequency / total number of neighbors
                for key, value in sorted_classes.items():
                    y_proba[key] = value / float(total_class_count)
                result_classification.append(y_proba[:])
            else:
                # no neighbors found: append a zero row to keep the output
                # aligned with the input samples
                result_classification.append([0.0] * number_of_classes)
        return asarray(result_classification)
        
    def score(self, X, y, sample_weight=None, fast=None):
        """Returns the mean accuracy on the given test data and labels.
        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.
        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True labels for X.
        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.
        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        
        return accuracy_score(y, self.predict(X, fast=fast), sample_weight=sample_weight)
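

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original class. Assumptions: the
    # MinHash backend is importable (see the note above the class) and SciPy
    # is installed; the toy data below is purely illustrative.
    from scipy.sparse import csr_matrix

    X_train = csr_matrix([[1, 0, 0, 1],
                          [0, 1, 1, 0],
                          [1, 1, 0, 0],
                          [0, 0, 1, 1]])
    y_train = [0, 1, 0, 1]

    classifier = MinHashClassifier(n_neighbors=2)
    classifier.fit(X_train, y_train)
    # Majority vote among the 2 nearest neighbors of each training point.
    print(classifier.predict(X_train))
    # Mean accuracy of the predictions against the true labels.
    print(classifier.score(X_train, y_train))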