예제 #1
0
    def get_lambda_pvalues(self, plam_mat, nlam_mat, bip_set=False):
        """Return the p-values for the :math:`\\Lambda`-motifs in ``nlam_mat``.

        Calculate the p-values for the numbers of observed
        :math:`\\Lambda`-motifs as given in the parameter ``nlam_mat`` for the
        bipartite node layer ``bip_set``. The probabilities for the single
        :math:`\\Lambda`-motifs are given in ``plam_mat``.

        If ``bip_set`` corresponds to the constrained bipartite node set, the
        :math:`\\Lambda`-motifs follow a Binomial probability distribution.
        Otherwise, all the node pairs follow the same Poisson Binomial
        probability distribution. The p-values are calculated as

        .. math::

            p_{val}(k) = Pr(X >= k) = 1 - Pr(X < k) = 1 - cdf(k) + pmf(k)

        .. note::
            The lower triangular part (including the diagonal) of the returned
            matrix is set to zero.

        :param plam_mat: matrix of :math:`\\Lambda`-motif probabilities
        :type plam_mat: numpy.array
        :param nlam_mat: matrix of observed number of Lambda motifs
        :type nlam_mat: numpy.array
        :param bip_set: selects row-nodes (``True``) or column-nodes (``False``)
        :type bip_set: bool
        :returns: matrix of the p-values for the :math:`\\Lambda`-motifs
        :rtype: numpy.array

        :raise NameError: raise an error if the parameter ``bip_set`` is
            neither ``True`` nor ``False``
        :raise AssertionError: raise an error if shapes of the probability
            matrix and the matrix with the number of :math:`\\Lambda`-motifs
            are not equal
        """
        # A plain truthiness test can never reach an "unsupported" branch, so
        # the documented NameError is enforced with an explicit type check.
        if not isinstance(bip_set, (bool, np.bool_)):
            errmsg = "'" + str(bip_set) + "' " + 'not supported.'
            raise NameError(errmsg)

        # number of trials of the Binomial distribution used below
        m = self.num_columns if bip_set else self.num_rows

        n = nlam_mat.shape[0]
        pval_mat = np.zeros(nlam_mat.shape)

        if bip_set != self.const_set:
            # every node pair shares one Poisson Binomial distribution, which
            # is built from the diagonal of the probability matrix
            pb = PoiBin(plam_mat[np.diag_indices_from(plam_mat)])
            for i in range(n):
                pval_mat[i, i + 1:] = pb.pval(nlam_mat[i, i + 1:])
        else:
            # if the sets correspond, the matrix dimensions should be the same
            # (raised explicitly so the check survives ``python -O``)
            if plam_mat.shape[0] != nlam_mat.shape[0]:
                raise AssertionError(
                    'plam_mat and nlam_mat must have the same dimension.')
            # evaluate all pairs (i, j) with i < j in one vectorised call
            # instead of creating one frozen distribution per pair
            rows, cols = np.triu_indices(n, k=1)
            k = np.asarray(nlam_mat)[rows, cols]
            p = np.asarray(plam_mat)[rows, cols]
            pval_mat[rows, cols] = 1. - binom.cdf(k, m, p) \
                                   + binom.pmf(k, m, p)
        return pval_mat
예제 #2
0
 def pval_process_worker(self):
     """Consume jobs from the input queue and emit their p-values.

     Jobs are read from ``self.input_queue`` until the sentinel ``"STOP"``
     is received. For each job, a ``PoiBin`` is built from ``job[1]``, its
     ``pval`` is evaluated at ``int(job[2])``, and the pair
     ``(job[0], p-value)`` is put on ``self.output_queue``. When the
     sentinel arrives, a single ``"STOP"`` is forwarded to the output queue.
     """
     inbox = self.input_queue
     outbox = self.output_queue
     while True:
         job = inbox.get()
         # sentinel on the left, matching how ``iter(callable, sentinel)``
         # performs its comparison
         if "STOP" == job:
             break
         distribution = PoiBin(job[1])
         p_value = distribution.pval(int(job[2]))
         outbox.put((job[0], p_value))
     # tell the consumer of the output queue that this worker is finished
     outbox.put("STOP")
예제 #3
0
    def lambda_motifs(self, bip_set, parallel=True, filename=None,
            delim='\t', binary=True, num_chunks=4):
        """Calculate and save the p-values of the :math:`\\Lambda`-motifs.

        For each node couple in the bipartite layer specified by ``bip_set``,
        calculate the p-values of the corresponding :math:`\\Lambda`-motifs
        according to the link probabilities in the biadjacency matrix of the
        BiCM null model.

        The results can be saved either as a binary ``.npy`` or a
        human-readable ``.csv`` file, depending on ``binary``.

        .. note::

            * The total number of p-values that are calculated is split into
              ``num_chunks`` chunks, which are processed sequentially in order
              to avoid memory allocation errors. Note that a larger value of
              ``num_chunks`` will lead to less memory occupation, but comes at
              the cost of slower processing speed. Chunking is only applied
              if more than 100 p-values have to be calculated.

            * The output consists of a one-dimensional array of p-values. If
              the bipartite layer ``bip_set`` contains ``n`` nodes, this means
              that the array will contain :math:`\\binom{n}{2}` entries. The
              indices ``(i, j)`` of the nodes corresponding to entry ``k`` in
              the array can be reconstructed using the method
              :func:`BiCM.flat2_triumat_idx`. The number of nodes ``n``
              can be recovered from the length of the array with
              :func:`BiCM.flat2_triumat_dim`

            * If ``binary == False``, the ``filename`` should end with
              ``.csv``. If ``binary == True``, it will be saved in binary NumPy
              ``.npy`` format and the suffix ``.npy`` will be appended
              automatically. By default, the file is saved in binary format.

            * If ``filename`` is ``None``, the name ``p_values_<bip_set>`` is
              used, with ``.csv`` appended when ``binary`` is ``False``.

            * The machine epsilon is added to every p-value before saving.

            * ``parallel`` is accepted for interface compatibility but is not
              read inside this method.

        :param bip_set: select row-nodes (``True``) or column-nodes (``False``)
        :type bip_set: bool
        :param parallel: select whether the calculation of the p-values should
            be run in parallel (``True``) or not (``False``)
        :type parallel: bool
        :param filename: name of the output file
        :type filename: str
        :param delim: delimiter between entries in the ``.csv`` file, default
            is ``\\t``
        :type delim: str
        :param binary: if ``True``, the file will be saved in the binary
            NumPy format ``.npy``, otherwise as ``.csv``
        :type binary: bool
        :param num_chunks: number of chunks of p-value calculations that are
            performed sequentially
        :type num_chunks: int
        :raise NameError: raise an error if the parameter ``bip_set`` is
            neither ``True`` nor ``False``
        """
        if not isinstance(bip_set, bool):
            errmsg = "'" + str(bip_set) + "' " + 'not supported.'
            raise NameError(errmsg)
        if bip_set:
            biad_mat = self.adj_matrix
            bin_mat = self.bin_mat
        else:
            # work on the transposed matrices for the opposite layer
            biad_mat = np.transpose(self.adj_matrix)
            bin_mat = np.transpose(self.bin_mat)

        # n is the number of p-values, i.e. the length of the output array
        n = self.get_triup_dim(bip_set)
        # -0.1 is not a valid p-value; it appears to serve as a placeholder
        # that makes entries which were never computed recognisable
        pval = np.full((n, ), -0.1, dtype=float)

        # handle layers of dimension 2 separately
        if n == 1:
            nlam = np.dot(bin_mat[0, :], bin_mat[1, :].T)
            plam = biad_mat[0, :] * biad_mat[1, :]
            pb = PoiBin(plam)
            pval[0] = pb.pval(nlam)
        else:
            # if the dimension of the network is too large, split the
            # calculations of the p-values in ``num_chunks`` intervals to
            # avoid memory allocation errors
            if n > 100:
                kk = self.split_range(n, m=num_chunks)
            else:
                kk = [0]
            # calculate p-values for all index intervals except the last
            for k1, k2 in zip(kk[:-1], kk[1:]):
                nlam = self.get_lambda_motif_block(bin_mat, k1, k2)
                plam = self.get_plambda_block(biad_mat, k1, k2)
                pv = self.get_pvalues_q(plam, nlam, k1, k2)
                pval[k1:k2] = pv
            # last interval
            k1 = kk[-1]
            k2 = n - 1
            nlam = self.get_lambda_motif_block(bin_mat, k1, k2)
            plam = self.get_plambda_block(biad_mat, k1, k2)
            # for the last entry we have to INCLUDE k2, thus k2 + 1
            pv = self.get_pvalues_q(plam, nlam, k1, k2 + 1)
            pval[k1:] = pv

        if filename is None:
            fname = 'p_values_' + str(bip_set)
            if not binary:
                fname += '.csv'
        else:
            fname = filename
        # account for machine precision (``np.float`` no longer exists in
        # current NumPy releases, the builtin ``float`` is equivalent):
        pval += np.finfo(float).eps
        self.save_array(pval, filename=fname, delim=delim,
                        binary=binary)