示例#1
0
    def get_required_coverage(self, M=0.01):
        r"""Return the fold coverage required to leave a fraction M uncovered.

        A general question is: what should the fold coverage be to make sure
        that e.g. E=99% of the genome is covered by at least one read?

        The fraction of the genome covered at fold coverage :math:`c` is
        :math:`E = 1 - e^{-c}`, so the answer is:

        .. math:: c = \log\left(\frac{-1}{E - 1}\right)

        This equation is correct but is limited by floating-point precision.
        If one provides E=0.99, the answer is 4.6, but we are limited to a
        maximum coverage of about 36, reached with E=0.9999999999999999,
        after which E is rounded to 1 on most computers. Besides, it is not
        convenient to enter all those digits. A scientific notation is better
        but requires working with :math:`M = 1 - E` instead of :math:`E`:

        .. math:: c = \log\left(\frac{1}{M}\right)

        So instead of asking what fold coverage is required to have 99% of
        the genome covered, we ask what fold coverage is required to have 1%
        of the genome not covered. This allows :math:`M` values as low as
        1e-300, that is a fold coverage as high as 690.

        :param M: the fraction of the genome not covered by any read
            (e.g. 0.01 for 1%). Either a scalar or an array-like of values.
            Python scalars (int or float) must satisfy ``0 <= M < 1``;
            array-like inputs are converted with :func:`numpy.array` and are
            not range-checked.
        :return: the required fold coverage (same shape as M). A scalar M
            equal to 0 returns infinity: no finite coverage guarantees that
            nothing is left uncovered.
        :raises AssertionError: if a Python scalar M is outside ``[0, 1)``.

        .. plot::

            import numpy as np
            import pylab
            from sequana import Coverage
            cover = Coverage()
            misses = np.array([1e-1, 1e-2, 1e-3, 1e-4,1e-5,1e-6])
            required_coverage = cover.get_required_coverage(misses)
            pylab.semilogx(misses, required_coverage, 'o-')
            pylab.ylabel("Required coverage", fontsize=16)
            pylab.xlabel("Uncovered genome", fontsize=16)
            pylab.grid()

        """
        # What should the fold coverage be to have 99% of the genome sequenced?
        # It is the same question as solving 1 - e^{-(NL/G)} == 0.99, which
        # gives NL/G = 4.6.
        if isinstance(M, (int, float)):
            # Raised explicitly (rather than with an ``assert`` statement) so
            # that the check still runs under ``python -O``; the exception
            # type is kept as AssertionError for backward compatibility.
            if not 0 <= M < 1:
                raise AssertionError(
                    "M must satisfy 0 <= M < 1, got {}".format(M)
                )
            if M == 0:
                # Python scalar division by zero raises ZeroDivisionError,
                # whereas the array path below yields inf; return the limit
                # explicitly so that both paths agree.
                return np.float64(np.inf)
        else:
            M = np.array(M)
        # We work with M = 1 - E, i.e. log(1 / M) instead of log(-1 / (E - 1)),
        # so that values down to 1e-300 can be used (0.9999999999999999 is
        # already the closest float to 1 when expressed in terms of E).
        return np.log(1.0 / M)
示例#2
0
# source
# http://nbviewer.ipython.org/github/tritemio/notebooks/blob/master/Mixture_Model_Fitting.ipynb

from easydev import DevTools, AttrDict
# Module-level DevTools instance, created at import time.
devtools = DevTools()

# numpy and pylab are obtained through sequana.lazy
# (presumably deferred imports -- TODO confirm against sequana.lazy).
from sequana.lazy import numpy as np
from sequana.lazy import pylab

from . import criteria

# NOTE(review): this rebinds ``np``, replacing the object imported from
# sequana.lazy above with the regular numpy module.
import numpy as np

# Precomputed constant 0.5 * log(2 * pi), evaluated once at import time.
half_log_two_pi = 0.5 * np.log(2 * np.pi)

# Names exported by ``from <module> import *``.
__all__ = ["Fitting", "GaussianMixtureModel", "GaussianMixtureFitting", "EM"]


class GaussianMixtureModel(object):
    """Gaussian Mixture Model

    .. plot::

        from sequana import mixture
        from pylab import plot, linspace
        m = mixture.GaussianMixtureModel(k=2)
        X = linspace(0,10,100)
        plot(X, [m.pdf(x, params=[1, 0.5, 0.2, 4, 0.5, 0.8]) for x in X])

    """
    def __init__(self, k=2):