def get_required_coverage(self, M=0.01): """Return the required coverage to ensure the genome is covered A general question is what should be the coverage to make sure that e.g. E=99% of the genome is covered by at least a read. The answer is: .. math:: \log^{-1/(E-1)} This equation is correct but have a limitation due to floating precision. If one provides E=0.99, the answer is 4.6 but we are limited to a maximum coverage of about 36 when one provides E=0.9999999999999999 after which E is rounded to 1 on most computers. Besides, it is no convenient to enter all those numbers. A scientific notation would be better but requires to work with :math:`M=1-E` instead of :math:`E`. .. math:: \log^{-1/ - M} So instead of asking the question what is the requested fold coverage to have 99% of the genome covered, we ask the question what is the requested fold coverage to have 1% of the genome not covered. This allows us to use :math:`M` values as low as 1e-300 that is a fold coverage as high as 690. :param float M: this is the fraction of the genome not covered by any reads (e.g. 0.01 for 1%). See note above. :return: the required fold coverage .. plot:: import pylab from sequana import Coverage cover = Coverage() misses = np.array([1e-1, 1e-2, 1e-3, 1e-4,1e-5,1e-6]) required_coverage = cover.get_required_coverage(misses) pylab.semilogx(misses, required_coverage, 'o-') pylab.ylabel("Required coverage", fontsize=16) pylab.xlabel("Uncovered genome", fontsize=16) pylab.grid() # The inverse equation is required fold coverage = [log(-1/(E - 1))] """ # What should be the fold coverage to have 99% of the genome sequenced ? # It is the same question as equating 1-e^{-(NL/G}) == 0.99, we need NL/G = 4.6 if isinstance(M, float) or isinstance(M, int): assert M < 1 assert M >= 0 else: M = np.array(M) # Here we do not use log(-1/(E-1)) but log(-1/(1-E-1)) to allow # for using float down to 1e-300 since 0.999999999999999 == 1 return np.log(-1 / (-M))
# source # http://nbviewer.ipython.org/github/tritemio/notebooks/blob/master/Mixture_Model_Fitting.ipynb from easydev import DevTools, AttrDict devtools = DevTools() from sequana.lazy import numpy as np from sequana.lazy import pylab from . import criteria import numpy as np half_log_two_pi = 0.5 * np.log(2 * np.pi) __all__ = ["Fitting", "GaussianMixtureModel", "GaussianMixtureFitting", "EM"] class GaussianMixtureModel(object): """Gaussian Mixture Model .. plot:: from sequana import mixture from pylab import plot, linspace m = mixture.GaussianMixtureModel(k=2) X = linspace(0,10,100) plot(X, [m.pdf(x, params=[1, 0.5, 0.2, 4, 0.5, 0.8]) for x in X]) """ def __init__(self, k=2):