afmin = 3e-3 indfm = (af1 >= afmin) & (af1 <= 1 - afmin) & (af2 >= afmin) & (af2 <= 1 - afmin) nsites = len(np.unique(indfm.nonzero()[1])) # Estimate the template number mea = 0.5 * (af1[indfm] + af2[indfm]) var = ((af1[indfm] - af2[indfm]) / 2)**2 # In binomial sampling, the variance on k is var(k) = nx (1 - x), so # for the frequency var(k/n) = x (1 - x) / n n_all = mea * (1 - mea) / var # NOTE: pseudocounts that come from the F4 dilution estimate, so we # only listen to the data if there is enough data points to listen to len_pseudo = 1 n_pseudo = sample.get_n_templates_dilutions() n_allp = np.concatenate([n_all, ([n_pseudo] * len_pseudo)]) if VERBOSE >= 2: print 'Number of doubly polymorphic sites:', nsites, 'n_pseudo:', n_pseudo # NOTE: the estimate of n has a bad distribution because some points are # exactly on the diagonal, so we average the inverse (which is well # behaved) and also take the medians as alternatives n = 1.0 / (1.0 / n_allp).mean() ninv = n_allp.mean() nmed = np.median(n_allp) if VERBOSE >= 2: print fr1, fr2, n, ninv, nmed key = (samplename, fr1, fr2)