def set_fast_stats(self):
    """Compute coverage and indel statistics from the alignments.

    Populates three attributes:

    - :attr:`coverage`: numpy array of per-base coverage built from each
      alignment's [reference_start, reference_end) interval.
    - :attr:`insertions`: lengths of all insertions (CIGAR operation 1).
    - :attr:`deletions`: lengths of all deletions (CIGAR operation 2).
    """
    # Lazily build the alignments. The original wrapped the attribute
    # access in a bare try/except, which also swallowed unrelated errors
    # (including KeyboardInterrupt); hasattr is the targeted check.
    if not hasattr(self, "alignments"):
        self._set_alignments()

    starts = [aln.reference_start for aln in self.alignments]
    ends = [aln.reference_end for aln in self.alignments]

    # reference_end is None for unmapped reads; they define no interval
    N = max(end for end in ends if end)
    self.coverage = np.zeros(N)
    for x, y in zip(starts, ends):
        # skip unmapped reads (y is None) and invalid negative coordinates
        if y and x >= 0 and y >= 0:
            self.coverage[x:y] += 1

    self.insertions = []
    self.deletions = []
    for aln in self.alignments:
        cigar = aln.cigarstring
        if not cigar:
            continue
        # cigartuples stores (operation, length); op 1 = insertion, 2 = deletion
        if "I" in cigar:
            self.insertions.extend(t[1] for t in aln.cigartuples if t[0] == 1)
        if "D" in cigar:
            self.deletions.extend(t[1] for t in aln.cigartuples if t[0] == 2)
def _get_stats(self): filenames, mode = self._get_files("*.json") cols = ["A", "C", "G", "T", "N", "n_reads", "mean quality" , "GC content", "average read length", "total bases"] N = len(filenames) df = pd.DataFrame(np.zeros((N, 10)), columns=cols) indices = [] for i, filename in enumerate(filenames): if self.tag_R1 in filename: index = "R1" else: index = "R2" if "unmapped" in filename: index += ".unmapped" else: index += ".mapped" indices.append(index) try: # Use a try since the subdf may be empty subdf = pd.read_json(filename) df.iloc[i] = subdf.iloc[0] df.iloc[i]["A"] /= df.iloc[i]["n_reads"] df.iloc[i]["C"] /= df.iloc[i]["n_reads"] df.iloc[i]["G"] /= df.iloc[i]["n_reads"] df.iloc[i]["T"] /= df.iloc[i]["n_reads"] df.iloc[i]["N"] /= df.iloc[i]["n_reads"] except: pass df.index = indices df = df.astype({"n_reads": np.int64, "total bases": np.int64}) return df
def _init_cumul_nuc(self): """Cumulative of nucleotide count along genome (init from first window)""" # ATGC (index stored in self._dict_nuc) cumul = np.zeros((4, (self.__len__() + self._window))) for j in range(self._window): nuc = self.sequence[j] if nuc in self._dict_nuc: cumul[self._dict_nuc[nuc]][j] += 1 self._cumul = cumul
def _init_cumul_nuc(self): """Cumulative of nucleotide count along genome (init from first window)""" # ATGC (index stored in self._dict_nuc) cumul = np.zeros((4,(self.__len__()+self._window) )) for j in range(self._window): nuc = self.sequence[j] if nuc in self._dict_nuc: cumul[self._dict_nuc[nuc]][j] += 1 self._cumul = cumul
def _set_coverage(self): try: self.alignments except: self._set_alignments() reference_start = [this.reference_start for this in self.alignments] reference_end = [this.reference_end for this in self.alignments] N = max([this for this in reference_end if this]) self.coverage = np.zeros(N) for x, y in zip(reference_start, reference_end): if y and x>=0 and y>=0: self.coverage[x:y] += 1 else: pass
def get_actg_content(self, max_sample=500000):
    """Return the per-position ACGTN frequencies of the reads.

    :param int max_sample: maximum number of alignments used in the
        computation. The original implementation accepted but silently
        ignored this parameter; it is now honoured, bounding the work on
        very large inputs. Results are unchanged whenever the number of
        alignments is below this limit (the default is generous).
    :return: dataframe with columns A, C, G, T, N and one row per read
        position; each row is normalised to sum to 1 (NaN where no read
        covers the position).
    """
    import re
    from itertools import islice

    if not hasattr(self, "alignments"):
        self._set_alignments()

    # restrict to at most max_sample alignments (fixes the previously
    # unused parameter); materialise once so we can iterate twice
    alignments = list(islice(self.alignments, max_sample))

    # the longest read defines the number of rows
    max_length = max(len(a.seq) for a in alignments)

    # one counter array per base, filled by scanning each read once
    counts = {base: np.zeros(max_length) for base in "ACGTN"}
    for a in alignments:
        for base, accumulator in counts.items():
            positions = [m.start() for m in re.finditer(base, a.seq)]
            accumulator[positions] += 1

    df = pd.DataFrame(counts, columns=["A", "C", "G", "T", "N"])
    # convert raw counts to per-position frequencies
    df = df.divide(df.sum(axis=1), axis=0)
    return df
def get_coverage(self, reference_length=None):
    """Compute per-base coverage from the stored reads.

    :param reference_length: length of the returned coverage vector; if
        not provided (or falsy), the largest observed reference_end is
        used instead.
    :return: numpy array of coverage counts, one entry per base.
    """
    self.reset()
    starts = [read.reference_start for read in self.data]
    self.reset()
    ends = [read.reference_end for read in self.data]

    # reference_end is None for unmapped reads; exclude them from the max
    size = reference_length if reference_length else max(e for e in ends if e)

    coverage = np.zeros(size)
    for begin, stop in zip(starts, ends):
        # only well-formed intervals contribute
        if stop and begin >= 0 and stop >= 0:
            coverage[begin:stop] += 1
    return coverage
def get_coverage(self, reference_length=None):
    """Return the coverage depth along the reference.

    :param reference_length: explicit length for the output vector; when
        omitted (or falsy) the maximum valid reference_end found in the
        data is used.
    :return: numpy array where entry *i* counts the reads spanning base *i*.
    """
    self.reset()
    interval_starts = [aln.reference_start for aln in self.data]
    self.reset()
    interval_ends = [aln.reference_end for aln in self.data]

    if reference_length:
        length = reference_length
    else:
        # None/zero ends belong to unmapped reads and are discarded
        valid_ends = [end for end in interval_ends if end]
        length = max(valid_ends)

    depth = np.zeros(length)
    for left, right in zip(interval_starts, interval_ends):
        skip = not right or left < 0 or right < 0
        if not skip:
            depth[left:right] += 1
    return depth
def _get_stats(self): filenames, mode = self._get_files("*.json") cols = [ "A", "C", "G", "T", "N", "n_reads", "mean quality", "GC content", "average read length", "total bases" ] N = len(filenames) df = pd.DataFrame(np.zeros((N, 10)), columns=cols) indices = [] for i, filename in enumerate(filenames): if self.tag_R1 in filename: index = "R1" else: index = "R2" if "unmapped" in filename: index += ".unmapped" else: index += ".mapped" indices.append(index) try: # Use a try since the subdf may be empty subdf = pd.read_json(filename) df.iloc[i] = subdf.iloc[0] df.iloc[i]["A"] /= df.iloc[i]["n_reads"] df.iloc[i]["C"] /= df.iloc[i]["n_reads"] df.iloc[i]["G"] /= df.iloc[i]["n_reads"] df.iloc[i]["T"] /= df.iloc[i]["n_reads"] df.iloc[i]["N"] /= df.iloc[i]["n_reads"] except: pass df.index = indices df = df.astype({"n_reads": np.int64, "total bases": np.int64}) return df
def estimate(self, guess=None, k=2):
    """Fit a mixture of *k* Gaussians to the data with the EM algorithm.

    :param list guess: a list to provide the initial guess. Order is
        mu1, sigma1, pi1, mu2, ...  If None, the initial guess is derived
        from the data via :meth:`get_guess`.
    :param int k: number of models to be used.

    On success, :attr:`results` is an AttrDict holding the flattened
    parameters (``x``), per-component ``mus``/``sigmas``/``pis`` and the
    ``log_likelihood``/``AIC``/``AICc``/``BIC`` criteria.
    """
    #print("EM estimation")
    self.k = k
    # Initial guess of parameters and initializations
    if guess is None:
        # estimate the mu/sigma/pis from the data
        guess = self.get_guess()
    mu = np.array(guess[0::3])    # component means
    sig = np.array(guess[1::3])   # component standard deviations
    pi_ = np.array(guess[2::3])   # mixing weights
    N_ = len(pi_)
    # gamma[k, i] is the responsibility of component k for data point i
    gamma = np.zeros((N_, int(self.size)))
    N_ = np.zeros(N_)             # effective number of points per component
    p_new = guess

    # EM loop
    counter = 0
    converged = False
    self.mus = []
    import scipy.stats as ss
    while not converged:
        # Compute the responsibility func. and new parameters
        # NOTE(review): the loop variable below shadows the `k` parameter;
        # after the loop k == self.k - 1 (see the AIC note further down).
        for k in range(0, self.k):
            # unstable if eslf.model.pdf is made of zeros
            #self.model.pdf(self.data, p_new,normalise=False).sum()!=0:
            # E-step: responsibilities of component k for every data point
            gamma[k, :] = pi_[k] * ss.norm.pdf(self.data, mu[k], sig[k])
            gamma[k, :] /= (self.model.pdf(self.data, p_new, normalise=False))
            """else: gamma[k, :] = pi_[k]*pylab.normpdf(self.data, mu[k], sig[k])/(self.model.pdf(self.data, p_new, normalise=False)+1e-6) """
            # M-step: re-estimate weight, mean and sigma of component k
            N_[k] = gamma[k].sum()
            mu[k] = np.sum(gamma[k] * self.data) / N_[k]
            sig[k] = pylab.sqrt(
                np.sum(gamma[k] * (self.data - mu[k])**2) / N_[k])
            pi_[k] = N_[k] / self.size
        self.results = {'x': p_new, 'nfev': counter, 'success': converged}
        # flatten the updated parameters back into [mu1, sig1, pi1, mu2, ...]
        p_new = []
        for this in range(self.k):
            p_new.extend([mu[this], sig[this], pi_[this]])
        #p_new = [(mu[x], sig[x], pi_[x]) for x in range(0, self.k)]
        #p_new = list(pylab.flatten(p_new))
        self.status = True
        # sanity checks: responsibilities must sum to the sample size and
        # the mixing weights to 1; abort the loop otherwise
        try:
            assert abs(N_.sum() - self.size) / self.size < 1e-6
            assert abs(pi_.sum() - 1) < 1e-6
        except:
            print("issue arised at iteration %s" % counter)
            self.debug = {'N': N_, 'pis': pi_}
            self.status = False
            break
        # NOTE(review): `mu` is mutated in place every iteration, so all
        # entries of self.mus reference the same array — confirm intended.
        self.mus.append(mu)
        # Convergence check
        counter += 1
        converged = counter >= self.max_iter
    self.gamma = gamma
    if self.status is True:
        self.results = {'x': p_new, 'nfev': counter, 'success': converged}
    self.results = AttrDict(**self.results)
    self.results.mus = self.results.x[0::3]
    self.results.sigmas = self.results.x[1::3]
    self.results.pis = self.results.x[2::3]
    log_likelihood = self.model.log_likelihood(self.results.x, self.data)
    # NOTE(review): this first AIC uses the shadowed loop variable k
    # (== self.k - 1, not the requested k) and is overwritten two lines
    # below; it appears to be dead/buggy code left in place.
    self.results.AIC = criteria.AIC(log_likelihood, k, logL=True)
    self.results.log_likelihood = log_likelihood
    self.results.AIC = criteria.AIC(log_likelihood, self.k, logL=True)
    self.results.AICc = criteria.AICc(log_likelihood, self.k,
                                      self.data.size, logL=True)
    self.results.BIC = criteria.BIC(log_likelihood, self.k,
                                    self.data.size, logL=True)
def compute_zscore(self, k=2, step=10, use_em=True, verbose=True):
    """ Compute zscore of coverage and normalized coverage.

    :param int k: Number gaussian predicted in mixture (default = 2)
    :param int step: (default = 10). This parameter is used to speed up
        computation and is ignored if the length of the coverage/sequence
        is below 100,000
    :param bool use_em: fit the mixture with the EM algorithm; otherwise
        use GaussianMixtureFitting.
    :param bool verbose: not used in the body — kept for backward
        compatibility (NOTE(review): confirm before removing).

    Store the results in the :attr:`df` attribute (dataframe) with a
    column named *zscore*.

    .. note:: needs to call :meth:`running_median` before hand.
    """
    # here for lazy import
    from biokit.stats import mixture
    # normalize coverage
    self._coverage_scaling()
    data = self.df['scale'][self.range[0]:self.range[1]]
    # subsampling is only worthwhile on long sequences
    if len(data) < 100000:
        step = 1
    # remove nan and inf values
    # (zeros are first mapped to NaN so dropna discards them as well)
    data = data.replace(0, np.nan)
    data = data.dropna()
    if data.empty:
        # degenerate case: nothing left after filtering; fall back to a
        # constant scale of 1 so the mixture fit below does not fail
        data = np.full(len(self.df), 1, dtype=int)
        self.df['scale'] = data
    if use_em:
        self.mixture_fitting = mixture.EM(data[::step])
        self.mixture_fitting.estimate(k=k)
    else:
        self.mixture_fitting = mixture.GaussianMixtureFitting(
            data[::step], k=k)
        self.mixture_fitting.estimate()
    # keep gaussians informations
    self.gaussians = self.mixture_fitting.results
    # build one {"mu": .., "sigma": .., "pi": ..} dict per component
    # (key[:-1] strips the plural 's' from "mus"/"sigmas"/"pis")
    params_key = ("mus", "sigmas", "pis")
    self.gaussians_params = [{key[:-1]: self.gaussians[key][i]
                              for key in params_key} for i in range(k)]
    self.best_gaussian = self._get_best_gaussian()
    # warning when sigma is equal to 0
    if self.best_gaussian["sigma"] == 0:
        logger.warning("A problem related to gaussian prediction is "
                       "detected. Be careful, Sigma is equal to 0.")
        # zscore cannot be computed with sigma == 0; store zeros instead
        self.df["zscore"] = np.zeros(len(self.df), dtype=int)
    else:
        self.df["zscore"] = (self.df["scale"] - self.best_gaussian["mu"]) / \
            self.best_gaussian["sigma"]
    # Naive check (only meaningful for k == 2) that the two fitted
    # Gaussians are well separated; warn the user otherwise.
    if k == 2:
        mus = self.gaussians['mus']
        sigmas = self.gaussians["sigmas"]
        # identify which component was selected as the best gaussian;
        # mu0/s0 belong to the best one, mu1 to the other
        index0 = mus.index(self.best_gaussian["mu"])
        if index0 == 0:
            mu1 = mus[1]
            s0 = sigmas[0]
            mu0 = mus[0]
        else:
            mu1 = mus[0]
            s0 = sigmas[1]
            mu0 = mus[1]
        if abs(mu0-mu1) < s0:
            logger.warning(("Warning: k=2 but note that |mu0-mu1| < sigma0. "
                            "k=1 could be a better choice"))