def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40, xlabel="Frame", ylabel="#", bar_width=0.35):
    """Draw paired bars with the number of ORF and CDS found in each frame.

    The ORF/CDS table is computed on demand (``_find_ORF_CDS``) when
    ``_ORF_pos`` is still ``None``.

    :param float alpha: transparency of the bars
    :param int bins: not used by this method
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param float bar_width: width of each bar; the ORF and CDS bars are
        shifted by half of it on each side of the frame value
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    frames = [-3, -2, -1, 1, 2, 3]

    def count_by_frame(column):
        # number of non-missing entries of *column* for each frame
        return [
            self._ORF_pos[self._ORF_pos["frame"] == frame][column].dropna().shape[0]
            for frame in frames
        ]

    nb_res_ORF = count_by_frame("len_ORF")
    nb_res_CDS = count_by_frame("len_CDS")

    half_width = bar_width / 2
    pylab.bar(
        np.array(frames) - half_width,
        nb_res_ORF,
        bar_width,
        alpha=alpha,
        label="ORF N = %d" % sum(nb_res_ORF),
    )
    pylab.bar(
        np.array(frames) + half_width,
        nb_res_CDS,
        bar_width,
        alpha=alpha,
        label="CDS N = %d" % sum(nb_res_CDS),
    )
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def pdf(self, x, params, normalise=True):
    """Evaluate a mixture of gaussian densities at *x*.

    :param x: value(s) at which the density is evaluated
    :param params: flat sequence of gaussian parameters ordered as
        mu, sigma, pi, mu2, sigma2, pi2, ... Its length must be a
        multiple of 3 and at least ``3 * self.k``.
    :param bool normalise: if True, the weights (pi) are rescaled so that
        they sum to 1 before being used.
    :return: the weighted sum of the gaussian densities, or 0 if any
        weight is negative. Components with a zero sigma are skipped.

    As a side effect, :attr:`k` is set to the number of components found
    in *params*.
    """
    assert divmod(len(params), 3)[1] == 0
    assert len(params) >= 3 * self.k

    # Integer division: the number of components is a count, not a float.
    k = len(params) // 3
    self.k = k

    # Force a float array so that the in-place normalisation below also
    # works when the weights are provided as integers.
    pis = np.array(params[2::3], dtype=float)

    if (pis < 0).any():
        return 0

    if normalise is True:
        # !!! sum pi must equal 1 otherwise may diverge badly
        pis /= pis.sum()

    import scipy.stats as ss

    data = 0
    for i in range(k):
        mu, sigma = params[i * 3:i * 3 + 2]
        if sigma != 0:
            data += pis[i] * ss.norm.pdf(x, mu, sigma)
    return data
def get_percentage_genes_covered_at_this_fraction(self, this):
    """Return the percentage of genes whose coverage exceeds *this*.

    The percentage of rows of :attr:`df` whose ``coverage_column`` value is
    strictly greater than a threshold is computed on a regular grid of
    101 thresholds between 0 and 1; the value at *this* is then linearly
    interpolated on that grid.

    :param float this: coverage fraction, between 0 and 1 (inclusive)
    :return: interpolated percentage (0-100) of genes above the threshold
    """
    assert this <= 1 and this >= 0

    coverage = self.df[self.coverage_column]
    N = float(len(self.df))

    X = np.linspace(0, 1, 101)
    # vectorised count per threshold; int() keeps plain Python division so
    # that an empty table still raises ZeroDivisionError as before
    Y = np.array([int((coverage > x).sum()) / N * 100 for x in X])
    return np.interp(this, X, Y)
def moving_average(self, n, circular=False):
    """Compute moving average of the genome coverage

    :param n: window's size. Must be odd
    :param bool circular: is the chromosome circular or not

    Store the results in the :attr:`df` attribute (dataframe) with a
    column named *ma*. In the circular case, the padded coverage and the
    moving average are also stored in :attr:`data` and :attr:`ma`.
    """
    N = len(self.df['cov'])
    assert n < N/2

    if circular:
        # Imported only here: the linear case does not need the helper.
        from sequana.stats import moving_average

        # FIXME: shift of +-1 as compared to non circular case...
        # shift the data and compute the moving average
        coverage = self.df['cov'].values
        self.data = list(coverage[N-n:]) + list(coverage) + list(coverage[0:n])
        ma = moving_average(self.data, n)
        self.ma = ma[n//2+1:-n//2]
        self.df["ma"] = pd.Series(self.ma, index=self.df['cov'].index)
        return

    # Linear case: windowed sums obtained from differences of the
    # cumulative sum.
    ret = np.cumsum(np.array(self.df["cov"]), dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    ma = ret[n - 1:] / n
    # centre each window on its middle position; positions without a
    # complete window are left as NaN by the index alignment
    mid = int(n / 2)
    self.df["ma"] = pd.Series(ma, index=np.arange(start=mid, stop=(len(ma) + mid)))
def plot_bar(self, spikes_filename=None, ratio=100):
    """Bar plot of the spikes found, overlaid with the scaled SIRV lengths.

    :param spikes_filename: forwarded to :meth:`spikes_found`
    :param ratio: the lengths taken from ``SIRV_lengths`` are divided by
        this value before being drawn as a line
    :return: the object returned by :meth:`spikes_found`
    """
    data = self.spikes_found(spikes_filename)

    # one length per entry of the index, in the same order as the bars
    scaled_lengths = np.array([self.SIRV_lengths[name] for name in data.index]) / ratio

    data.plot(kind="bar")
    pylab.plot(scaled_lengths)
    pylab.tight_layout()
    return data
def boxplot_quality(self, color_line='r', bgcolor='grey', color='yellow', lw=4, hold=False, ax=None):
    """Plot the mean quality and its standard deviation along the reads.

    The counts are read from the columns ``'0'`` to ``'41'`` of :attr:`df`
    (one row per position) and normalised by ``metadata['ReadNum']``.
    Three coloured background bands are drawn for the quality ranges
    0-20, 20-30 and 30-41. As a side effect, :attr:`xmax` is set to 150.

    :param color_line: colour of the mean quality line
    :param bgcolor: not used by this method
    :param color: colour of the band showing mean +/- standard deviation
    :param lw: width of the mean quality line
    :param hold: not used by this method
    :param ax: if provided, matplotlib axes made current before plotting
    """
    # not sure why we have phred score from 0 to 41
    quality = self.df[[str(x) for x in range(42)]]
    N = self.metadata['ReadNum']

    self.xmax = 150
    xmax = self.xmax + 1
    if ax:
        pylab.sca(ax)  # pragma no cover
    pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.3)
    pylab.fill_between([0, xmax], [20, 20], [30, 30], color='orange', alpha=0.3)
    pylab.fill_between([0, xmax], [30, 30], [41, 41], color='green', alpha=0.3)

    X = []
    Q = []
    S = []
    for pos in range(1, self.xmax + 1):
        counts = quality.loc[pos]
        # NOTE(review): the mean weights each count by (score + 1) whereas
        # the standard deviation below uses the scores 0..41 -- confirm
        # which offset is intended.
        mean_quality = sum((int(k) + 1) * v for k, v in counts.items()) / N
        proba = counts / N
        std = pylab.sqrt(
            sum([(x - mean_quality)**2 * y for x, y in zip(range(42), proba)]))
        X.append(pos)
        Q.append(mean_quality)
        S.append(std)

    Q = np.array(Q)
    X = np.array(X)
    S = np.array(S)
    pylab.fill_between(X, Q+S, Q-S, color=color, interpolate=False)
    pylab.plot(X, Q, color=color_line, lw=lw)
    pylab.ylim([0, 41])
    pylab.xlim([0, self.xmax+1])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Quality")
    pylab.grid(axis='x')
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40, xlabel="Frame", ylabel="#", bar_width=0.35):
    """Bar plot of the number of ORF and CDS found in each reading frame.

    If ``_ORF_pos`` is ``None``, ``_find_ORF_CDS`` is called first to
    populate it.

    :param float alpha: transparency of the bars
    :param int bins: not used by this method
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param float bar_width: width of the bars; ORF bars are drawn half a
        width to the left of the frame value and CDS bars half a width to
        the right
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    # number of ORF and CDS found by frame
    frames = [-3, -2, -1, 1, 2, 3]
    table = self._ORF_pos
    nb_res_ORF = []
    nb_res_CDS = []
    for frame in frames:
        in_frame = table["frame"] == frame
        nb_res_ORF.append(table[in_frame]["len_ORF"].dropna().shape[0])
        nb_res_CDS.append(table[in_frame]["len_CDS"].dropna().shape[0])

    centers = np.array(frames)
    shift = bar_width / 2
    series = (
        (centers - shift, nb_res_ORF, "ORF N = %d"),
        (centers + shift, nb_res_CDS, "CDS N = %d"),
    )
    for positions, counts, label_format in series:
        pylab.bar(positions, counts, bar_width, alpha=alpha,
                  label=label_format % sum(counts))

    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def get_required_coverage(self, M=0.01):
    r"""Return the required coverage to ensure the genome is covered

    A general question is what should be the coverage to make sure
    that e.g. E=99% of the genome is covered by at least a read.

    The answer is:

    .. math:: \log\left(\frac{-1}{E-1}\right)

    This equation is correct but has a limitation due to floating
    precision. If one provides E=0.99, the answer is 4.6 but we are
    limited to a maximum coverage of about 36 when one provides
    E=0.9999999999999999 after which E is rounded to 1 on most computers.
    Besides, it is not convenient to enter all those numbers. A
    scientific notation would be better but requires to work with
    :math:`M=1-E` instead of :math:`E`.

    .. math:: \log\left(\frac{-1}{-M}\right)

    So instead of asking the question what is the requested fold coverage
    to have 99% of the genome covered, we ask the question what is the
    requested fold coverage to have 1% of the genome not covered.
    This allows us to use :math:`M` values as low as 1e-300 that is a fold
    coverage as high as 690.

    :param float M: this is the fraction of the genome not covered by
        any reads (e.g. 0.01 for 1%). See note above. A scalar must
        satisfy ``0 <= M < 1``; any other input is converted to a numpy
        array and is not range-checked.
    :return: the required fold coverage

    .. plot::

        import numpy as np
        import pylab
        from sequana import Coverage
        cover = Coverage()
        misses = np.array([1e-1, 1e-2, 1e-3, 1e-4,1e-5,1e-6])
        required_coverage = cover.get_required_coverage(misses)
        pylab.semilogx(misses, required_coverage, 'o-')
        pylab.ylabel("Required coverage", fontsize=16)
        pylab.xlabel("Uncovered genome", fontsize=16)
        pylab.grid()

    # The inverse equation is required fold coverage = [log(-1/(E - 1))]
    """
    # What should be the fold coverage to have 99% of the genome sequenced ?
    # It is the same question as equating 1-e^{-(NL/G}) == 0.99, we need NL/G = 4.6
    if isinstance(M, (float, int)):
        assert M < 1
        assert M >= 0
    else:
        M = np.array(M)
    # Here we do not use log(-1/(E-1)) but log(-1/(1-E-1)) to allow
    # for using float down to 1e-300 since 0.999999999999999 == 1
    return np.log(-1 / (-M))
def plot(self, X=[0, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8, .9, .95, .99, .999, 1], fontsize=16, label=None):
    """Plot the percentage of genes (y-axis) whose coverage is strictly
    greater than each threshold in *X* (x-axis, shown in percent).

    :param X: coverage thresholds, as fractions
    :param fontsize: font size of the axis labels
    :param label: optional label attached to the curve

    Dashed red guide lines are drawn at 25, 50 and 75 on both axes.
    """
    coverage = self.df[self.coverage_column]
    N = float(len(self.df))

    X = np.array(X)
    Y = np.array([sum(coverage > threshold) / N * 100 for threshold in X])

    # only forward a label to matplotlib when one was requested
    extra = {} if label is None else {"label": label}
    pylab.plot(X * 100, Y, "o-", **extra)

    pylab.xlabel("Gene coverage (%)", fontsize=fontsize)
    pylab.ylabel("Percentage of genes covered", fontsize=fontsize)
    for level in (25, 50, 75):
        pylab.axhline(level, color="r", alpha=0.5, ls="--")
        pylab.axvline(level, color="r", alpha=0.5, ls="--")
def __init__(self, fold_changes=None, pvalues=None, color="auto",
             pvalue_threshold=0.05, fold_change_threshold=1):
    """.. rubric:: constructor

    :param list fold_changes: 1D array or list
    :param list pvalues: 1D array or list, same length as *fold_changes*
    :param color: stored in :attr:`color`
    :param pvalue_threshold: adds an horizontal dashed line at
        the threshold provided.
    :param fold_change_threshold: colors in grey the absolute fold
        changes below a given threshold.

    The inputs are stored as numpy arrays and gathered in :attr:`df`
    (columns *fold_change* and *pvalue*); ``_get_colors`` is called last.
    """
    self.fold_changes, self.pvalues = np.array(fold_changes), np.array(pvalues)

    self.color = color
    self.pvalue_threshold = pvalue_threshold
    self.fold_change_threshold = fold_change_threshold

    n_fold_changes = len(self.fold_changes)
    n_pvalues = len(self.pvalues)
    assert n_fold_changes == n_pvalues

    columns = {"fold_change": self.fold_changes, "pvalue": self.pvalues}
    self.df = pd.DataFrame(columns)

    self._get_colors()
def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
    """Grouped bar plot of the transposed :attr:`sirv` table.

    :param normalise: if true, each column is divided by ``N / max(N)``
    :param ncol: columns in the legend
    :param N: optional sizes used for the normalisation; by default the
        length of each item of :attr:`rawdata`
    :return: the (possibly normalised) dataframe that was plotted
    """
    sizes = [len(x) for x in self.rawdata] if N is None else N
    N = np.array(sizes)

    dd = pd.DataFrame(self.sirv).T
    if normalise:
        scaling = N / max(N)
        dd = dd / scaling
    dd.columns = self.labels

    dd.plot(kind="bar")
    pylab.xlabel("")
    pylab.legend(self.labels, ncol=ncol)
    pylab.tight_layout()
    return dd
def run(self):
    """Compute the shifted correlation of each chromosome.

    For each chromosome, the data are retrieved with :meth:`get_data`,
    passed to :meth:`scc`, and :meth:`cor` is evaluated for every shift
    between ``start / binning`` and ``stop / binning`` (inclusive).

    :return: ``None`` (after printing a message) if :attr:`df` is not set.
        Otherwise a tuple ``(results, df_avc)``: *results* maps each
        chromosome to a dictionary with keys ``data_length``, ``Y`` and
        ``X``; *df_avc* is a dataframe indexed by the real shifts holding,
        one column per chromosome, the correlation multiplied by the
        relative data length of that chromosome.

    The results and the weights are also stored in :attr:`results` and
    :attr:`weights`.
    """
    # 10% of the time in self.get_data and 90 in cor()
    if self.df is None:
        print("call read_align() method to read alignement file")
        return

    first = int(self.start / self.binning)
    last = int(self.stop / self.binning)

    # shifts are expressed in bins; Xreal holds the corresponding real
    # positions (bin index multiplied by the binning)
    shifts = range(first, last + 1, 1)
    Xreal = np.array(
        range(first * self.binning, (last + 1) * self.binning, self.binning))

    results = {}
    for chrom in self.chromosomes:
        data = self.get_data(chrom)
        n_data = len(data)
        self.scc(data)
        # shift correlation
        correlation = np.array([self.cor(shift) for shift in shifts])
        results[chrom] = {'data_length': n_data, 'Y': correlation, 'X': Xreal}

    # weighted average using the original length of the chromosomes
    lengths = np.array([results[chrom]['data_length'] for chrom in self.chromosomes])
    weights = lengths / sum(lengths)

    self.results = results
    self.weights = weights

    # now the weighted cross correlation
    weighted = [
        weight * results[chrom]['Y']
        for weight, chrom in zip(weights, self.chromosomes)
    ]
    df_avc = pd.DataFrame(weighted).T
    df_avc.index = Xreal

    return results, df_avc
def __init__(self, data, k=2, method='Nelder-Mead'):
    """.. rubric:: constructor

    :param list data: the data to fit; stored as a numpy array in
        :attr:`data`, its length being stored (as a float) in :attr:`size`
    :param int k: number of GMM to use
    :param str method: minimization method to be used (one of scipy
        optimise module); stored in :attr:`method`

    """
    self.data = np.array(data)
    self.size = float(len(self.data))
    self._k = k
    self._model = None
    # Keep the requested minimisation method: it used to be silently
    # dropped although it is part of the signature.
    self.method = method
    # initialise the model
    self.k = k
    self.verbose = True
def plot_contig_length_vs_nreads(self, fontsize=16):
    """Log-log plot of the number of reads as a function of contig length.

    A least-square line, fitted on the contigs having more than 10 reads
    and a length above 100000, is overlaid in red between the smallest
    and largest contig lengths.

    :param fontsize: font size of the axis labels
    """
    # same as plot_scatter_contig_length_nread_cov
    if self._df is None:
        _ = self.get_df()
    pylab.clf()
    df = self._df

    m1 = df.length.min()
    M1 = df.length.max()
    pylab.loglog(df.length, df.nread, "o")
    pylab.xlabel("Contig length", fontsize=fontsize)
    pylab.ylabel("Contig N reads", fontsize=fontsize)
    pylab.grid()

    # least square on the selected contigs (query evaluated once)
    selection = df.query("nread>10 and length>100000")
    X = selection['length']
    Y = selection['nread']
    A = np.vstack([X, np.ones(len(X))]).T
    # Series.as_matrix() was removed in pandas 1.0; .values is available
    # in all versions.
    m, c = np.linalg.lstsq(A, Y.values)[0]

    x = np.array([m1, M1])
    pylab.plot(x, m * x + c, "o-r")
    pylab.tight_layout()
def generalized_anscombe(x, mu, sigma, gain=1.0):
    """Compute the generalized anscombe variance stabilizing transform

    Data should be a mixture of poisson and gaussian noise.

    The input signal x is assumed to follow the Poisson-Gaussian noise
    model::

        x = gain * p + n

    where gain is the camera gain and mu and sigma are the read noise
    mean and standard deviation.

    Values for which the argument of the square root is negative are
    clipped to zero. Biased for low counts.

    :param x: input data. Objects supporting arithmetic (numpy array,
        pandas Series/DataFrame, scalar) are used as is and not modified;
        anything else (e.g. a list) is first converted to a numpy array.
    :param mu: mean of the read noise
    :param sigma: standard deviation of the read noise
    :param gain: camera gain
    :return: the transformed data
    """
    def _transform(values):
        y = gain * values + (gain**2) * 3.0 / 8.0 + sigma**2 - gain * mu
        return (2.0 / gain) * np.sqrt(np.maximum(y, 0.0))

    try:
        # If a dataframe, we do not want to change it
        return _transform(x)
    except TypeError:
        # plain sequences do not support the arithmetic above
        return _transform(np.array(x))
def plot_scatter_contig_length_nread_cov(self, fontsize=16, vmin=0, vmax=50, min_nreads=20, min_length=50000):
    """Scatter plot of contig reads versus contig length, coloured by the
    *covStat* column, on log-log axes.

    A least-square line, fitted on the contigs having more than
    *min_nreads* reads and a length above *min_length*, is overlaid in red.

    :param fontsize: font size of the axis labels
    :param vmin: lower bound of the colour scale
    :param vmax: upper bound of the colour scale
    :param min_nreads: contigs must have more reads than this value to be
        used in the fit
    :param min_length: contigs must be longer than this value to be used
        in the fit
    """
    if self._df is None:
        _ = self.get_df()
    pylab.clf()
    df = self._df

    m1 = df.length.min()
    M1 = df.length.max()

    # least square (min_nreads / min_length are resolved by query from
    # the local scope)
    selection = df.query("nread>@min_nreads and length>@min_length")
    fit_length = selection['length']
    fit_nread = selection['nread']
    A = np.vstack([fit_length, np.ones(len(fit_length))]).T
    # Series.as_matrix() was removed in pandas 1.0; .values is available
    # in all versions.
    m, c = np.linalg.lstsq(A, fit_nread.values)[0]
    x = np.array([m1, M1])

    X = df['length']
    Y = df['nread']
    Z = df['covStat']
    pylab.scatter(X, Y, c=Z, vmin=vmin, vmax=vmax)
    pylab.colorbar()
    pylab.xlabel("Contig length", fontsize=fontsize)
    pylab.ylabel("Contig reads", fontsize=fontsize)
    pylab.title("coverage function of contig length and reads used")
    pylab.grid()
    pylab.plot(x, m * x + c, "o-r")
    pylab.loglog()
    pylab.tight_layout()
def get_data(self, chrname, remove_anomalies=True):
    """Return the signed fragment positions found on a chromosome.

    Rows of :attr:`df` whose *ref* equals *chrname* are selected. The
    position is *start* on the ``+`` strand and minus *end* otherwise.

    :param chrname: name of the reference to select
    :param bool remove_anomalies: if true, keep only the rows selected by
        the mask returned by :meth:`remove_anomalies`
    :return: dataframe with a single column *x*, sorted by absolute value
    """
    # Could be done once for all in read_alignment
    df = self.df.query('ref==@chrname')

    # first the fragment position, shifting - strand by fragment length
    positions = []
    for start, end, strand in zip(df['start'], df['end'], df['strand']):
        positions.append(start if strand == '+' else -end)

    res = pd.DataFrame(np.array(positions))
    res.columns = ['x']

    # sort by absolute position, using a temporary helper column
    res = res.assign(abs=res['x'].abs()).sort_values('abs').drop(columns='abs')

    if not remove_anomalies:
        return res
    return res[self.remove_anomalies(res)]
def anscombe(x):
    r"""Compute the anscombe variance stabilizing transform.

    :param x: noisy Poisson-distributed data. Objects supporting
        arithmetic (numpy array, pandas Series/DataFrame, scalar) are used
        as is and not modified; anything else (e.g. a list) is first
        converted to a numpy array.
    :return: data with variance approximately equal to 1.

    Reference: Anscombe, F. J. (1948), "The transformation of Poisson,
    binomial and negative-binomial data", Biometrika 35 (3-4): 246-254

    For Poisson distribution, the mean and variance are not independent.
    The anscombe transform aims at transforming the data so that the
    variance is about 1 for large enough mean; for mean zero, the variance
    is still zero. So, it transforms Poisson data of mean :math:`m` to
    approximately Gaussian data with mean
    :math:`2\sqrt{m+3/8} - 1/(4m^{1/2})`
    """
    try:
        # If a dataframe, we do not want to change it
        return 2.0 * np.sqrt(x + 3.0 / 8.0)
    except TypeError:
        # plain sequences do not support the addition above
        return 2.0 * np.sqrt(np.array(x) + 3.0 / 8.0)
colors = [cmap(i) for i in np.linspace(0, 1, len(list_analysis))] # get results for curves pylab.figure(figsize=(8, 8)) for i in range(len(list_analysis)): analysis = list_analysis[i] res = compute_table_performance(analysis, df_results) print("%s" % analysis) # [TP, FP, FN, TN] # print(len(res[0]), len(res[1]), res[2], res[3] , sum([len(res[0]), len(res[1]), res[2], res[3]])) TP = res[0] FP = res[1] FN = [0] * res[2] TN = [0] * res[3] y_true = np.array([1] * len(TP) + [1] * len(FN) + [0] * len(FP) + [0] * len(TN)) y_scores = np.array(TP + FN + FP + TN) precision, recall, thresholds = precision_recall_curve(y_true, y_scores) pylab.plot(recall, precision, color=colors[i], label=analysis) pylab.xlabel('Recall') pylab.ylabel('Precision') pylab.ylim([0.0, 1.05]) pylab.xlim([0.0, 1.05]) pylab.title('Precision-Recall') #pylab.legend(loc="lower left") lgd = pylab.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) #pylab.tight_layout() if file_fig != "show":
def _set_y(self, Y):
    """Store *Y*, converted to a numpy array, in ``_Ytarget``."""
    target = np.array(Y)
    self._Ytarget = target
def _set_x(self, X):
    """Store *X* as a numpy array in ``_Xtarget`` and record its smallest
    and largest values in ``lower_bound`` and ``upper_bound``.
    """
    self._Xtarget = np.array(X)
    stored = self._Xtarget
    self.lower_bound, self.upper_bound = min(stored), max(stored)
def _add_patches(self, df, method, fill, ax, diagonal=True):
    """Add one symbol per selected cell of *df* to the axes *ax*.

    :param df: dataframe whose cells are read by position
        (``df.iloc[x, y]``)
    :param str method: kind of symbol: 'ellipse', 'square', 'rectangle',
        'color', 'circle', 'number', 'text' or 'pie'
    :param str fill: 'lower' skips the cells with x > y and 'upper' those
        with x < y; any other value keeps all cells
    :param ax: axes receiving the text or the patch collection
    :param bool diagonal: if False, the cells with x == y are skipped
    :raises ValueError: if *method* is not one of the values above

    The patches are gathered in a single ``PatchCollection`` coloured with
    :attr:`cm` from the cell values rescaled with ``(value + 1) / 2`` (or
    binarised to -1/1 if :attr:`binarise_color` is set). The collection is
    added to *ax* and stored in :attr:`collection`. The 'number' and
    'text' methods write on *ax* directly and create no collection.
    """
    width, height = df.shape

    patches = []
    colors = []
    for x in range(width):
        for y in range(height):
            if fill == 'lower' and x > y:
                continue
            elif fill == 'upper' and x < y:
                continue
            if diagonal is False and x == y:
                continue
            datum = (df.iloc[x, y] + 1.) / 2.
            d = df.iloc[x, y]
            d_abs = np.abs(d)
            rotate = -45 if d > 0 else +45
            if method in ['ellipse', 'square', 'rectangle', 'color']:
                if method == 'ellipse':
                    func = Ellipse
                    patch = func(
                        (x, y),
                        width=1 * self.shrink,
                        height=(self.shrink - d_abs * self.shrink),
                        angle=rotate)
                else:
                    func = Rectangle
                    w = h = d_abs * self.shrink
                    #FIXME shring must be <=1
                    offset = (1 - w) / 2.
                    if method == 'color':
                        w = 1
                        h = 1
                        offset = 0
                    patch = func((x + offset - .5, y + offset - .5),
                                 width=w,
                                 height=h,
                                 angle=0)
                if self.edgecolor:
                    patch.set_edgecolor(self.edgecolor)
                colors.append(datum)
                if d_abs > 0.05:
                    patch.set_linestyle('dotted')
                patches.append(patch)
                #FIXME edgecolor is always printed
            elif method == 'circle':
                patch = Circle((x, y), radius=d_abs * self.shrink / 2.)
                if self.edgecolor:
                    patch.set_edgecolor(self.edgecolor)
                colors.append(datum)
                if d_abs > 0.05:
                    patch.set_linestyle('dotted')
                patches.append(patch)
            elif method in ['number', 'text']:
                # NOTE(review): if d is NaN neither branch assigns
                # edgecolor -- confirm NaN cannot reach this point.
                if d < 0:
                    edgecolor = self.cm(-1.0)
                elif d >= 0:
                    edgecolor = self.cm(1.0)
                d_str = "{:.2f}".format(d).replace("0.", ".").replace(".00", "")
                ax.text(x,
                        y,
                        d_str,
                        color=edgecolor,
                        fontsize=self.fontsize,
                        horizontalalignment='center',
                        weight='bold',
                        alpha=max(0.5, d_abs))
            elif method == 'pie':
                S = 360 * d_abs
                patch = [
                    Wedge((x, y), 1 * self.shrink / 2., -90, S - 90),
                    Wedge((x, y), 1 * self.shrink / 2., S - 90, 360 - 90),
                ]
                # the second wedge completes the disc and always gets the
                # mid-scale colour value
                colors.append(datum)
                colors.append(0.5)
                if self.edgecolor:
                    patch[0].set_edgecolor(self.edgecolor)
                    patch[1].set_edgecolor(self.edgecolor)
                patches.append(patch[0])
                patches.append(patch[1])
            else:
                raise ValueError(
                    'Method for the symbols is not known. Use e.g, square, circle'
                )

    if self.binarise_color:
        colors = [1 if color > 0.5 else -1 for color in colors]

    if len(patches):
        col1 = PatchCollection(patches,
                               array=np.array(colors),
                               cmap=self.cm)
        ax.add_collection(col1)
        self.collection = col1

        # Somehow a release of matplotlib prevent the edge color
        # from working but the set_edgecolor on the collection itself does
        # work...
        # Done only when a collection was created in this call: the text
        # and number methods create none, and touching self.collection
        # could then fail with an AttributeError.
        if self.edgecolor:
            self.collection.set_edgecolor(self.edgecolor)
def estimate(self, guess=None, k=None, maxfev=2e4, maxiter=1e3, bounds=None):
    """Fit the mixture model by minimising ``model.log_likelihood``.

    :param guess: initial parameters, as a flat sequence ordered as
        mu1, sigma1, pi1, mu2, sigma2, pi2, ... If not provided,
        :meth:`get_guess` is used.
    :param k: if provided, replaces :attr:`k` before the fit
    :param maxfev: forwarded to the minimiser options
    :param maxiter: forwarded to the minimiser options
    :param bounds: forwarded to the minimiser
    :return: the object returned by :func:`scipy.optimize.minimize`, also
        stored in :attr:`results` and completed with the attributes
        ``pis_raw``, ``log_likelihood``, ``AIC``, ``AICc``, ``BIC``,
        ``pis`` (normalised to sum to 1), ``sigmas`` and ``mus``. Its
        ``x`` attribute is replaced by a plain list.
    """
    if k is not None:
        self.k = k

    if guess is None:
        # estimate the mu/sigma/pis from the data
        guess = self.get_guess()

    from scipy.optimize import minimize

    options = {"maxiter": maxiter, "maxfev": maxfev}
    res = minimize(self.model.log_likelihood,
                   x0=guess,
                   args=(self.data, ),
                   method=self.method,
                   options=options,
                   bounds=bounds)
    self.results = res

    fitted = self.results.x
    raw_pis = np.array(fitted[2::3])
    self.results.pis_raw = raw_pis.copy()

    # Rebuild the parameters as a flat list (mu, sigma, pi per component).
    # The weights may be negative; they are kept as is here and only
    # normalised further below.
    n_components = int(len(fitted) / 3)
    params = []
    for i in range(n_components):
        params.extend([fitted[i * 3], fitted[i * 3 + 1], raw_pis[i]])
    self.results.x = params

    # FIXME shall we multiply by -1 ??
    self.results.log_likelihood = self.model.log_likelihood(params, self.data)

    if self.results.log_likelihood:
        logl = self.results.log_likelihood
        self.results.AIC = criteria.AIC(logl, self.k, logL=True)
        self.results.AICc = criteria.AICc(logl, self.k, self.data.size, logL=True)
        self.results.BIC = criteria.BIC(logl, self.k, self.data.size, logL=True)
    else:
        # a falsy log-likelihood gets fixed criteria values
        self.results.AIC = 1000
        self.results.AICc = 1000
        self.results.BIC = 1000

    weights = np.array(self.results.x[2::3])
    self.results.pis = list(weights / weights.sum())
    self.results.sigmas = self.results.x[1::3]
    self.results.mus = self.results.x[0::3]

    return res
def estimate(self, guess=None, k=2):
    """Estimate the mixture parameters with an EM loop.

    :param list guess: a list to provide the initial guess. Order is mu1,
        sigma1, pi1, mu2, ... If not provided, :meth:`get_guess` is used.
    :param int k: number of models to be used.

    The loop runs until :attr:`max_iter` iterations are reached; there is
    no other convergence criterion. It stops earlier, with :attr:`status`
    set to False and diagnostics stored in :attr:`debug`, as soon as the
    responsibilities or the weights are no longer normalised.

    On success, :attr:`results` holds ``x``, ``nfev``, ``success``,
    ``mus``, ``sigmas``, ``pis``, ``log_likelihood``, ``AIC``, ``AICc``
    and ``BIC``. The responsibilities are stored in :attr:`gamma` and the
    successive means in :attr:`mus`.
    """
    import scipy.stats as ss

    self.k = k

    # Initial guess of parameters and initializations
    if guess is None:
        # estimate the mu/sigma/pis from the data
        guess = self.get_guess()

    # float arrays: the in-place updates below would otherwise be
    # truncated when the guess is made of integers
    mu = np.array(guess[0::3], dtype=float)
    sig = np.array(guess[1::3], dtype=float)
    pi_ = np.array(guess[2::3], dtype=float)
    n_components = len(pi_)

    gamma = np.zeros((n_components, int(self.size)))
    N_ = np.zeros(n_components)
    p_new = guess

    # EM loop
    counter = 0
    converged = False

    self.mus = []
    self.status = True

    while not converged:
        # Density of the mixture with the parameters of the previous
        # iteration; it does not depend on the component being updated.
        mixture_pdf = self.model.pdf(self.data, p_new, normalise=False)

        # Compute the responsibility func. and new parameters
        for j in range(self.k):
            gamma[j, :] = pi_[j] * ss.norm.pdf(self.data, mu[j], sig[j])
            gamma[j, :] /= mixture_pdf
            N_[j] = gamma[j].sum()
            mu[j] = np.sum(gamma[j] * self.data) / N_[j]
            sig[j] = np.sqrt(np.sum(gamma[j] * (self.data - mu[j])**2) / N_[j])
            pi_[j] = N_[j] / self.size

        self.results = {'x': p_new, 'nfev': counter, 'success': converged}
        p_new = []
        for j in range(self.k):
            p_new.extend([mu[j], sig[j], pi_[j]])

        # Explicit check (not an assert, which is stripped with -O); a NaN
        # makes both comparisons False and is therefore reported as well.
        consistent = (abs(N_.sum() - self.size) / self.size < 1e-6
                      and abs(pi_.sum() - 1) < 1e-6)
        if not consistent:
            print("issue arised at iteration %s" % counter)
            self.debug = {'N': N_, 'pis': pi_}
            self.status = False
            break

        # mu is updated in place: store a copy to keep the history
        self.mus.append(mu.copy())

        # Convergence check
        counter += 1
        converged = counter >= self.max_iter

    self.gamma = gamma

    if self.status is True:
        self.results = AttrDict(x=p_new, nfev=counter, success=converged)
        self.results.mus = self.results.x[0::3]
        self.results.sigmas = self.results.x[1::3]
        self.results.pis = self.results.x[2::3]

        log_likelihood = self.model.log_likelihood(self.results.x, self.data)
        self.results.log_likelihood = log_likelihood
        self.results.AIC = criteria.AIC(log_likelihood, self.k, logL=True)
        self.results.AICc = criteria.AICc(log_likelihood, self.k,
                                          self.data.size, logL=True)
        self.results.BIC = criteria.BIC(log_likelihood, self.k,
                                        self.data.size, logL=True)
def get_df_concordance(self, max_align=-1):
    """This methods returns a dataframe with Insert, Deletion, Match,
    Substitution, read length, concordance (see below for a definition)

    Be aware that the SAM or BAM file must be created using minimap2 and
    the --cs option to store the CIGAR in a new CS format, which also
    contains the information about substitution. Other mapper are also
    handled (e.g. bwa) but the substitution are solely based on the NM
    tag if it exists.

    Alignments without any tag are ignored. Alignments that have tags but
    neither a *cs* tag nor a CIGAR string are kept with I, D, M and
    mismatch set to 0.

    The concordance is computed as ``1 - (I + D + S) / (S + I + D + M)``
    where S is the number of substitutions given by the *cs* tag. If S is
    not available for all alignments, ``NM - D - I`` is used instead.

    :param int max_align: if positive, stop after that many alignments
    :return: dataframe with the columns *concordance*, *length*, *I*,
        *D*, *M*, *mapq*, *flags*, *NM* and *mismatch*
    """
    from sequana import Cigar

    count = 0
    I, D, M, L, mapq, flags, NM = [], [], [], [], [], [], []
    S = []
    for a in self._data:
        # tags and cigar populated if there is a match
        # if we use --cs cigar is not populated so we can only look at tags
        # tags can be an empty list
        if a.tags is None or len(a.tags) == 0:
            continue
        count += 1
        mapq.append(a.mapq)
        L.append(a.qlen)
        # value of the first NM tag, or -1 if there is none
        NM.append(next((x[1] for x in a.tags if x[0] == "NM"), -1))
        flags.append(a.flag)

        tags = dict(a.tags)
        if 'cs' in tags:
            cs = CS(tags['cs'])
            S.append(cs['S'])
            I.append(cs['I'])
            D.append(cs['D'])
            M.append(cs['M'])
        elif a.cigarstring:
            cigar = Cigar(a.cigarstring).as_dict()
            I.append(cigar["I"])
            D.append(cigar['D'])
            M.append(cigar['M'])
            S.append(None)  # no info about substitutions in the cigar
        else:
            I.append(0)
            D.append(0)
            M.append(0)
            S.append(0)

        if max_align > 0 and count == max_align:
            break

        if count % 10000 == 0:
            logger.debug("Read {} alignments".format(count))

    I = np.array(I)
    D = np.array(D)
    M = np.array(M)
    NM = np.array(NM)
    S = np.array(S)

    try:
        # fails with a TypeError as soon as one alignment has no
        # substitution information (None stored above)
        C = 1 - (I + D + S)/(S + I + D + M)
        logger.info("computed Concordance based on minimap2 --cs option")
    except TypeError:
        logger.info("computed Concordance based on standard CIGAR information using INDEL and NM tag")
        computed_S = NM - D - I
        C = 1 - (I + D + computed_S)/(computed_S + I + D + M)

    df = pd.DataFrame([C, L, I, D, M, mapq, flags, NM, S])
    df = df.T
    df.columns = ["concordance", 'length', "I", "D", "M", "mapq", "flags", "NM", "mismatch"]
    return df
def plot(
    self,
    num=1,
    cmap=None,
    colorbar=True,
    vmin=None,
    vmax=None,
    colorbar_position="right",
    gradient_span="None",
    figsize=(12, 8),
    fontsize=None,
):
    """Plot the heatmap together with the row and column dendrograms.

    Using as input::

        df = pd.DataFrame({'A':[1,0,1,1],
                           'B':[.9,0.1,.6,1],
                           'C':[.5,.2,0,1],
                           'D':[.5,.2,0,1]})

    we can plot the heatmap + dendogram as follows::

        h = Heatmap(df)
        h.plot(vmin=0, vmax=1.1)

    .. plot::
        :include-source:
        :width: 80%

        from sequana.viz import heatmap
        df = heatmap.get_heatmap_df()
        h = heatmap.Heatmap(df)
        h.category_column['A'] = 1
        h.category_column['C'] = 1
        h.category_column['D'] = 2
        h.category_column['B'] = 2
        h.plot()

    :param num: number of the matplotlib figure to use (it is cleared)
    :param cmap: colormap; by default ``self.params.cmap``
    :param colorbar: if True, add a colorbar
    :param vmin: lower bound of the colour normalisation
    :param vmax: upper bound of the colour normalisation
    :param str colorbar_position: 'right' or 'top left'
    :param str gradient_span: 'min_to_max_centered', 'only_max' or
        'only_min' override *vmin* and *vmax* from the data; any other
        value leaves them untouched
    :param figsize: size of the figure
    :param fontsize: if set, overrides the automatic font size
    :return: dictionary of the axes/objects created (keys among
        *dendogram1*, *dendogram2*, *heatmap*, *category_column*,
        *category_row*, *colorbar* and *colorbar_scalablemap*)
    :raises ValueError: if *colorbar_position* is not supported

    Side effects: ``matplotlib.rcParams["font.size"]`` is changed,
    :attr:`frame` is reordered according to the dendrograms, and the
    ordered frame and the orders are stored in :attr:`d`.
    """
    # save all parameters in a dict
    layout = {}

    if cmap is None:
        cmap = self.params.cmap
    try:
        import colormap

        cmap = colormap.cmap_builder(cmap)
    except Exception:
        # best effort: keep cmap as provided if the colormap package is
        # missing or cannot build it
        pass

    # keep track of row and column names for later.
    row_header = self.frame.index
    column_header = self.frame.columns

    import matplotlib

    # FIXME something clever for the fontsize
    # elif chain: with independent tests the sizes 6 and 7 were always
    # overwritten by the last test
    largest = max(len(row_header), len(column_header))
    if largest > 100:
        matplotlib.rcParams["font.size"] = 6
    elif largest > 50:
        matplotlib.rcParams["font.size"] = 7
    elif largest > 30:
        matplotlib.rcParams["font.size"] = 8
    else:
        matplotlib.rcParams["font.size"] = 12
    if fontsize:
        matplotlib.rcParams["font.size"] = fontsize

    # scaling min/max range
    # min_to_max, min_to_max_centered, only_max, only_min
    self.gradient_span = gradient_span
    if self.gradient_span == "min_to_max_centered":
        vmax = self.frame.max().max()
        vmin = self.frame.min().min()
        vmax = max([vmax, abs(vmin)])
        vmin = vmax * -1
    if self.gradient_span == "only_max":
        vmin = 0
        vmax = self.frame.max().max()
    if self.gradient_span == "only_min":
        vmin = self.frame.min().min()
        vmax = 0
    norm = matplotlib.colors.Normalize(vmin, vmax)

    # Scale the figure window size
    fig = pylab.figure(num=num, figsize=figsize)
    fig.clf()

    # LAYOUT --------------------------------------------------
    # ax1 (dendrogram 1) on the left of the heatmap
    [ax1_x, ax1_y, ax1_w, ax1_h] = [0.05, 0.22, 0.2, 0.6]
    width_between_ax1_axr = 0.004
    # distance between the top color bar axis and the matrix
    height_between_ax1_axc = 0.004
    # Sufficient size to show
    color_bar_w = 0.015

    # axr, placement of row side colorbar
    axr_x = ax1_x + ax1_w + width_between_ax1_axr
    axr_y = ax1_y
    axr_w = color_bar_w
    axr_h = ax1_h
    width_between_axr_axm = 0.004

    # axc, placement of column side colorbar
    axc_x = axr_x + axr_w + width_between_axr_axm
    axc_y = ax1_y + ax1_h + height_between_ax1_axc
    axc_w = 0.5
    axc_h = color_bar_w
    height_between_axc_ax2 = 0.004

    # axm, placement of heatmap for the data matrix
    axm_x = axr_x + axr_w + width_between_axr_axm
    axm_y = ax1_y
    axm_w = axc_w
    axm_h = ax1_h

    # ax2 (dendrogram 2), on the top of the heatmap
    ax2_x = axr_x + axr_w + width_between_axr_axm
    ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
    ax2_w = axc_w
    ax2_h = 0.15

    # axcb - placement of the color legend
    if colorbar_position == "top left":
        [axcb_x, axcb_y, axcb_w, axcb_h] = [0.07, 0.88, 0.18, 0.09]
    elif colorbar_position == "right":
        [axcb_x, axcb_y, axcb_w, axcb_h] = [0.85, 0.2, 0.08, 0.6]
    else:
        raise ValueError("'top left' or 'right' accepted for now")

    # COMPUTATION DENDOGRAM 1 -------------------------------------
    if self.column_method:
        Y = self.linkage(self.frame.transpose(), self.column_method,
                         self.column_metric)
        ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=True)

        # color_threshold=0 and above_threshold_color='k' colors all
        # dendogram into black
        Z = hierarchy.dendrogram(
            Y,
            color_threshold=0,
            above_threshold_color="k",
            distance_sort="descending",
        )
        ind2 = hierarchy.fcluster(Y, 0.7 * max(Y[:, 2]),
                                  self.cluster_criterion)

        ax2.set_xticks([])
        ax2.set_yticks([])
        # apply the clustering for the array-dendrograms to the actual matrix data
        idx2 = Z["leaves"]
        self.frame = self.frame.iloc[:, idx2]
        # reorder the flat cluster to match the order of the leaves the dendrogram
        ind2 = ind2[idx2]
        layout["dendogram2"] = ax2
    else:
        idx2 = range(self.frame.shape[1])

    # COMPUTATION DENDOGRAM 2 ---------------------------------
    if self.row_method:
        Y = self.linkage(self.frame, self.row_method, self.row_metric)

        ax1 = fig.add_axes([ax1_x, ax1_y, ax1_w, ax1_h], frame_on=True)
        Z = hierarchy.dendrogram(
            Y,
            orientation="right",
            color_threshold=0,
            above_threshold_color="k",
            distance_sort="descending",
        )
        ind1 = hierarchy.fcluster(Y, 0.7 * max(Y[:, 2]),
                                  self.cluster_criterion)

        ax1.set_xticks([])
        ax1.set_yticks([])
        # apply the clustering for the array-dendrograms to the actual matrix data
        idx1 = Z["leaves"]
        self.frame = self.frame.iloc[idx1, :]
        # reorder the flat cluster to match the order of the leaves the dendrogram
        ind1 = ind1[idx1]
        layout["dendogram1"] = ax1
    else:
        idx1 = range(self.frame.shape[0])

    # HEATMAP itself
    axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h])
    axm.imshow(
        self.frame,
        aspect="auto",
        origin="lower",
        interpolation="None",
        cmap=cmap,
        norm=norm,
    )
    axm.set_xticks([])
    axm.set_yticks([])
    layout["heatmap"] = axm

    # TEXT: the headers were saved before the reordering, hence the
    # lookup through idx1 / idx2
    for i in range(self.frame.shape[0]):
        axm.text(
            self.frame.shape[1] - 0.5,
            i,
            " " + str(row_header[idx1[i]]),
            verticalalignment="center",
        )

    for i in range(self.frame.shape[1]):
        axm.text(
            i,
            -0.9,
            " " + str(column_header[idx2[i]]),
            rotation=90,
            verticalalignment="top",
            horizontalalignment="center",
        )

    # CATEGORY column ------------------------------
    if self.category_column:
        axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])

        category_col = [
            self.category_column[self.df.columns[i]] for i in idx2
        ]

        dc = np.array(category_col, dtype=int)
        # one entry per column; len(category_col) is also defined when no
        # column clustering was performed
        dc.shape = (1, len(category_col))
        cmap_c = matplotlib.colors.ListedColormap(
            self.params.col_side_colors)
        axc.matshow(dc, aspect="auto", origin="lower", cmap=cmap_c)
        axc.set_xticks([])
        axc.set_yticks([])
        layout["category_column"] = axc

    # CATEGORY row -------------------------------
    if self.category_row:
        axr = fig.add_axes([axr_x, axr_y, axr_w, axr_h])
        # self.category_row must be a dictionary with names as found in
        # the index of the dataframe.
        category_row = [self.category_row[self.df.index[i]] for i in idx1]

        dr = np.array(category_row, dtype=int)
        dr.shape = (len(category_row), 1)
        cmap_r = matplotlib.colors.ListedColormap(
            self.params.col_side_colors)
        axr.matshow(dr, aspect="auto", origin="lower", cmap=cmap_r)
        axr.set_xticks([])
        axr.set_yticks([])
        layout["category_row"] = axr

    # COLORBAR ----------------------
    if colorbar == True:
        axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h],
                            frame_on=False)
        if colorbar_position == "right":
            orientation = "vertical"
        else:
            orientation = "horizontal"
        cb = matplotlib.colorbar.ColorbarBase(ax=axcb,
                                              cmap=cmap,
                                              norm=norm,
                                              orientation=orientation)
        layout["colorbar"] = cb
        layout["colorbar_scalablemap"] = axcb

    # could be useful
    self.d = {"ordered": self.frame.copy(), "rorder": idx1, "corder": idx2}

    return layout