def test_damage_model_fit(generate_data): g = models.damage_model() assert g.fit(x=generate_data[0], p=0.5, pmin=0.01, pmax=0.5).all() == np.array([ 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.15471624, 0.15471624, 0.15471624, 0.15471624, 0.15471624, 0.08207436, 0.08207436, 0.04575342, 0.02759295, 0.01851272, 0.0139726, 0.01170254, 0.01056751, 0.01 ]).all()
def test_damage(ref, bam, mode, wlen, show_al, process, verbose): """Prepare data and run LRtest to test for damage Args: ref (str): name of referene in alignment file bam (str): bam file mode (str): opening mode of alignment file wlen (int): window length show_al (bool): Show alignment representations process (int): Number of process for parallelization verbose (bool): Run in verbose mode Returns: dict: Dictionary containing LR test results """ al_handle = pysam.AlignmentFile(bam, mode=mode, threads=process) try: cov = avg_coverage(al_handle.count_coverage(contig=ref)) nb_reads_aligned = al_handle.count(contig=ref) reflen = al_handle.get_reference_length(ref) al = al_to_damage(reference=ref, al_handle=al_handle) ct_data, ga_data, cc_data, c_data, g_data, all_bases = al.get_damage( wlen=wlen, show_al=show_al) if ct_data: model_A = models.damage_model() model_B = models.null_model() test_res = fit_models( ref=ref, model_A=model_A, model_B=model_B, ct_data=ct_data, cc_data=cc_data, ga_data=ga_data, all_bases=all_bases, wlen=wlen, verbose=verbose, ) test_res["reference"] = ref test_res["nb_reads_aligned"] = nb_reads_aligned test_res["coverage"] = cov test_res["reflen"] = reflen return check_model_fit(test_res, wlen, verbose) except (ValueError, RuntimeError) as e: if verbose: print(f"Model fitting for {ref} failed") print(f"Model fitting error: {e}") print(f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov}" " - reflen: {reflen}\n") return False
def test_damage_model_optim(generate_data): g = models.damage_model() o, e = optim(function=g.fit, parameters=g.kwds, xdata=generate_data[1], ydata=generate_data[2], bounds=g.bounds, loss='linear') target = { 'p': 0.6039535547658853, 'pmin': 0.03637474931290336, 'pmax': 0.4211432052501663 } for k in o: assert round(o[k], 3) == round(target[k], 3)
def test_damage_group(ct_data, ga_data, cc_data, all_bases, nb_reads_aligned, cov, reflen, wlen, verbose): """Performs damage test Args: ct_data (list of int): List of positions with CtoT transitions ga_data (list of int): List of positions with GtoA transitions cc_data (list of int): List of positions where C in ref and query all_bases (list of int): List of positions where a base is aligned nb_reads_aligned(int): number of reads aligned cov(float): average coverage across all references reflen(int): length of all references wlen (int): window length verbose(bool): Verbose Returns: dict: Dictionary containing LR test results """ ref = "reference" try: if ct_data: model_A = models.damage_model() model_B = models.null_model() test_res = fit_models( ref="reference", model_A=model_A, model_B=model_B, ct_data=ct_data, cc_data=cc_data, ga_data=ga_data, all_bases=all_bases, wlen=wlen, verbose=verbose, ) test_res["reference"] = ref test_res["nb_reads_aligned"] = nb_reads_aligned test_res["coverage"] = cov test_res["reflen"] = reflen return check_model_fit(test_res, wlen, verbose) except (ValueError, RuntimeError) as e: if verbose: print(f"Model fitting for {ref} failed") print(f"Model fitting error: {e}") print(f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov} " "- reflen: {reflen}\n") return False
def test_damage(ref, bam, mode, wlen, show_al, process, verbose): """Prepare data and run LRtest to test for damage Args: ref (str): name of referene in alignment file bam (str): bam file mode (str): opening mode of alignment file wlen (int): window length show_al (bool): Show alignment representations process (int): Number of process for parallelization verbose (bool): Run in verbose mode Returns: dict: Dictionary containing LR test results """ al_handle = pysam.AlignmentFile(bam, mode=mode, threads=process) try: if ref is None: all_references = al_handle.references cov = np.mean([ avg_coverage_contig(al_handle.count_coverage(contig=ref)) for ref in all_references ]) nb_reads_aligned = np.sum( [al_handle.count(contig=ref) for ref in all_references]) reflen = np.sum([ al_handle.get_reference_length(ref) for ref in all_references ]) refname = "reference" else: cov = avg_coverage_contig(al_handle.count_coverage(contig=ref)) nb_reads_aligned = al_handle.count(contig=ref) reflen = al_handle.get_reference_length(ref) refname = ref al = al_to_damage(reference=ref, al_handle=al_handle, wlen=wlen) al.get_damage(show_al=show_al) ( mut_count, conserved_count, CT_damage, GA_damage, all_damage, ) = al.compute_damage() # if all_damage: model_A = models.damage_model() model_B = models.null_model() test_res = fit_models( ref=ref, model_A=model_A, model_B=model_B, damage=all_damage, mut_count=mut_count, conserved_count=conserved_count, verbose=verbose, ) test_res["reference"] = refname test_res["nb_reads_aligned"] = nb_reads_aligned test_res["coverage"] = cov test_res["reflen"] = reflen CT_log = {} GA_log = {} for i in range(wlen): CT_log[f"CtoT-{i}"] = CT_damage[i] test_res.update(CT_log) test_res.update(GA_log) # for i in range(qlen): # if i not in ydata_counts: # ydata_counts[i] = np.nan # if f"CtoT-{i}" not in ctot_out: # ctot_out[f"CtoT-{i}"] = np.nan # if f"GtoA-{i}" not in gtoa_out: # gtoa_out[f"GtoA-{i}"] = np.nan # print(test_res) return check_model_fit(test_res, wlen, verbose) except (ValueError, RuntimeError) as e: if verbose: print(f"Model fitting for {ref} failed") print(f"Model fitting error: {e}") print(f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov}" " - reflen: {reflen}\n") return False
def damageplot(damage_dict, outdir): """Draw pydamage plots Args: damage_dict(dict): pydamage result dictionary qlen(int): query length outdir(str): Pydamage result directory """ x = np.array(range(damage_dict["wlen"])) qlen = np.array(range(damage_dict["qlen"])) y = np.array([damage_dict[i] for i in x]) c2t = np.array([damage_dict[f"CtoT-{i}"] for i in qlen]) g2a = np.array([damage_dict[f"GtoA-{i}"] for i in qlen]) p0 = damage_dict["p0"] p0_stdev = damage_dict["p0_stdev"] p = damage_dict["p"] pmin = damage_dict["pmin"] pmin_stdev = damage_dict["pmin_stdev"] pmax = damage_dict["pmax"] pmax_stdev = damage_dict["pmax_stdev"] contig = damage_dict["reference"] pvalue = damage_dict["pvalue"] coverage = damage_dict["coverage"] residuals = damage_dict["residuals"] rmse = damage_dict["RMSE"] plotdir = outdir if pvalue < 0.001: rpval = "<0.001" else: rpval = f"={round(pvalue,3)}" m_null = null_model() p0_low = max(m_null.bounds[0][0], p0 - 2 * p0_stdev) p0_high = min(m_null.bounds[1][0], p0 + 2 * p0_stdev) y_unif = m_null.fit(x, p0) y_unif_low = np.maximum(np.zeros(y_unif.shape[0]), m_null.fit(x, p0_low)) y_unif_high = np.minimum(np.ones(y_unif.shape[0]), m_null.fit(x, p0_high)) geom = damage_model() geom_pmin_low = max(geom.bounds[0][1], pmin - 2 * pmin_stdev) geom_pmin_high = min(geom.bounds[1][1], pmin + 2 * pmin_stdev) geom_pmax_low = max(geom.bounds[0][2], pmax - 2 * pmax_stdev) geom_pmax_high = min(geom.bounds[1][2], pmax + 2 * pmax_stdev) y_geom = geom.fit(x, p, pmin, pmax) y_geom_low = np.maximum(np.zeros(y_geom.shape[0]), geom.fit(x, p, geom_pmin_low, geom_pmax_low)) y_geom_high = np.minimum(np.ones(y_geom.shape[0]), geom.fit(x, p, geom_pmin_high, geom_pmax_high)) plt.xticks(rotation=45, fontsize=8) fig, ax = plt.subplots() ax.plot(x, y_unif, linewidth=2.5, color="DarkOliveGreen", alpha=0.8, label="Null model") ax.fill_between( x, y_unif_low, y_unif_high, color="DarkOliveGreen", alpha=0.1, label="Null Model CI (2 sigma)", ) ax.plot(x, y_geom, linewidth=2.5, color="#D7880F", alpha=0.8, label="Damage model") ax.fill_between( x, y_geom_low, y_geom_high, color="#D7880F", alpha=0.1, label="Damage Model CI (2 sigma)", ) ax.plot(qlen, g2a, color="#236cf5", alpha=0.1, label="G to A transitions") ax.plot(qlen, c2t, color="#bd0d45", alpha=0.2, label="C to T transitions") ax.set_xlabel("Base from 5'", fontsize=10) ax.set_ylabel("Substitution frequency", fontsize=10) ax.xaxis.set_ticks(np.arange(qlen[0], qlen[-1], 5)) ax.set_xticklabels(ax.get_xticks(), rotation=45, fontsize=6) ax.set_title(f"coverage: {round(coverage,2)} - pvalue{rpval}", fontsize=8) ax.legend(fontsize=8) # ax.set_title(f"coverage: {round(coverage,2)} | pvalue{rpval}", fontsize=8) left, bottom, width, height = [0.65, 0.3, 0.2, 0.2] ax2 = fig.add_axes([left, bottom, width, height]) fitted = x smoothed = lowess(residuals, fitted) ax2.scatter(fitted, residuals, marker=".", color="black") ax2.plot(smoothed[:, 0], smoothed[:, 1], color="r") ax2.plot([min(fitted), max(fitted)], [0, 0], color="k", linestyle=":", alpha=0.3) ax2.set_ylabel("Residuals", fontsize=6) ax2.set_xlabel("Fitted Values", fontsize=6) ax2.set_title(f"Residuals vs. Fitted\nRMSE={round(rmse, 3)}", fontsize=6) ax2.xaxis.set_ticks(np.arange(fitted[0], fitted[-1], 5)) ax2.set_xticklabels([int(i) for i in ax2.get_xticks()], fontsize=6, rotation=45) ax2.set_yticklabels([round(i, 3) for i in ax2.get_yticks()], fontsize=6) fig.suptitle(contig, fontsize=12, y=0.95) fig.savefig(f"{plotdir}/{contig}.png", dpi=200)
'pmin': 0.03637474931290336, 'pmax': 0.4211432052501663 } for k in o: assert round(o[k], 3) == round(target[k], 3) def test_null_model_fit(generate_data): u = models.null_model() assert u.fit(x=generate_data[0], p0=0.1).all() == np.array([ 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1 ]).all() def test_null_model_optim(generate_data): u = models.null_model() o, e = optim(function=u.fit, parameters=u.kwds, xdata=generate_data[1], ydata=generate_data[2], bounds=u.bounds, loss='linear') assert o == {'p0': 0.1000000000000005} if __name__ == "__main__": data, xdata, ydata = generate_data() g = models.damage_model() o = optim(function=g.fit, parameters=g.kwds, xdata=xdata, ydata=ydata)