Пример #1
0
def test_damage_model_fit(generate_data):
    """Check ``damage_model.fit`` against reference values, element-wise.

    Args:
        generate_data: fixture; its first element is passed as ``x`` to
            the model.
    """
    g = models.damage_model()
    # NOTE(review): the leading reference values are 0.3 although
    # pmax=0.5 is passed to fit() - confirm that the reference array and
    # the call arguments belong together.
    expected = np.array([
        0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3,
        0.15471624, 0.15471624, 0.15471624, 0.15471624,
        0.15471624, 0.08207436, 0.08207436, 0.04575342,
        0.02759295, 0.01851272, 0.0139726, 0.01170254, 0.01056751,
        0.01
    ])
    observed = g.fit(x=generate_data[0], p=0.5, pmin=0.01, pmax=0.5)
    # `observed.all() == expected.all()` would only compare two booleans
    # (True == True for any all-non-zero arrays), so it could never fail.
    # Compare the values themselves, with a floating point tolerance.
    assert np.allclose(observed, expected)
Пример #2
0
def test_damage(ref, bam, mode, wlen, show_al, process, verbose):
    """Prepare data and run LRtest to test for damage

    Args:
        ref (str): name of reference in alignment file
        bam (str): bam file
        mode (str): opening mode of alignment file
        wlen (int): window length
        show_al (bool): Show alignment representations
        process (int): Number of process for parallelization
        verbose (bool): Run in verbose mode
    Returns:
        The value returned by ``check_model_fit`` for the LR test results;
        ``False`` if a ValueError or RuntimeError is raised while
        preparing the data or fitting the models; ``None`` if no CtoT
        data was found for the reference.
    """
    # Pre-bind the summary statistics: the verbose error report below reads
    # them, and the exception may be raised before they are computed.
    cov = nb_reads_aligned = reflen = None

    # The context manager closes the alignment file on every exit path.
    with pysam.AlignmentFile(bam, mode=mode, threads=process) as al_handle:
        try:
            cov = avg_coverage(al_handle.count_coverage(contig=ref))
            nb_reads_aligned = al_handle.count(contig=ref)
            reflen = al_handle.get_reference_length(ref)

            al = al_to_damage(reference=ref, al_handle=al_handle)
            (ct_data, ga_data, cc_data, c_data, g_data,
             all_bases) = al.get_damage(wlen=wlen, show_al=show_al)
            if ct_data:
                model_A = models.damage_model()
                model_B = models.null_model()
                test_res = fit_models(
                    ref=ref,
                    model_A=model_A,
                    model_B=model_B,
                    ct_data=ct_data,
                    cc_data=cc_data,
                    ga_data=ga_data,
                    all_bases=all_bases,
                    wlen=wlen,
                    verbose=verbose,
                )
                test_res["reference"] = ref
                test_res["nb_reads_aligned"] = nb_reads_aligned
                test_res["coverage"] = cov
                test_res["reflen"] = reflen

                return check_model_fit(test_res, wlen, verbose)

        except (ValueError, RuntimeError) as e:
            if verbose:
                print(f"Model fitting for {ref} failed")
                print(f"Model fitting error: {e}")
                # Both halves need the f prefix, otherwise "{reflen}" is
                # printed literally.
                print(f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov}"
                      f" - reflen: {reflen}\n")
            return False

    # No CtoT data: nothing to fit.
    return None
Пример #3
0
def test_damage_model_optim(generate_data):
    """Fit the damage model with ``optim`` and compare the parameters.

    Each optimised parameter must match its reference value once both are
    rounded to three decimals.

    Args:
        generate_data: fixture; elements 1 and 2 are used as ``xdata`` and
            ``ydata`` respectively.
    """
    expected = {
        'p': 0.6039535547658853,
        'pmin': 0.03637474931290336,
        'pmax': 0.4211432052501663
    }

    model = models.damage_model()
    # optim returns a pair; only the first element (the optimised
    # parameters) is checked here.
    fitted, _ = optim(
        function=model.fit,
        parameters=model.kwds,
        xdata=generate_data[1],
        ydata=generate_data[2],
        bounds=model.bounds,
        loss='linear',
    )

    for name in fitted:
        assert round(fitted[name], 3) == round(expected[name], 3)
Пример #4
0
def test_damage_group(ct_data, ga_data, cc_data, all_bases, nb_reads_aligned,
                      cov, reflen, wlen, verbose):
    """Performs damage test

    Args:
        ct_data (list of int): List of positions with CtoT transitions
        ga_data (list of int): List of positions with GtoA transitions
        cc_data (list of int): List of positions where C in ref and query
        all_bases (list of int): List of positions where a base is aligned
        nb_reads_aligned(int): number of reads aligned
        cov(float): average coverage across all references
        reflen(int): length of all references
        wlen (int): window length
        verbose(bool): Verbose
    Returns:
        The value returned by ``check_model_fit`` for the LR test results;
        ``False`` if fitting raises ValueError or RuntimeError; ``None``
        if ``ct_data`` is empty.
    """
    # All references are pooled, so results are reported under one name.
    ref = "reference"
    try:
        if ct_data:
            model_A = models.damage_model()
            model_B = models.null_model()
            test_res = fit_models(
                ref=ref,
                model_A=model_A,
                model_B=model_B,
                ct_data=ct_data,
                cc_data=cc_data,
                ga_data=ga_data,
                all_bases=all_bases,
                wlen=wlen,
                verbose=verbose,
            )
            test_res["reference"] = ref
            test_res["nb_reads_aligned"] = nb_reads_aligned
            test_res["coverage"] = cov
            test_res["reflen"] = reflen

            return check_model_fit(test_res, wlen, verbose)

    except (ValueError, RuntimeError) as e:
        if verbose:
            print(f"Model fitting for {ref} failed")
            print(f"Model fitting error: {e}")
            # Both halves need the f prefix, otherwise "{reflen}" is
            # printed literally.
            print(f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov} "
                  f"- reflen: {reflen}\n")
        return False

    # No CtoT data: nothing to fit.
    return None
Пример #5
0
def test_damage(ref, bam, mode, wlen, show_al, process, verbose):
    """Prepare data and run LRtest to test for damage

    Args:
        ref (str): name of reference in alignment file; when None, the
            statistics are aggregated over every reference of the file and
            the result is reported under the name "reference"
        bam (str): bam file
        mode (str): opening mode of alignment file
        wlen (int): window length
        show_al (bool): Show alignment representations
        process (int): Number of process for parallelization
        verbose (bool): Run in verbose mode
    Returns:
        The value returned by ``check_model_fit`` for the LR test results,
        or ``False`` if a ValueError or RuntimeError is raised while
        preparing the data or fitting the models.
    """
    # Pre-bind the summary statistics: the verbose error report below reads
    # them, and the exception may be raised before they are computed.
    cov = nb_reads_aligned = reflen = None

    # The context manager closes the alignment file on every exit path.
    with pysam.AlignmentFile(bam, mode=mode, threads=process) as al_handle:
        try:
            if ref is None:
                all_references = al_handle.references
                # Mean of the per-contig average coverages.
                cov = np.mean([
                    avg_coverage_contig(
                        al_handle.count_coverage(contig=contig))
                    for contig in all_references
                ])
                nb_reads_aligned = np.sum([
                    al_handle.count(contig=contig)
                    for contig in all_references
                ])
                reflen = np.sum([
                    al_handle.get_reference_length(contig)
                    for contig in all_references
                ])
                refname = "reference"
            else:
                cov = avg_coverage_contig(
                    al_handle.count_coverage(contig=ref))
                nb_reads_aligned = al_handle.count(contig=ref)
                reflen = al_handle.get_reference_length(ref)
                refname = ref

            al = al_to_damage(reference=ref, al_handle=al_handle, wlen=wlen)
            al.get_damage(show_al=show_al)
            (
                mut_count,
                conserved_count,
                CT_damage,
                GA_damage,
                all_damage,
            ) = al.compute_damage()

            model_A = models.damage_model()
            model_B = models.null_model()
            test_res = fit_models(
                ref=ref,
                model_A=model_A,
                model_B=model_B,
                damage=all_damage,
                mut_count=mut_count,
                conserved_count=conserved_count,
                verbose=verbose,
            )
            test_res["reference"] = refname
            test_res["nb_reads_aligned"] = nb_reads_aligned
            test_res["coverage"] = cov
            test_res["reflen"] = reflen

            # Export the CtoT value of each position of the window.
            test_res.update(
                {f"CtoT-{i}": CT_damage[i] for i in range(wlen)})
            # NOTE(review): GA_damage is computed but never exported; the
            # original code merged an always-empty GtoA dict here. Confirm
            # whether "GtoA-{i}" keys are expected in the result.

            return check_model_fit(test_res, wlen, verbose)

        except (ValueError, RuntimeError) as e:
            if verbose:
                print(f"Model fitting for {ref} failed")
                print(f"Model fitting error: {e}")
                # Both halves need the f prefix, otherwise "{reflen}" is
                # printed literally.
                print(f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov}"
                      f" - reflen: {reflen}\n")
            return False
Пример #6
0
def damageplot(damage_dict, outdir):
    """Draw pydamage plots

    Plots the null and damage model curves with their 2-sigma bands, the
    CtoT and GtoA values per position, and an inset of the residuals, then
    saves the figure as ``<outdir>/<reference>.png``.

    Args:
        damage_dict(dict): pydamage result dictionary. The keys read are
            "wlen", "qlen", the integers 0..wlen-1, "CtoT-<i>" and
            "GtoA-<i>" for i in 0..qlen-1, "p0", "p0_stdev", "p", "pmin",
            "pmin_stdev", "pmax", "pmax_stdev", "reference", "pvalue",
            "coverage", "residuals" and "RMSE".
        outdir(str): Pydamage result directory
    """
    x = np.array(range(damage_dict["wlen"]))
    qlen = np.array(range(damage_dict["qlen"]))
    c2t = np.array([damage_dict[f"CtoT-{i}"] for i in qlen])
    g2a = np.array([damage_dict[f"GtoA-{i}"] for i in qlen])
    p0 = damage_dict["p0"]
    p0_stdev = damage_dict["p0_stdev"]
    p = damage_dict["p"]
    pmin = damage_dict["pmin"]
    pmin_stdev = damage_dict["pmin_stdev"]
    pmax = damage_dict["pmax"]
    pmax_stdev = damage_dict["pmax_stdev"]
    contig = damage_dict["reference"]
    pvalue = damage_dict["pvalue"]
    coverage = damage_dict["coverage"]
    residuals = damage_dict["residuals"]
    rmse = damage_dict["RMSE"]

    if pvalue < 0.001:
        rpval = "<0.001"
    else:
        rpval = f"={round(pvalue,3)}"

    # Null model curve and its +/- 2 sigma band; parameters are kept
    # inside the model bounds and the curves clipped to [0, 1].
    m_null = null_model()
    p0_low = max(m_null.bounds[0][0], p0 - 2 * p0_stdev)
    p0_high = min(m_null.bounds[1][0], p0 + 2 * p0_stdev)
    y_unif = m_null.fit(x, p0)
    y_unif_low = np.maximum(np.zeros(y_unif.shape[0]), m_null.fit(x, p0_low))
    y_unif_high = np.minimum(np.ones(y_unif.shape[0]), m_null.fit(x, p0_high))

    # Damage model curve and its +/- 2 sigma band, built the same way from
    # pmin and pmax.
    geom = damage_model()
    geom_pmin_low = max(geom.bounds[0][1], pmin - 2 * pmin_stdev)
    geom_pmin_high = min(geom.bounds[1][1], pmin + 2 * pmin_stdev)
    geom_pmax_low = max(geom.bounds[0][2], pmax - 2 * pmax_stdev)
    geom_pmax_high = min(geom.bounds[1][2], pmax + 2 * pmax_stdev)

    y_geom = geom.fit(x, p, pmin, pmax)
    y_geom_low = np.maximum(np.zeros(y_geom.shape[0]),
                            geom.fit(x, p, geom_pmin_low, geom_pmax_low))
    y_geom_high = np.minimum(np.ones(y_geom.shape[0]),
                             geom.fit(x, p, geom_pmin_high, geom_pmax_high))

    # No pyplot call is made before the figure exists: it would act on an
    # unrelated (or implicitly created) figure instead of this one.
    fig, ax = plt.subplots()
    try:
        ax.plot(x,
                y_unif,
                linewidth=2.5,
                color="DarkOliveGreen",
                alpha=0.8,
                label="Null model")

        ax.fill_between(
            x,
            y_unif_low,
            y_unif_high,
            color="DarkOliveGreen",
            alpha=0.1,
            label="Null Model CI (2 sigma)",
        )

        ax.plot(x,
                y_geom,
                linewidth=2.5,
                color="#D7880F",
                alpha=0.8,
                label="Damage model")

        ax.fill_between(
            x,
            y_geom_low,
            y_geom_high,
            color="#D7880F",
            alpha=0.1,
            label="Damage Model CI (2 sigma)",
        )

        ax.plot(qlen,
                g2a,
                color="#236cf5",
                alpha=0.1,
                label="G to A transitions")

        ax.plot(qlen,
                c2t,
                color="#bd0d45",
                alpha=0.2,
                label="C to T transitions")

        ax.set_xlabel("Base from 5'", fontsize=10)
        ax.set_ylabel("Substitution frequency", fontsize=10)
        ax.xaxis.set_ticks(np.arange(qlen[0], qlen[-1], 5))
        ax.set_xticklabels(ax.get_xticks(), rotation=45, fontsize=6)
        ax.set_title(f"coverage: {round(coverage,2)} - pvalue{rpval}",
                     fontsize=8)
        ax.legend(fontsize=8)

        # Inset axes holding the residuals diagnostic.
        left, bottom, width, height = [0.65, 0.3, 0.2, 0.2]
        ax2 = fig.add_axes([left, bottom, width, height])

        # The residuals are drawn against the window positions.
        fitted = x
        smoothed = lowess(residuals, fitted)
        ax2.scatter(fitted, residuals, marker=".", color="black")
        ax2.plot(smoothed[:, 0], smoothed[:, 1], color="r")
        ax2.plot([min(fitted), max(fitted)], [0, 0],
                 color="k",
                 linestyle=":",
                 alpha=0.3)
        ax2.set_ylabel("Residuals", fontsize=6)
        ax2.set_xlabel("Fitted Values", fontsize=6)
        ax2.set_title(f"Residuals vs. Fitted\nRMSE={round(rmse, 3)}",
                      fontsize=6)
        ax2.xaxis.set_ticks(np.arange(fitted[0], fitted[-1], 5))
        ax2.set_xticklabels([int(i) for i in ax2.get_xticks()],
                            fontsize=6,
                            rotation=45)
        ax2.set_yticklabels([round(i, 3) for i in ax2.get_yticks()],
                            fontsize=6)

        fig.suptitle(contig, fontsize=12, y=0.95)

        fig.savefig(f"{outdir}/{contig}.png", dpi=200)
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # figures do not accumulate when one plot is drawn per reference.
        plt.close(fig)
Пример #7
0
        'pmin': 0.03637474931290336,
        'pmax': 0.4211432052501663
    }
    for k in o:
        assert round(o[k], 3) == round(target[k], 3)


def test_null_model_fit(generate_data):
    """Check ``null_model.fit`` against reference values, element-wise.

    Args:
        generate_data: fixture; its first element is passed as ``x`` to
            the model.
    """
    u = models.null_model()
    expected = np.array([
        0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
        0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1
    ])
    observed = u.fit(x=generate_data[0], p0=0.1)
    # `observed.all() == expected.all()` would only compare two booleans
    # (True == True for any all-non-zero arrays), so it could never fail.
    # Compare the values themselves, with a floating point tolerance.
    assert np.allclose(observed, expected)


def test_null_model_optim(generate_data):
    """Fit the null model with ``optim`` and check the optimised ``p0``.

    Args:
        generate_data: fixture; elements 1 and 2 are used as ``xdata`` and
            ``ydata`` respectively.
    """
    u = models.null_model()
    # optim returns a pair; only the first element (the optimised
    # parameters) is checked here.
    o, _ = optim(function=u.fit,
                 parameters=u.kwds,
                 xdata=generate_data[1],
                 ydata=generate_data[2],
                 bounds=u.bounds,
                 loss='linear')
    target = {'p0': 0.1000000000000005}
    # Exactly the expected parameters, no more and no fewer.
    assert set(o) == set(target)
    # The last bits of an optimiser's output are not a stable contract, so
    # compare after rounding to three decimals instead of requiring
    # bit-for-bit float equality.
    for k in o:
        assert round(o[k], 3) == round(target[k], 3)


if __name__ == "__main__":
    # Ad-hoc run when this file is executed directly: build the data set
    # and fit the damage model to it.
    # NOTE(review): the tests receive generate_data as an argument;
    # calling it directly presumably requires it to be a plain function
    # rather than a pytest fixture - confirm.
    data, xdata, ydata = generate_data()
    g = models.damage_model()
    # Unlike the tests, no bounds or loss arguments are passed to optim
    # here, and the result is only bound to `o`, not inspected.
    o = optim(function=g.fit, parameters=g.kwds, xdata=xdata, ydata=ydata)