Exemplo n.º 1
0
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GeneMarkS-2 and StartLink+ RBS motif models genome by genome.

    For every genome in the list read from ``args.pf_genome_list`` this:

    * fetches the two models via ``compare_gms2_and_toolp_motifs_for_gi``,
    * draws both ``RBS_MAT`` motifs as side-by-side information logos and
      saves the figure under ``<pd-work>/figures``,
    * computes the relative entropy of each motif against the GeneMarkS-2
      non-coding model (``NON_MAT``) and the genome GC percentage,
    * appends one summary row (``args.verified`` false) or two rows, one per
      tool (``args.verified`` true).

    After the loop the rows are written to CSV and plotted; which plots are
    produced depends on ``args.verified``.

    :param env: environment mapping; ``env["pd-work"]`` and ``env["pd-data"]``
        are read.
    :param args: parsed arguments; ``pf_genome_list`` and ``verified`` are read.
    """

    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        # second dash-separated token of GENOME_TYPE, upper-cased
        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

        # Both panels use the same probability -> information transform;
        # only the source model and the title differ.
        panels = (
            (axes[0], df_gms2, "GeneMarkS-2"),
            (axes[1], df_toolp, "StartLink+"),
        )
        for ax, df_pwm, panel_title in panels:
            info_mat = lm.transform_matrix(df_pwm, from_type="probability", to_type="information")
            lm.Logo(info_mat, color_scheme="classic", ax=ax)
            ax.set_ylim(0, 2)
            ax.set_title(panel_title)

        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()
        # One figure is created per genome; close it so figures do not pile
        # up in memory over a long genome list.
        plt.close(fig)

        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if not args.verified:
            # NOTE(review): the value stored under "Accuracy" is 100 minus the
            # helper's result; it is later labelled "Percentage of different
            # 5' ends" and printed as "Average Error" -- confirm which quantity
            # the helper actually returns.
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })
        else:
            # verified: one row per tool, indexed 0 (GMS2) and 1 (GMS2 with SL)
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, group=group)
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
            })
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
            })

            print(list_run_info[-2:])

    import sbsp_viz.sns as sns
    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool", figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"]),
            xlabel="Genome",
            ylabel="Error"))

        sns.lineplot(df, "Genome", "RE", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(env["pd-work"]),
                         xlabel="Genome",
                         ylabel="Relative entropy",
                     ))

    else:
        # First pass: all genomes.
        df = pd.DataFrame(list_run_info)
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))

        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"])
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        # Second pass: same plots restricted to rows with "Accuracy" below 2.
        df = pd.DataFrame(list_run_info)
        df = df[df["Accuracy"] < 2].copy()
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"])
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        df.to_csv(next_name(env["pd-work"], ext="csv"))
def plot_letter_over_position(env, df, col, title=""):
    # type: (Environment, pd.DataFrame, str, str) -> None
    """Draw an eight-panel summary figure for the motifs stored in ``df[col]``.

    Panels:

    * four per-letter panels: box plot of the letter's probability at each
      motif position, with the per-position mean overlaid as a line;
    * an information logo built from the alignment returned in
      ``collect["msa_t"]``;
    * a bar chart with the percentage of motifs at each shift;
    * position distributions (from the ``*_POS_DISTR`` column matching
      ``col``), averaged and normalised per shift;
    * a monospace text rendering of the reduced alignment.

    The figure is saved under ``env["pd-work"]`` and shown.

    :param env: environment mapping; ``env["pd-work"]`` is read.
    :param df: data frame with a ``"GC"`` column, the motif column ``col`` and
        the corresponding position-distribution column.
    :param col: name of the motif column; ``"_MAT"`` in it is replaced by
        ``"_POS_DISTR"`` to find the position-distribution column.
    :param title: text inserted into the figure's super-title.
    :raises ValueError: if a collected probability lies outside ``[0, 1]``.
    """

    collect = dict()
    array, update_shifts = create_numpy_for_column_with_extended_motif(
        env, df, col, collect)
    df_original = df
    binned_arrays = [{
        "GC": df["GC"],
        "motifs": array,
        "shifts": update_shifts
    }]

    example = df.at[df.index[0], col]  # type: Dict[str, List[float]]

    letters = example.keys()
    # the last axis of ``array`` is indexed by letters in sorted order
    letter_to_idx = {x: x_pos for x_pos, x in enumerate(sorted(letters))}

    fig = plt.figure(figsize=(10, 12))
    shape = (4, 2)

    ax1 = plt.subplot2grid(shape, (0, 0))
    ax2 = plt.subplot2grid(shape, (0, 1))
    ax3 = plt.subplot2grid(shape, (1, 0))
    ax4 = plt.subplot2grid(shape, (1, 1))
    ax_logo = plt.subplot2grid(shape, (3, 0))
    ax_counts = plt.subplot2grid(shape, (2, 0))
    ax_pos_dist = plt.subplot2grid(shape, (2, 1))
    ax_text = plt.subplot2grid(shape, (3, 1))

    axes = [ax1, ax2, ax3, ax4]

    # One panel per letter; zip stops after four letters (four axes).
    ylim = [-0.1, 1.1]
    for l, ax in zip(letters, axes):
        l_idx = letter_to_idx[l]
        positions = list()
        probs = list()

        for w_pos in range(array.shape[1]):
            for ba in binned_arrays:
                arr = ba["motifs"]
                shifts = ba["shifts"]

                for index, shift in enumerate(shifts):
                    # Only keep positions inside this motif's shifted window.
                    # NOTE(review): 6 looks like the un-extended motif
                    # width -- confirm.
                    if w_pos < shift or w_pos >= shift + 6:
                        continue

                    prob = arr[index, w_pos, l_idx]
                    if prob < 0 or prob > 1:
                        raise ValueError("Something's up")

                    positions.append(w_pos)
                    probs.append(prob)

        ax.set_title(f"{l}")

        # Separate name so the ``df`` parameter is not shadowed.
        df_letter = pd.DataFrame({"Position": positions, "Probability": probs})
        df_letter.sort_values("Position", inplace=True)

        df_mean = df_letter.groupby("Position", as_index=False).mean()
        # x/y are passed by keyword: recent seaborn releases no longer accept
        # them positionally.
        seaborn.boxplot(x="Position",
                        y="Probability",
                        data=df_letter,
                        ax=ax,
                        color="red",
                        fliersize=0)
        seaborn.lineplot(x=df_mean["Position"],
                         y=df_mean["Probability"],
                         ax=ax,
                         color="blue")
        ax.set_ylim(ylim)

    # add logo
    ax = ax_logo
    msa_t = collect["msa_t"]
    # NOTE(review): reads the private ``_data`` attribute of each sequence;
    # confirm it is still a ``str`` with the installed sequence library.
    seqs = [x.seq._data for x in msa_t.list_alignment_sequences]
    counts_mat = lm.alignment_to_matrix(sequences=seqs,
                                        to_type='counts',
                                        characters_to_ignore='.-X')

    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat,
                                   from_type='counts',
                                   to_type='information')

    lm.Logo(info_mat, ax=ax, color_scheme="classic")
    ax.set_ylim([0, 2])

    # add bar chart of shift percentages
    ax = ax_counts
    counter = Counter(update_shifts)
    total = sum(counter.values())
    # shifts 0..3 that never occur still get an explicit zero-height bar
    to_add = sorted(set(range(4)).difference(counter.keys()))
    normalized = [[x, 100 * counter[x] / total]
                  for x in counter] + [[x, 0] for x in to_add]
    normalized = np.array(normalized)
    seaborn.barplot(x=normalized[:, 0], y=normalized[:, 1], ax=ax, color="blue")
    ax.set_ylim([0, 100])
    ax.set_ylabel("Probability")
    ax.set_xlabel("Shift in consensus")

    ### Plot position distribution
    col_pos = col.replace("_MAT", "_POS_DISTR")
    ax = ax_pos_dist
    shift_to_pos_dist = get_position_distributions_by_shift(
        df_original, col_pos, update_shifts)
    for s in sorted(shift_to_pos_dist.keys()):
        list_pos_dist = shift_to_pos_dist[s]

        # collect, per position key, the values seen across all distributions
        values = dict()
        for pos_dist in list_pos_dist:
            try:
                for i in pos_dist.keys():
                    values.setdefault(i, list()).append(pos_dist[i])
            except AttributeError:
                # entry is not a mapping (no ``keys``); skip it
                continue

        # average, then normalise so the averaged values sum to 1
        for i in values.keys():
            values[i] = np.mean(values[i])

        total = sum(values.values())
        for i in values.keys():
            values[i] /= total

        x = sorted(values.keys())
        y = [values[a] for a in x]

        seaborn.lineplot(x=x, y=y, label=s, ax=ax)

    ax.legend()

    # TEXT
    ax = ax_text
    from matplotlib.font_manager import FontProperties
    fp = FontProperties()
    fp.set_family("monospace")
    # Evaluate once so the console output and the figure show the same text.
    msa_text = print_reduced_msa(msa_t, True, n=10)
    print(msa_text)
    ax.text(0,
            0,
            msa_text,
            horizontalalignment='left',
            verticalalignment='center',
            fontproperties=fp)
    ax.set_xlim([-0.2, 0.4])
    ax.set_ylim([-0.4, 0.4])
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    plt.suptitle("Gc range: {}. Num Data points: {}".format(
        title, msa_t.number_of_sequences()))
    plt.tight_layout()
    # leave room at the top for the super-title
    plt.subplots_adjust(top=0.9)

    plt.savefig(next_name(env["pd-work"]))
    plt.show()
    # release the figure once it has been saved and shown
    plt.close(fig)
Exemplo n.º 3
0
    i = i + 1

# Close the input/output handles used by the preceding conversion step.
fin.close()
fout.close()

# Load the whitespace-separated matrix; the first column becomes the index.
# ``sep=r"\s+"`` is the documented replacement for the deprecated
# ``delim_whitespace=True`` and parses the file identically.
crp_matrix_df = pd.read_csv(main_file_temp2,
                            sep=r"\s+",
                            index_col=0)
crp_matrix_df.head()  # only displays something in an interactive session

#### Delete the temporary files ####
os.remove(main_file_temp)
os.remove(main_file_temp2)

# Convert the probability matrix to information content.
# NOTE: despite its name, ``prob_mat`` holds the *information* matrix.
prob_mat = logomaker.transform_matrix(crp_matrix_df,
                                      from_type="probability",
                                      to_type="information")
logo = logomaker.Logo(
    prob_mat,
    fade_probabilities=True,  ## will fade the smaller probabilities
    stack_order="small_on_top",
)

# Destination of the rendered logo: <output_location>/<filename>.png
final_png = os.path.join(output_location, filename) + ".png"

axes = plt.gca()  # get current axes of the plots
axes.set_ylim([0, 2])  # set the y-axis limits from 0 to 2

#### Hide the top and the right axes of the plot ####
Exemplo n.º 4
0
def weblogologomaker(request):
    """Build a sequence logo for the posted sequences and return it.

    Only ``POST`` is served; any other method raises ``Http404``.

    The request payload (``request.data``) is read for:

    * ``seqs``   -- required, passed to ``weblogo_aux``;
    * ``output`` -- optional, ``"txt"`` or anything else (default ``"png"``);
    * ``os``     -- optional, passed to ``weblogo_aux`` (default ``"linux"``).

    The WebLogo text table is written to ``weblogo.txt``. For ``output ==
    "txt"`` that text is returned as ``text/plain``. Otherwise the table is
    rendered with logomaker in rows of 25 positions, saved to ``weblogo.png``
    and returned base64-encoded as ``text/plain``.

    :raises Http404: if the request method is not ``POST``.
    """
    if request.method != "POST":
        raise Http404

    payload = request.data
    seqs = payload['seqs']
    type_output = payload.get('output', 'png')
    type_os = payload.get('os', 'linux')

    # Return value is unused. NOTE(review): presumably this call produces
    # "aligned.fasta", which is read below -- confirm.
    weblogo_aux(seqs, type_os)

    out_file = "aligned.fasta"
    with open(out_file, "r") as aligned:
        aligned_seqs = read_seq_data(aligned, alphabet="ACDEFGHIKLMNPQRSTVWY-")

    logodata = LogoData.from_seqs(aligned_seqs)
    logooptions = LogoOptions()
    logooptions.title = "VFP WEBSERVER"
    logoformat = LogoFormat(logodata, logooptions)
    weblogo_txt = txt_formatter(logodata, logoformat)

    # Decode the formatter output instead of slicing its ``repr``.
    if isinstance(weblogo_txt, bytes):
        data_weblogo = weblogo_txt.decode('utf-8')
    else:
        data_weblogo = str(weblogo_txt)

    weblogo_file = "weblogo.txt"
    with open(weblogo_file, "w") as weblogo:
        weblogo.write(data_weblogo)

    # Skip the 7 header lines of the text table and drop its trailing row.
    weblogo_df = pd.read_csv(weblogo_file, skiprows=7, sep='\t')
    weblogo_df = weblogo_df[:-1]
    weblogo_df.columns = [name.replace(' ', '') for name in weblogo_df.columns]

    # Symbol columns: everything except the first column and the last four.
    symbol_columns = weblogo_df.columns[1:len(weblogo_df.columns) - 4]
    weblogo_entropyes = weblogo_df.loc[:, symbol_columns]

    # Scale every row by (log2(20) - Entropy) of that row.
    entropies = list(np.log2(20) - weblogo_df.loc[:, 'Entropy'])
    weblogo_entropyes = weblogo_entropyes.mul(entropies, axis=0)

    # The gap symbol is not drawn.
    family_weblogo = weblogo_entropyes.drop(['-'], axis=1)

    if type_output == "txt":
        return HttpResponse(data_weblogo, content_type="text/plain")

    logo_matrix = logomaker.transform_matrix(family_weblogo)

    image_path = "weblogo.png"
    fig = _render_logo_rows(logo_matrix, line_size=25)
    try:
        fig.savefig(image_path)
    finally:
        # A server process would otherwise accumulate one open figure per
        # request.
        plt.close(fig)

    with open(image_path, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode('utf-8')

    return HttpResponse(image_data, content_type="text/plain")


def _render_logo_rows(matrix, line_size=25, height_per_row=2, width_per_col=1.5):
    """Draw ``matrix`` as stacked logo rows of ``line_size`` positions each.

    ``matrix`` is expected to be indexed ``0..n-1``. The last row is padded
    with all-zero positions up to ``line_size`` so that every row has the same
    width. Returns the matplotlib figure; the caller is responsible for
    saving and closing it.
    """
    n_positions = matrix.shape[0]
    # ceil(n_positions / line_size), but always at least one row
    num_rows = max(1, (n_positions + line_size - 1) // line_size)

    fig = plt.figure(figsize=[width_per_col * line_size,
                              height_per_row * num_rows])

    # common y-limit: the tallest stack in the whole matrix
    max_df = matrix.sum(axis=1).max()

    for row in range(num_rows):
        start = row * line_size
        stop = start + line_size

        ax = plt.subplot2grid((num_rows, 1), (row, 0))
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.set_ylim(bottom=0, top=max_df)
        ax.set_ylabel('Bits')

        # ``reindex`` returns the existing positions and fills the ones past
        # the end of the matrix with zeros (only happens on the last row).
        chunk = matrix.reindex(range(start, stop), fill_value=0)

        logo = logomaker.Logo(chunk,
                              ax=ax,
                              color_scheme='NajafabadiEtAl2017', )
        logo.ax.set_xlim([start - 0.5, stop - 0.5])
        logo.ax.set_ylim([0, max_df])

    return fig
Exemplo n.º 5
0
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot one information logo per GC bin in a shared grid.

    Loads the data frame from ``args.pf_data``, keeps rows whose
    ``GENOME_TYPE`` is in ``args.group`` (and, for ``PROMOTER`` motifs, whose
    ``GC`` is at least 40), builds one model per GC value in
    ``range(20, 70, 2)`` via ``get_models_by_gc`` and draws each non-``None``
    model as a logo titled with its GC value. The figure is saved under
    ``env["pd-work"]`` and shown.

    :param env: environment mapping; ``env["pd-work"]`` is read.
    :param args: parsed arguments; ``pf_data``, ``group`` and ``motif_type``
        are read.
    """
    df_bac = load_obj(args.pf_data).reset_index()  # type: pd.DataFrame
    df_bac = df_bac[df_bac["GENOME_TYPE"].isin(args.group)]
    min_gc = 20
    max_gc = 70

    if args.motif_type == "PROMOTER":
        df_bac = df_bac[df_bac["GC"] >= 40].copy()

    gc_values = np.arange(min_gc, max_gc, 2)
    models = get_models_by_gc(df_bac, gc_values, motif_type=args.motif_type)

    num_plots = len(models)
    if num_plots == 0:
        # nothing to lay out; avoids dividing by a zero row count below
        print("No motif models available; nothing to plot.")
        return

    # near-square grid with at least ``num_plots`` cells
    num_rows = int(math.sqrt(num_plots))
    num_cols = math.ceil(num_plots / float(num_rows))

    # squeeze=False keeps ``axes`` two-dimensional even for a single row or
    # column, so it can always be flattened the same way.
    fig, axes = plt.subplots(num_rows,
                             num_cols,
                             sharex="all",
                             sharey="all",
                             figsize=(12, 10),
                             squeeze=False)

    # Row-major pairing: model k goes to grid cell k. A ``None`` model leaves
    # its cell empty.
    for model_index, (ax, model) in enumerate(zip(axes.ravel(), models)):
        if model is None:
            continue

        # model[0] is the probability matrix, model[1] the background passed
        # to the information transform
        newmod = lm.transform_matrix(model[0],
                                     to_type="information",
                                     from_type="probability",
                                     background=model[1])
        lm.Logo(newmod, ax=ax)

        ax.set_ylim(0, 2)
        ax.set_title(int(gc_values[model_index]))

    plt.tight_layout()
    plt.savefig(next_name(env["pd-work"]))
    plt.show()