def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare the GMS2 and toolp RBS motif models of every genome in a list.

    For each genome read from ``args.pf_genome_list``:

    * a two-panel information logo (left panel titled "GeneMarkS-2", right
      panel titled "StartLink+") is saved under ``<pd-work>/figures``;
    * the relative entropy of both motifs against the GMS2 non-coding model
      and the GC percentage of the genome sequence are computed;
    * one record (non-verified mode) or two records, one per tool (verified
      mode), are appended to the run summary.

    After the loop the summary is written to CSV and plotted. Verified mode
    draws line plots of "Error" and "RE" per genome; otherwise scatter plots
    are drawn twice, first for all genomes and then only for genomes whose
    "Accuracy" value is below 2.

    :param env: environment mapping; the keys "pd-work" and "pd-data" are read.
    :param args: parsed arguments; ``pf_genome_list`` and ``verified`` are read.
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

        # Both panels show information content derived from the probability
        # matrices; only the source model differs.
        panels = (
            (df_gms2, axes[0], "GeneMarkS-2"),
            (df_toolp, axes[1], "StartLink+"),
        )
        for df_pwm, ax, panel_title in panels:
            info_mat = lm.transform_matrix(df_pwm, from_type="probability", to_type="information")
            lm.Logo(info_mat, color_scheme="classic", ax=ax)
            ax.set_ylim(0, 2)
            ax.set_title(panel_title)

        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        # One figure is created per genome; close it so open figures do not
        # accumulate over long genome lists.
        plt.close(fig)

        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)

        # NOTE(review): the path component between "pd-data" and
        # "sequence.fasta" was missing in the source (``,,``); ``gi.name`` is
        # a reconstruction -- confirm.
        gc = 100 * compute_gc_from_file(os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if not args.verified:
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp,
            })
        else:
            # verified
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, group=group)
            # NOTE(review): the argument of ``fix_names`` was missing in the
            # source; ``gi.name`` is a reconstruction -- confirm.
            genome_label = fix_names(gi.name)
            list_run_info.append({
                "Genome": genome_label,
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc,
            })
            list_run_info.append({
                "Genome": genome_label,
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc,
            })

            print(list_run_info[-2:])

    import sbsp_viz.sns as sns

    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool", figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"]),
            xlabel="Genome",
            ylabel="Error"))

        sns.lineplot(df, "Genome", "RE", hue="Tool", figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"]),
            xlabel="Genome",
            ylabel="Relative entropy",
        ))
    else:
        # The order of the next_name() calls is kept exactly as before so the
        # generated output files keep their sequence.
        df = pd.DataFrame(list_run_info)
        sns.scatterplot(df, "GC", "Accuracy", figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"]),
            xlabel="GC",
            ylabel="Percentage of different 5' ends",
            ylim=[0, 10],
        ))
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"])
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        # Same plots again, restricted to genomes with an "Accuracy" value
        # below 2.
        df = pd.DataFrame(list_run_info)
        df = df[df["Accuracy"] < 2].copy()
        sns.scatterplot(df, "GC", "Accuracy", figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"]),
            xlabel="GC",
            ylabel="Percentage of different 5' ends",
            ylim=[0, 10],
        ))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"])
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        df.to_csv(next_name(env["pd-work"], ext="csv"))
def plot_letter_over_position(env, df, col, title=""):
    # type: (Environment, pd.DataFrame, str, str) -> None
    """Draw a multi-panel summary figure for the motif column ``col``.

    The 4x2 grid contains:

    * rows 0-1: one panel per letter (at most four), showing box plots of the
      letter's probability at each motif position with the per-position mean
      drawn as a line;
    * row 2, left: bar plot of the percentage of entries per shift value;
    * row 2, right: normalised position distribution, one line per shift;
    * row 3, left: information logo built from the alignment in
      ``collect["msa_t"]``;
    * row 3, right: the text returned by ``print_reduced_msa``.

    The figure is saved to ``next_name(env["pd-work"])``.

    :param env: environment mapping; the key "pd-work" is read.
    :param df: data frame holding column ``col`` and the matching
        ``*_POS_DISTR`` column (``col`` with "_MAT" replaced).
    :param col: name of the motif column; each cell is expected to map a
        letter to a list of per-position values.
    :param title: text inserted in the figure's super-title.
    :raises ValueError: if a collected probability lies outside [0, 1].
    """
    # ``collect`` is passed in empty and read back below ("msa_t"), so the
    # helper is presumably what fills it.
    collect = dict()
    array, update_shifts = create_numpy_for_column_with_extended_motif(env, df, col, collect)

    # NOTE(review): the accessor was missing in the source
    # (``example =[df.index[0], col]``); ``df.at`` is a reconstruction.
    example = df.at[df.index[0], col]  # type: Dict[str, List[float]]
    letters = example.keys()
    letter_to_idx = {x: x_pos for x_pos, x in enumerate(sorted(letters))}

    fig = plt.figure(figsize=(10, 12))
    shape = (4, 2)

    ax1 = plt.subplot2grid(shape, (0, 0))
    ax2 = plt.subplot2grid(shape, (0, 1))
    ax3 = plt.subplot2grid(shape, (1, 0))
    ax4 = plt.subplot2grid(shape, (1, 1))
    ax_logo = plt.subplot2grid(shape, (3, 0))
    ax_counts = plt.subplot2grid(shape, (2, 0))
    ax_pos_dist = plt.subplot2grid(shape, (2, 1))
    ax_text = plt.subplot2grid(shape, (3, 1))

    axes = [ax1, ax2, ax3, ax4]

    # --- per-letter probability by position -------------------------------
    ylim = [-0.1, 1.1]
    for l, ax in zip(letters, axes):
        letter_idx = letter_to_idx[l]
        positions = list()
        probabilities = list()

        for w_pos in range(array.shape[1]):
            for index, shift in enumerate(update_shifts):
                # Keep only positions inside the 6-wide window that starts at
                # this entry's shift.
                if w_pos < shift or w_pos >= shift + 6:
                    continue

                prob = array[index, w_pos, letter_idx]
                if prob < 0 or prob > 1:
                    raise ValueError("Something's up")

                positions.append(w_pos)
                probabilities.append(prob)

        ax.set_title(f"{l}")

        df_letter = pd.DataFrame({"Position": positions, "Probability": probabilities})
        df_letter.sort_values("Position", inplace=True)
        df_mean = df_letter.groupby("Position", as_index=False).mean()

        # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
        # positionally.
        seaborn.boxplot(x="Position", y="Probability", data=df_letter, ax=ax, color="red", fliersize=0)
        seaborn.lineplot(x=df_mean["Position"], y=df_mean["Probability"], ax=ax, color="blue")
        ax.set_ylim(ylim)

    # --- logo --------------------------------------------------------------
    ax = ax_logo
    msa_t = collect["msa_t"]
    # NOTE(review): ``_data`` is a private attribute of the sequence object;
    # left unchanged because the sequence type is not visible here.
    seqs = [x.seq._data for x in msa_t.list_alignment_sequences]
    counts_mat = lm.alignment_to_matrix(sequences=seqs, to_type='counts', characters_to_ignore='.-X')

    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat, from_type='counts', to_type='information')
    lm.Logo(info_mat, ax=ax, color_scheme="classic")
    ax.set_ylim([0, 2])

    # --- distribution of shifts ---------------------------------------------
    ax = ax_counts
    counter = Counter(update_shifts)
    total = sum(counter.values())
    # Shifts 0..3 that never occur are still shown, with a zero bar.
    to_add = sorted(set(range(4)).difference(counter.keys()))
    normalized = np.array(
        [[x, 100 * counter[x] / total] for x in counter] + [[x, 0] for x in to_add]
    )
    seaborn.barplot(x=normalized[:, 0], y=normalized[:, 1], ax=ax, color="blue")
    ax.set_ylim([0, 100])
    ax.set_ylabel("Probability")
    ax.set_xlabel("Shift in consensus")

    # --- position distribution per shift -------------------------------------
    col_pos = col.replace("_MAT", "_POS_DISTR")
    ax = ax_pos_dist

    shift_to_pos_dist = get_position_distributions_by_shift(df, col_pos, update_shifts)
    for s in sorted(shift_to_pos_dist.keys()):
        # Average the value at each position over all distributions of this
        # shift, then normalise so the averages sum to one.
        values = dict()
        for pos_dist in shift_to_pos_dist[s]:
            try:
                keys = pos_dist.keys()
            except AttributeError:
                # Entry is not a mapping (e.g. a missing value); skip it.
                continue
            for i in keys:
                values.setdefault(i, list()).append(pos_dist[i])

        for i in values:
            values[i] = np.mean(values[i])

        total = sum(values.values())
        for i in values:
            values[i] /= total

        x = sorted(values.keys())
        y = [values[a] for a in x]

        seaborn.lineplot(x=x, y=y, label=s, ax=ax)

    ax.legend()

    # --- text panel ------------------------------------------------------------
    ax = ax_text
    from matplotlib.font_manager import FontProperties
    fp = FontProperties()
    fp.set_family("monospace")

    msa_text = print_reduced_msa(msa_t, True, n=10)
    print("here")
    print(msa_text)
    ax.text(0, 0, msa_text,
            horizontalalignment='left',
            verticalalignment='center',
            fontproperties=fp)

    ax.set_xlim([-0.2, 0.4])
    ax.set_ylim([-0.4, 0.4])
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    plt.suptitle("Gc range: {}. Num Data points: {}".format(
        title, msa_t.number_of_sequences()))

    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.savefig(next_name(env["pd-work"]))
    # The figure has been written to disk; release it so repeated calls do
    # not accumulate open figures.
    plt.close(fig)
i = i + 1 fin.close() fout.close() crp_matrix_df = pd.read_csv(main_file_temp2, delim_whitespace=True, index_col=0) # read csv and convert to dataframe crp_matrix_df.head() #### Delete the temporary files #### os.remove(main_file_temp) os.remove(main_file_temp2) prob_mat = logomaker.transform_matrix(crp_matrix_df, from_type="probability", to_type="information") logo = logomaker.Logo( prob_mat, fade_probabilities=True, ## will fade the smaller probabilities stack_order="small_on_top", ) final_png = os.path.join(output_location, filename) # location for saving the file final_png = final_png + ".png" axes = plt.gca() # get current axes of the plots axes.set_ylim([0, 2]) # set the y-axis limits from 0 to 2 #### Hide the top and the right axes of the plot ####
def weblogologomaker(request):
    """Build a sequence logo for the posted sequences and return it.

    Only POST requests are served; anything else raises ``Http404``.

    The request payload is read for:

    * ``seqs``   -- the sequences, forwarded to ``weblogo_aux``;
    * ``output`` -- "txt" to receive the WebLogo text table, anything else
      (default "png") to receive the rendered image;
    * ``os``     -- forwarded to ``weblogo_aux`` (default "linux").

    The function reads "aligned.fasta" (presumably written by
    ``weblogo_aux`` -- confirm) and writes "weblogo.txt" and, in image mode,
    "weblogo.png" in the current working directory.

    :param request: the incoming HTTP request.
    :return: ``HttpResponse`` with content type "text/plain" holding either
        the WebLogo text table or the base64-encoded PNG.
    :raises Http404: if the request method is not POST.
    """
    if request.method != "POST":
        raise Http404

    # NOTE(review): the right-hand side was missing in the source
    # (``data = seqs = data['seqs']``); ``request.data`` is a reconstruction
    # -- confirm against how this view is registered.
    data = request.data
    seqs = data['seqs']
    type_output = data.get('output', "png")
    type_os = data.get('os', "linux")

    # Return value is not used; the call is kept for its effects.
    weblogo_aux(seqs, type_os)

    out_file = "aligned.fasta"
    with open(out_file, "r") as aligned:
        seqs = read_seq_data(aligned, alphabet="ACDEFGHIKLMNPQRSTVWY-")
        logodata = LogoData.from_seqs(seqs)

    logooptions = LogoOptions()
    logooptions.title = "VFP WEBSERVER"
    logoformat = LogoFormat(logodata, logooptions)
    weblogo_txt = txt_formatter(logodata, logoformat)

    # The original code stripped the b'...' wrapper from str(bytes) and
    # un-escaped \n and \t by hand; decoding the bytes gives the same text
    # without the string surgery.
    if isinstance(weblogo_txt, bytes):
        data_weblogo = weblogo_txt.decode("utf-8")
    else:
        data_weblogo = str(weblogo_txt)

    weblogo_file = "weblogo.txt"
    with open(weblogo_file, "w") as handle:
        handle.write(data_weblogo)

    weblogoDf = pd.read_csv(weblogo_file, skiprows=7, sep='\t')
    weblogoDf = weblogoDf[:-1]  # drop the last line of the table
    weblogoDf.columns = [name.replace(' ', '') for name in weblogoDf.columns]

    # Per-letter columns: everything except the first and the last four.
    weblogo_entropyes = weblogoDf.loc[:, weblogoDf.columns[1:len(weblogoDf.columns) - 4]]
    entropies = list(np.log2(20) - weblogoDf.loc[:, 'Entropy'])
    weblogo_entropyes = weblogo_entropyes.mul(entropies, axis=0)
    family_weblogo = weblogo_entropyes.drop(['-'], axis=1)

    if type_output == "txt":
        with open(weblogo_file) as handle:
            data = handle.read()
        return HttpResponse(data, content_type="text/plain")

    data = logomaker.transform_matrix(family_weblogo)

    # create figure
    height_per_row = 2
    width_per_col = 1.5
    line_size = 25  # alignment positions drawn per logo row

    # Ceiling division: one logo row per started group of ``line_size``
    # positions, and at least one row.
    num_rows = max(1, -(-data.shape[0] // line_size))
    fig = plt.figure(figsize=[width_per_col * line_size, height_per_row * num_rows])

    max_df = data.sum(axis=1).max()
    image_path = "weblogo.png"
    try:
        for row in range(num_rows):
            start = row * line_size
            # reindex() selects the rows of this logo line and pads a short
            # final line with zero rows, so every panel has the same width.
            chunk = data.reindex(range(start, start + line_size), fill_value=0)

            # set axes limits and label
            ax = plt.subplot2grid((num_rows, 1), (row, 0))
            ax.spines['right'].set_visible(False)
            ax.spines['top'].set_visible(False)
            ax.set_ylim(bottom=0, top=max_df)
            ax.set_ylabel('Bits')

            logomaker.Logo(chunk, ax=ax, color_scheme='NajafabadiEtAl2017')

        fig.savefig(image_path)
    finally:
        # A figure left open on every request would accumulate in a
        # long-running server process.
        plt.close(fig)

    # NOTE(review): the argument of b64encode was garbled in the source
    # (``base64.b64encode('utf-8')``); reading the saved image and decoding
    # the result as UTF-8 is a reconstruction.
    with open(image_path, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode('utf-8')

    return HttpResponse(image_data, content_type="text/plain")
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Plot one information logo per GC bin on a shared grid.

    Loads the data frame stored at ``args.pf_data``, filters it by genome
    type (and, for the "PROMOTER" motif type, to rows with GC >= 40), asks
    ``get_models_by_gc`` for one model per GC value in ``range(20, 70, 2)``
    and draws each non-``None`` model as a logo titled with its GC value.
    The figure is saved to ``next_name(env["pd-work"])``.

    :param env: environment mapping; the key "pd-work" is read.
    :param args: parsed arguments; ``pf_data`` and ``motif_type`` are read
        (plus the genome-type filter, see the note below).
    :raises ValueError: if ``get_models_by_gc`` returns no models.
    """
    df_bac = load_obj(args.pf_data).reset_index()  # type: pd.DataFrame
    # NOTE(review): the argument of ``isin`` was missing in the source
    # (``.isin(]``); ``args.group`` is a reconstruction -- confirm the
    # attribute name against the argument parser.
    df_bac = df_bac[df_bac["GENOME_TYPE"].isin(args.group)]

    min_gc = 20
    max_gc = 70

    if args.motif_type == "PROMOTER":
        df_bac = df_bac[df_bac["GC"] >= 40].copy()

    gc_values = np.arange(min_gc, max_gc, 2)
    models = get_models_by_gc(df_bac, gc_values, motif_type=args.motif_type)

    num_plots = len(models)
    if num_plots == 0:
        # The grid computation below would otherwise divide by zero.
        raise ValueError("No models to plot")

    # Near-square grid with room for every model.
    num_rows = int(math.sqrt(num_plots))
    num_cols = math.ceil(num_plots / float(num_rows))

    # squeeze=False keeps ``axes`` two-dimensional even for a single row or
    # a single panel.
    fig, axes = plt.subplots(num_rows, num_cols, sharex="all", sharey="all",
                             figsize=(12, 10), squeeze=False)

    # Panels are filled row by row; model k goes to panel k, and a missing
    # (None) model leaves its panel empty.
    for model_index, (ax, model) in enumerate(zip(axes.ravel(), models)):
        if model is None:
            continue

        # model[0] is passed as the probability matrix and model[1] as its
        # background.
        newmod = lm.transform_matrix(model[0], to_type="information",
                                     from_type="probability",
                                     background=model[1])

        lm.Logo(newmod, ax=ax)
        ax.set_ylim(0, 2)
        ax.set_title(int(gc_values[model_index]))

    plt.tight_layout()
    plt.savefig(next_name(env["pd-work"]))