def visualize(prim_str, pred_string, true_string, save_imgs=False, suffix_img=''):
    """Plot the predicted and true secondary structures of one RNA.

    Balances the predicted dot-bracket string, writes two temporary
    two-line files (sequence + structure), loads each with forgi and
    renders both structures with matplotlib.

    Args:
        prim_str: primary RNA sequence.
        pred_string: predicted dot-bracket string (balanced via balance_op_tmp).
        true_string: ground-truth dot-bracket string.
        save_imgs: when True, save both plots as JPEG files.
        suffix_img: suffix appended to the saved image filenames.
    """
    corr_string = balance_op_tmp(pred_string)
    print(f'pred: {pred_string}')
    print(f"true: {true_string}")
    print(f'corr: {corr_string}')
    # Bug fix: write plain '\n' rather than os.linesep — files opened in
    # text mode already translate newlines, so os.linesep would emit
    # '\r\r\n' on Windows. The dangling `f.close` (no call) is dropped;
    # the `with` block closes the file.
    with open('tmp_vis_pred.txt', 'w') as f:
        f.write(prim_str + '\n')
        f.write(corr_string + '\n')
    with open('tmp_vis_true.txt', 'w') as f:
        f.write(prim_str + '\n')
        f.write(true_string + '\n')
    plt.figure(figsize=(20, 20))
    plt.title('predicted')
    cg = forgi.load_rna('tmp_vis_pred.txt', allow_many=False)
    fvm.plot_rna(cg, text_kwargs={"fontweight": "black"}, lighten=0.7,
                 backbone_kwargs={"linewidth": 3})
    if save_imgs:
        plt.savefig(f'pred_{suffix_img}.jpg')
    plt.show()
    plt.figure(figsize=(20, 20))
    plt.title('original')
    cg = forgi.load_rna('tmp_vis_true.txt', allow_many=False)
    fvm.plot_rna(cg, text_kwargs={"fontweight": "black"}, lighten=0.7,
                 backbone_kwargs={"linewidth": 3})
    # Bug fix: the 'true' image was previously saved unconditionally while
    # the 'pred' image honored save_imgs; both now respect the flag.
    if save_imgs:
        plt.savefig(f'true_{suffix_img}.jpg')
    plt.show()
def draw_2nd_structure(srna, num_structures=3):
    """Fold and plot the secondary structures of the first rows of *srna*.

    For each of the first *num_structures* rows: folds the sequence, prints
    the result, and writes a .fx file (>name / sequence / structure) under
    ./resources/. A second pass loads each .fx file with forgi and plots it.

    Args:
        srna: DataFrame-like with "name" and "sequence" columns, indexed 0..n.
        num_structures: number of leading rows to process (default 3,
            preserving the original hard-coded behavior).

    Returns:
        The input *srna*, unchanged.
    """
    for i in range(num_structures):
        name = srna.loc[i, "name"]
        sequence = srna.loc[i, "sequence"]
        folding = fold_rna(sequence)[0]
        print("##############", name, "###############")
        print(sequence)
        print(folding)
        print()
        fx_text = ">{0}\n{1}\n{2}\n".format(name, sequence, folding)
        # Context manager guarantees the file is closed even if writing
        # fails (original used open/write/close without try/finally).
        with open('./resources/' + name + '.fx', "w") as fx_file:
            fx_file.write(fx_text)
    for i in range(num_structures):
        ## Print Structure
        plt.figure(figsize=(20, 20))
        cg = forgi.load_rna('./resources/' + srna.loc[i, "name"] + '.fx',
                            allow_many=False)
        fvm.plot_rna(cg, text_kwargs={"fontweight": "black"}, lighten=0.7,
                     backbone_kwargs={"linewidth": 3})
        plt.show()
        # plt.savefig(srna.loc[i, "name"]+'.png')
    return srna
def create_pairing_matrix(seq, num_backtrack=10):
    """Estimate a position-pairing matrix for *seq* from Boltzmann samples.

    Samples *num_backtrack* secondary structures from the ViennaRNA
    partition function, converts each to a forgi pair table, builds one
    row-normalized matrix per sample (identity plus a hit at each base
    pair), and returns the element-wise mean over all samples.
    """
    fc = RNA.fold_compound(seq, md)
    # The MFE is only needed to rescale the Boltzmann factors.
    ss, mfe = fc.mfe()
    fc.exp_params_rescale(mfe)
    fc.pf()  # fill the DP matrices used by pbacktrack

    sampled_structures = []
    tables = []
    for struct in fc.pbacktrack(num_backtrack):
        sampled_structures.append(struct)
        graph, = forgi.load_rna(struct)
        tables.append(graph.to_pair_table())

    seq_len = len(seq)
    per_sample = []
    for table in tables:
        mat = np.eye(seq_len)
        # forgi pair tables are 1-based with a length prefix; shifting by
        # one makes unpaired positions -1 and partners 0-based.
        partners = np.array(table[1:]) - 1
        for pos, partner in enumerate(partners):
            if partner != -1:
                mat[pos, partner] += 1
        per_sample.append(mat / mat.sum(axis=-1, keepdims=True))

    return np.mean(per_sample, axis=0)
def load(self, pattern):
    """Load RNA structure files matching *pattern* into self.cgs/self.pdds.

    Expands glob pattern(s), loads each file with forgi (caching in
    self.cgs), and obtains a pair-distance distribution per file — read
    from a cached "<fn>.pdd.csv" when present, otherwise recomputed and
    written back.

    Args:
        pattern: a glob pattern or list of glob patterns.

    Returns:
        (num_selected, num_loaded, minsc, maxsc) — files matched, files
        newly loaded, and the min/max scores from self.get_scores
        (None/None when no scores).

    Raises:
        ValueError: if a file contains more than one RNA component.
    """
    num_loaded = 0
    num_selected = 0
    if not isinstance(pattern, list):
        pattern = [pattern]
    fns = []
    for pat in pattern:
        fns.extend(glob(pat))
    # Large batches are announced louder so they are visible in quiet logs.
    if len(fns) > 1000:
        lev = logging.WARNING
    else:
        lev = logging.INFO
    log.log(lev, "Loading %s files", len(fns))
    with _LoggingContext(logging.getLogger(), logging.CRITICAL):
        for fn in fns:
            num_selected += 1
            if fn not in self.cgs:
                cgs = forgi.load_rna(fn)
                if len(cgs) != 1:
                    # Bug fix: the original format string had three '{}'
                    # placeholders but only two .format() args, so raising
                    # this error crashed with an IndexError instead.
                    raise ValueError(
                        "Expected 1 RNA component in file {}, found {}: {}".format(
                            fn, len(cgs), [cg.name for cg in cgs]))
                cg, = cgs
                cg.infos["filename"] = fn
                self.cgs[fn] = cg
                num_loaded += 1
            else:
                # Bug fix: when fn was already cached, `cg` was stale (or
                # undefined) in the recompute branch below; fetch it.
                cg = self.cgs[fn]
            if fn not in self.pdds:
                points = []
                try:
                    pd_pdd = pd.read_csv(fn + ".pdd.csv")
                except Exception:
                    # Cache miss or unreadable cache: recompute. (Narrowed
                    # from a bare `except:` so KeyboardInterrupt/SystemExit
                    # are no longer swallowed; still best-effort.)
                    for i in range(1, cg.seq_length + 1):
                        points.append(
                            cg.get_virtual_residue(
                                i, allow_single_stranded=True))
                    x, y = ftuv.pair_distance_distribution(
                        points, stepsize=self.stepsize)
                    df = pd.DataFrame({"step": x, "count": y})
                    df.to_csv(fn + ".pdd.csv")
                else:
                    x = pd_pdd["step"]
                    y = pd_pdd["count"]
                self.pdds[fn] = (x, y)
    scores = self.get_scores(pattern)
    if scores:
        minsc = scores[0]
        maxsc = scores[-1]
    else:
        minsc = None
        maxsc = None
    return num_selected, num_loaded, minsc, maxsc
def structure(sequence, dot_bracket, mirna_name, index):
    """Render a miRNA secondary structure and save it as a PNG asset.

    Writes a two-line .db file (sequence, then dot-bracket) under
    <index>/mirna/, loads it with forgi, plots the structure, and saves
    the figure to gui/src/assets/<mirna_name>.png.
    """
    db_path = index + "/mirna" + "/" + mirna_name + ".db"
    with open(db_path, "w") as db_file:
        db_file.write(sequence + '\n')
        db_file.write(dot_bracket)
    cg = forgi.load_rna(db_path, allow_many=False)
    fvm.plot_rna(cg, text_kwargs={"fontweight": "black"}, lighten=0.7,
                 backbone_kwargs={"linewidth": 3})
    plt.savefig("gui/src/assets/" + mirna_name + ".png")
def create_multiclass_vector(seq, num_backtrack=10):
    """Average one-hot structural-element encodings over Boltzmann samples.

    Samples *num_backtrack* structures from the ViennaRNA partition
    function; each sampled structure's per-position forgi element string is
    mapped to five coarse classes (P stem, I interior, H hairpin, M multi,
    E exterior — 'f' and 't' both map to E), one-hot encoded with
    one_hot_encode_struct(method='signed'), summed, and divided by
    num_backtrack.
    """
    fc = RNA.fold_compound(seq, md)
    ss, mfe = fc.mfe()  # MFE used only for Boltzmann-factor rescaling
    fc.exp_params_rescale(mfe)
    fc.pf()  # partition function fills the DP matrices for sampling

    # forgi element letters -> coarse structural classes.
    element_to_class = {'s': 'P', 'i': 'I', 'h': 'H', 'm': 'M', 'f': 'E', 't': 'E'}
    sampled_structures = []
    accumulator = np.zeros((len(seq), 5))
    for struct in fc.pbacktrack(num_backtrack):
        sampled_structures.append(struct)
        graph, = forgi.load_rna(struct)
        classes = ''.join(element_to_class[letter]
                          for letter in graph.to_element_string())
        accumulator += one_hot_encode_struct(classes, method='signed')
    return accumulator / (1.0 * num_backtrack)
def plot_rna_struct(seq, struct, ax=None, offset=(0, 0), text_kwargs=None,
                    backbone_kwargs=None, basepair_kwargs=None,
                    highlight_bp_idx=None, highlight_nt_idx=None,
                    lighten=0.7, saveto='tmp.png'):
    """Plot an RNA secondary structure with optional highlighting.

    Writes a temporary fasta file, computes a layout with ViennaRNA's plot
    coordinates, then draws backbone, base pairs, per-nucleotide circles
    and every-10th-position labels onto *ax* (or the current axes), and
    saves the figure.

    Args:
        seq: RNA sequence string.
        struct: dot-bracket secondary structure for *seq*.
        ax: matplotlib axes to draw on; defaults to plt.gca().
        offset: (x, y) translation of the layout; True places the drawing
            past the current axes limits; None means (0, 0).
        text_kwargs / backbone_kwargs / basepair_kwargs: extra matplotlib
            keyword arguments for the respective artists (not mutated).
        highlight_bp_idx: 0-based (i, j) base pairs to draw in red.
        highlight_nt_idx: 0-based nucleotide indices to fill green.
        lighten: >0 lightens, <0 darkens the highlight fill color.
        saveto: path the finished figure is saved to (dpi=350).
    """
    # Bug fix: the original signature used mutable defaults ({} / []) and
    # mutated text_kwargs below ("fontweight" injection), so state leaked
    # across calls. Use None sentinels and copy caller-supplied dicts so
    # neither the defaults nor caller arguments are ever mutated.
    text_kwargs = {} if text_kwargs is None else dict(text_kwargs)
    backbone_kwargs = {} if backbone_kwargs is None else dict(backbone_kwargs)
    basepair_kwargs = {} if basepair_kwargs is None else dict(basepair_kwargs)
    highlight_bp_idx = [] if highlight_bp_idx is None else highlight_bp_idx
    highlight_nt_idx = [] if highlight_nt_idx is None else highlight_nt_idx

    with open('tmp.fa', 'w') as file:
        file.write('>tmp\n%s\n%s' % (seq, struct))
    cg = forgi.load_rna('tmp.fa', allow_many=False)
    RNA.cvar.rna_plot_type = 1
    fig = plt.figure(figsize=(30, 30))
    coords = []
    bp_string = cg.to_dotbracket_string()
    if ax is None:
        ax = plt.gca()
    if offset is None:
        offset = (0, 0)
    elif offset is True:
        # Shift the drawing past the current axes limits.
        offset = (ax.get_xlim()[1], ax.get_ylim()[1])
    vrna_coords = RNA.get_xy_coordinates(bp_string)
    # TODO Add option to rotate the plot
    for i, _ in enumerate(bp_string):
        coord = (offset[0] + vrna_coords.get(i).X,
                 offset[1] + vrna_coords.get(i).Y)
        coords.append(coord)
    coords = np.array(coords)
    # First plot backbone
    bkwargs = {"color": "grey", "zorder": 0, "linewidth": 0.5}
    bkwargs.update(backbone_kwargs)
    ax.plot(coords[:, 0], coords[:, 1], **bkwargs)
    # Now plot basepairs; highlighted pairs are drawn separately in red.
    basepairs_hl, basepairs_nonhl = [], []
    for s in cg.stem_iterator():
        for p1, p2 in cg.stem_bp_iterator(s):
            if (p1 - 1, p2 - 1) in highlight_bp_idx:
                basepairs_hl.append([coords[p1 - 1], coords[p2 - 1]])
            else:
                basepairs_nonhl.append([coords[p1 - 1], coords[p2 - 1]])
    if len(basepairs_hl) > 0:
        basepairs_hl = np.array(basepairs_hl)
        bpkwargs_hl = {"color": 'red', "zorder": 0, "linewidth": 3}
        bpkwargs_hl.update(basepair_kwargs)
        ax.plot(basepairs_hl[:, :, 0].T, basepairs_hl[:, :, 1].T,
                **bpkwargs_hl)
    if len(basepairs_nonhl) > 0:
        basepairs_nonhl = np.array(basepairs_nonhl)
        bpkwargs_nonhl = {"color": 'black', "zorder": 0, "linewidth": 0.5}
        bpkwargs_nonhl.update(basepair_kwargs)
        ax.plot(basepairs_nonhl[:, :, 0].T, basepairs_nonhl[:, :, 1].T,
                **bpkwargs_nonhl)
    # Now plot circles — one per nucleotide; highlighted ones get a
    # lightened/darkened green fill via HLS adjustment.
    for i, coord in enumerate(coords):
        if i in highlight_nt_idx:
            c = 'green'
            h, l, s = colorsys.rgb_to_hls(*mc.to_rgb(c))
            if lighten > 0:
                l += (1 - l) * min(1, lighten)
            else:
                l += l * max(-1, lighten)
            c = colorsys.hls_to_rgb(h, l, s)
            circle = plt.Circle((coord[0], coord[1]),
                                edgecolor="black", facecolor=c)
        else:
            circle = plt.Circle((coord[0], coord[1]),
                                edgecolor="black", facecolor="white")
        ax.add_artist(circle)
        if cg.seq:
            if "fontweight" not in text_kwargs:
                text_kwargs["fontweight"] = "bold"
            # forgi sequences are 1-based, hence seq[i + 1].
            ax.annotate(cg.seq[i + 1], xy=coord, ha="center", va="center",
                        **text_kwargs)
    all_coords = list(coords)
    ntnum_kwargs = {"color": "gray"}
    ntnum_kwargs.update(text_kwargs)
    for nt in range(10, cg.seq_length, 10):
        # We try different angles
        annot_pos = _find_annot_pos_on_circle(nt, all_coords, cg)
        if annot_pos is not None:
            ax.annotate(str(nt), xy=coords[nt - 1], xytext=annot_pos,
                        arrowprops={"width": 1, "headwidth": 1,
                                    "color": "gray"},
                        ha="center", va="center", zorder=0, **ntnum_kwargs)
            all_coords.append(annot_pos)
    # Expand the data limits to include both the structure and any prior
    # axes content, then fit the view.
    datalim = ((min(list(coords[:, 0]) + [ax.get_xlim()[0]]),
                min(list(coords[:, 1]) + [ax.get_ylim()[0]])),
               (max(list(coords[:, 0]) + [ax.get_xlim()[1]]),
                max(list(coords[:, 1]) + [ax.get_ylim()[1]])))
    ax.set_aspect('equal', 'datalim')
    ax.update_datalim(datalim)
    ax.autoscale_view()
    ax.set_axis_off()
    plt.savefig(saveto, dpi=350)
    plt.close(fig)
import RNA
import forgi
import numpy as np
import pandas as pd
import pickle

# Build edge lists (backbone neighbours + base pairs) for each dot-bracket
# structure in the input file; every structure yields a 2xE source/target
# index array appended to `to_pickle`.
count = 0
to_pickle = []
# Bug fix: use a context manager — the original opened the file and never
# closed it.
with open('pos_dotbracket_FXR1.txt', 'r') as pos_dotbracket:
    for dotbracket in pos_dotbracket:
        source = []
        target = []
        bg, = forgi.load_rna(dotbracket.rstrip())
        # Drop the 1-based pair table's length prefix.
        pt = bg.to_pair_table()[1:]
        for i in range(len(pt)):
            # Backbone edge to the next nucleotide.
            if i + 1 < len(pt):
                source.append(i)
                target.append(i + 1)
            # Backbone edge to the previous nucleotide.
            if i - 1 > -1:
                source.append(i)
                target.append(i - 1)
            # Base-pair edge (pt entries are 1-based; 0 means unpaired).
            if pt[i] != 0:
                source.append(i)
                target.append(pt[i] - 1)
        source = np.array(source)
        target = np.array(target)
        to_pickle.append(np.stack((source, target)))
        # Progress indicator every 1000 structures.
        if count % 1000 == 0:
            print(count)
        count += 1
# NOTE(review): `to_pickle` is accumulated but never serialized in this
# chunk — presumably a pickle.dump follows elsewhere; confirm.
# print(row[0]) seq = row[2] # print('seq') # print(seq) dotbracket = row[14] # print("dotbracket:") # print(dotbracket) if len(seq) != len(dotbracket): print('Different Lengths, failed:') print(row) continue tmp_file = open("tmp.txt", "w") tmp_file.write(seq + '\n') tmp_file.write(dotbracket + '\n') tmp_file.close() cg = forgi.load_rna('tmp.txt', allow_many=False) result = fgb.BulgeGraph.to_element_string(cg, with_numbers=True) result = result.splitlines() row.append(str(result[0])) row.append(str(result[1])) writeCSV.writerow(row) # all_col.append(row) # writeCSV.writerows(all_col) # bg = fgb.BulgeGraph.from_dotbracket('((..))..((..))') # rna_seq_dotb = ['CGCUUCAUAUAAUCCUAAUGAUAUGGUUUGGGAGUUUCUACCAAGAGCCUUAAACUCUUGAUUAUGAAGUU', '((((((((((..(((((((.......)))))))......).(((.((.......))))))..)))))))).'] # tmp_file = open("tmp.txt","w") # tmp_file.write(rna_seq_dotb[0]+ '\n') # tmp_file.write(rna_seq_dotb[1]+ '\n') # tmp_file.writelines(rna_seq_dotb)
def search_dot_pair_stem(args):
    """Scan a hard-coded rRNA dot-bracket string for long bracket runs and
    emit masked validation regions around each run and its pairing partner.

    For every run of >= args.min_len consecutive brackets, locates the
    pairing partners of the run's ends, centers a 100-nt fragment on the
    combined span, filters out runs that are too long / out of bounds /
    mismatched with their partner, records the region coordinates, and
    calls generate_mask_region_validate for both the run and its partner.
    Finally writes all recorded regions to a tab-separated summary file.

    NOTE(review): `d` (the dot-bracket read from args.dot) is computed but
    unused — the scan runs on the hard-coded string `s`; confirm intent.
    """
    dot_dict = util.read_dot(dot=args.dot)
    import forgi
    d = dot_dict['dotbracket']
    # Hard-coded rRNA secondary structure in extended dot-bracket notation
    # ('[' / ']' denote pseudoknot pairs).
    s = '...(((((...[[[.))))).((((((((((.(((((((((.....(((.(((..((...(((....((..........))...)))))......(((......((((..((..((....(((..................((((....(((((((.....))))))).....)))).......((((...((((((....))))))...))))....((((((.......(((((.((((...((((.((((((((....))))))))..)))).)))).....)))))......))))))...........((((.((((......))))))))....)))...))))..))))(((..(.(((....((((((((.......))))))))))).....))))...((((((((....))))...))))))).((((((..........)))))).((((....))))...)))))).).....(.(((...(((((...))))).)))).)).))))))....(((((((((((((....))).))))))).)))......(((.(((.......)))).)).........(((((((((....[[[[.....[[.)))]].......]]]])))))).))))))))))..........(((((.....((((...(((.......(((.(((((((((((((.((((....))))....))))))))..)))))))).......((((.(((((...(((((((......)))))))....)))))))))................................................................................................................................(((((((((..(((((((((..((((((((...(((......)))......))))))))..))....(..((....)))))))))).))))).))))...)))...))))....((((((...((...((((.........))))...))))))))..........[[[[[[.(((..((((((((.(((((....)))))))))))))..)))...[[..))]]...]]]....]]].)))..(((.....((((....))))....)))...]]]..(((((.(((((((..((..(((((((((((((((((....((((........))))........(((((((....(((((........((((((........))))))......)))))...((.((((..(((((((((...(((((((((....)))..((((......))))..)))))).....((((.(((.((((..((((....(((..((((....)))).)))....))))..)))))))..((((((((.....))))))))....))))...)))).)))...).))))))).....)))))))...)).))))))))))...(((((((.....(((.......((..((((....))))..)).....))).....)))))))......(...((((((((........))))))))...).....))))).....((((((((.......))))))))......))...)))))))))).))....((.((.(.((((((((.((.((((((((((((..(((((((((((((((.((((((((((((.....))))))))))))...)))))))))))))))..))))))))))))).)))))))))..).))..))....((((((((((....))))))))))........'
    bg, = forgi.load_rna(s)
    pair_dict = nested_dict(2, list)
    # Regex matching runs of at least args.min_len consecutive brackets.
    reg_str = "[\(\)]{" + str(args.min_len) + ",}"
    # for n,match in enumerate(re.finditer(reg_str, dot_dict['dotbracket'])):
    for n, match in enumerate(re.finditer(reg_str, s)):
        # if n >= 3: continue
        start = match.span()[0]  # 0-based
        end = match.span()[1]  # # 0-based, not include
        start_pair = bg.pairing_partner(start + 1)  # 1-based
        end_pair = bg.pairing_partner(end)  # 1-based
        pos_ls = [start, end, start_pair, end_pair]
        # print(pos_ls)
        # Skip runs whose ends have no pairing partner.
        if None in pos_ls:
            continue
        # Center a 100-nt fragment on the run + partner span.
        middle = int((max(pos_ls) + min(pos_ls)) / 2)
        mask_len = max(pos_ls) - min(pos_ls)
        if mask_len > 100:
            continue
        fragment_start = middle - 50
        fragment_end = middle + 50
        if fragment_start < 0:
            continue
        if fragment_end > len(dot_dict['dotbracket']):
            continue
        print(n, match.start(), match.span(), match.group(), start_pair,
              end_pair, middle, fragment_start, fragment_end)
        # Require the run and its partner region to have equal length.
        if (end - start) != (abs(start_pair - end_pair) + 1):
            continue
        pair_start = min(start_pair, end_pair) - 1
        pair_end = max(start_pair, end_pair)
        if end < pair_start:
            gap_start = end
            gap_end = pair_start
        else:
            gap_start = pair_end
            gap_end = start
        # The gap between run and partner must be a uniform character run.
        gap_dot = s[pair_end:start]
        if len(set(gap_dot)) != 1:
            continue
        pair_dict[n]['gap_start'] = gap_start
        pair_dict[n]['gap_end'] = gap_end
        pair_dict[n]['stem_start'] = start
        pair_dict[n]['stem_end'] = end
        pair_dict[n]['pair_start'] = pair_start
        pair_dict[n]['pair_end'] = pair_end
        pair_dict[n]['fragment_start'] = fragment_start
        pair_dict[n]['fragment_end'] = fragment_end
        # Validate the masked stem region itself ...
        generate_mask_region_validate(
            shape_out=args.shape_out, tx=args.tx, species=args.species,
            mask_start=start, mask_end=end,
            fragment_start=fragment_start, fragment_end=fragment_end,
            savefn_dir=
            '/home/gongjing/project/shape_imputation/data/hek_wc_vivo_rRNA/3.shape/mask_specific_regions/ss_pair',
            plot_gradient=1)
        # ... and its pairing-partner region.
        generate_mask_region_validate(
            shape_out=args.shape_out, tx=args.tx, species=args.species,
            mask_start=pair_start, mask_end=pair_end,
            fragment_start=fragment_start,
            fragment_end=fragment_end,
            savefn_dir=
            '/home/gongjing/project/shape_imputation/data/hek_wc_vivo_rRNA/3.shape/mask_specific_regions/ss_pair',
            plot_gradient=1)
    # Write all collected regions to a tab-separated summary table.
    pair_df = pd.DataFrame.from_dict(pair_dict, orient='index')
    savefn = '/home/gongjing/project/shape_imputation/data/hek_wc_vivo_rRNA/3.shape/mask_specific_regions/ss_pair/stem_pair.txt'
    pair_df.to_csv(savefn, header=True, index=True, sep='\t')