import os
import re
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nested_dict import nested_dict

import util  # project-local helper module
import plot_two_shape_common_tx_pct  # project-local module


def bed_meta2(bed_ls, bed_label_ls, icshape_out_ls, savefn, extend=20, species='human'):
    # overlay meta-profiles of reactivity around several BED interval sets in one figure
    fa_dict = util.read_fa(fa=None, species=species, pureID=1)
    fig, ax = plt.subplots(figsize=(16, 8))
    for bed, bed_label, icshape_out in zip(bed_ls, bed_label_ls, icshape_out_ls):
        print(bed, bed_label, icshape_out)
        region_mean, entry_num, df = single_bed_meta(bed, icshape_out, extend,
                                                     savefn, bed_label, fa_dict)
        print(region_mean)
        print(df.head())
        ax.plot(region_mean, label="%s(n=%s)" % (bed_label, str(entry_num)), marker='.')
        # plt.axvspan(extend, len(region_mean)-extend-1, color='grey', alpha=0.5)
    ax.set_ylim(0, 0.6)
    plt.legend()
    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()
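# Usage sketch for bed_meta2 (file names below are hypothetical placeholders;
# single_bed_meta is assumed to be defined elsewhere in this module):
# bed_meta2(bed_ls=['m6A_sites.bed', 'control_sites.bed'],
#           bed_label_ls=['m6A', 'control'],
#           icshape_out_ls=['shape.out', 'shape.out'],
#           savefn='bed_meta.pdf', extend=20, species='human')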
def plot_shape_tx_null_pct(out1=None, out2=None, out1_label='True', out2_label='Predict',
                           savefn=None, species='human'):
    out_dict1 = util.read_icshape_out(out1)
    out_dict2 = util.read_icshape_out(out2)
    tx_common = set(out_dict1.keys()) & set(out_dict2.keys())
    null_pct1_ls = []
    null_pct2_ls = []
    for tx in tx_common:
        # a position counts as null if its reactivity is 'NULL', '-1.0', or '-1'
        null_pct1 = (out_dict1[tx]['reactivity_ls'].count('NULL') +
                     out_dict1[tx]['reactivity_ls'].count('-1.0') +
                     out_dict1[tx]['reactivity_ls'].count('-1')) / float(out_dict1[tx]['length'])
        null_pct2 = (out_dict2[tx]['reactivity_ls'].count('NULL') +
                     out_dict2[tx]['reactivity_ls'].count('-1.0') +
                     out_dict2[tx]['reactivity_ls'].count('-1')) / float(out_dict2[tx]['length'])
        null_pct1_ls.append(null_pct1)
        null_pct2_ls.append(null_pct2)
    print('{}: n={}'.format(out1, len(out_dict1)))
    print('{}: n={}'.format(out2, len(out_dict2)))
    print('common tx: n={}'.format(len(tx_common)))
    fa_dict = util.read_fa(fa=None, species=species, pureID=1)
    stat1 = util.shape_dict_stat(out_dict1, fa_dict, None, RNA_type=None, trim5Len=5, trim3Len=30)
    stat2 = util.shape_dict_stat(out_dict2, fa_dict, None, RNA_type=None, trim5Len=5, trim3Len=30)
    print(pd.DataFrame.from_dict(stat1, orient='index'),
          pd.DataFrame.from_dict(stat2, orient='index'))
    df = pd.DataFrame.from_dict({out1_label: null_pct1_ls, out2_label: null_pct2_ls})
    print(df.head())
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.scatterplot(x=out1_label, y=out2_label, data=df, ax=ax, s=10)
    plt.xlabel('{} (null_pct: {:.2f})'.format(out1_label, stat1['total_bases(NULL_pct)']))
    plt.ylabel('{} (null_pct: {:.2f})'.format(out2_label, stat2['total_bases(NULL_pct)']))
    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()
    return stat1, stat2
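# Usage sketch for plot_shape_tx_null_pct (hypothetical paths): scatter the
# per-transcript NULL fraction of a measured .out file against an imputed one.
# stat_true, stat_pred = plot_shape_tx_null_pct(
#     out1='shape.true.out', out2='shape.predict.out',
#     out1_label='True', out2_label='Predict',
#     savefn='null_pct.scatter.pdf', species='human')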
def search(species='human', savefn=None):
    fa_dict = util.read_fa(fa=None, species=species, pureID=1)
    with open(savefn, 'w') as SAVEFN:
        # RRACH m6A motif: R=[AG], H=[ATC]
        p = re.compile(r'[AG][AG]AC[ATC]')
        n = 0
        for i, j in fa_dict.items():
            for m in p.finditer(j[0:]):
                n += 1
                # SAVEFN.write('\t'.join([i, str(m.span()[0]), str(m.span()[1]), str(n)+'_'+m.group(), '0', '+'])+'\n')
                # write a 1-nt BED record covering the central A of the motif (offset +2)
                SAVEFN.write('\t'.join([
                    i,
                    str(m.span()[0] + 2),
                    str(m.span()[0] + 3),
                    str(n) + '_' + m.group(), '0', '+'
                ]) + '\n')
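# Usage sketch for search (output path is hypothetical): scan every transcript
# for the RRACH motif and emit a 6-column BED covering the central A.
# search(species='human', savefn='RRACH.central_A.bed')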
def generate_mask_region_validate(shape_out, tx, species, mask_start, mask_end,
                                  fragment_start, fragment_end, savefn_dir, plot_gradient=1):
    out_dict = util.read_icshape_out(out=shape_out, pureID=1)
    fa_dict = util.read_fa(fa=None, species=species, pureID=1)
    shape_true_ls = []
    shape_mask_ls = []
    for i in range(fragment_start, fragment_end):
        r = out_dict[tx]['reactivity_ls'][i]
        if mask_start <= i < mask_end:
            r_mask = -1
        else:
            r_mask = r
        shape_true_ls.append(r)
        shape_mask_ls.append(r_mask)
    shape_true_ls = ['-1' if i == 'NULL' else i for i in shape_true_ls]
    shape_true_ls = map(str, shape_true_ls)
    shape_mask_ls = ['-1' if i == 'NULL' else i for i in shape_mask_ls]
    shape_mask_ls = map(str, shape_mask_ls)
    seq = fa_dict[tx][fragment_start:fragment_end]
    savefn = '{}/{}.F{}-{}.M{}-{}.txt'.format(savefn_dir, tx, fragment_start,
                                              fragment_end, mask_start, mask_end)
    with open(savefn, 'w') as SAVEFN:
        SAVEFN.write('\t'.join(map(str, [
            tx,
            '1869',  # hard-coded transcript-length field
            fragment_start, fragment_end, '.', '.', seq,
            ','.join(shape_mask_ls),
            ','.join(shape_true_ls)
        ])) + '\n')
    if plot_gradient:
        plot_savefn = savefn.replace('.txt', '.gradient.pdf')
        subprocess.call([
            "cd /home/gongjing/project/shape_imputation/ShapeImputation/scripts; "
            "python gradcam_SHAPEimpute.py --filename_validation {} --plot_savefn {}".format(
                savefn, plot_savefn)
        ], shell=True)
    return savefn
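# Usage sketch for generate_mask_region_validate (transcript ID, coordinates, and
# output directory are hypothetical). plot_gradient=0 skips the Grad-CAM step,
# which shells out to a script path hard-coded for the original environment.
# generate_mask_region_validate(shape_out='shape.out', tx='ENST00000000001',
#                               species='human', mask_start=120, mask_end=140,
#                               fragment_start=100, fragment_end=200,
#                               savefn_dir='./mask_validate', plot_gradient=0)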
def generate_windows(out=None, window_len_ls=None, sliding_ls=None, species=None,
                     all_valid_reactivity=0, null_pct_max=0.9,
                     split_train_validate=0, generate_random_null_and_ratio=0):
    if out is None:
        out = '/home/gongjing/project/shape_imputation/data/DMSseq_fibroblast_vivo/3.shape/shape.c200T2M0m0.out'
    if window_len_ls is None:
        window_len_ls = [50, 100]
    if species is None:
        species = 'human'
    fa_dict = util.read_fa(species=species)
    save_dir = out + '.windowsHasNull'
    util.check_dir_or_make(save_dir)
    for window_len in window_len_ls:
        # note: this range yields only sliding == window_len (non-overlapping windows);
        # the sliding_ls parameter is currently unused
        for sliding in range(window_len, window_len + 1, 10):
            savefn = save_dir + '/' + 'windowLen%s.sliding%s.txt' % (window_len, sliding)
            # util.shape_fragmentation(out=out, savefn=savefn, window_len=window_len, sliding=sliding, all_valid_reactivity=1)  # no null
            shape_fragmentation(out=out, fa_dict=fa_dict, savefn=savefn,
                                window_len=window_len, sliding=sliding,
                                all_valid_reactivity=all_valid_reactivity,
                                null_pct_max=null_pct_max)  # has null
            if split_train_validate:
                np.random.seed(1234)
                csv_train, csv_validate = util.fragment_split(fragment=savefn,
                                                              train_frac=0.7, cols=8)
                if generate_random_null_and_ratio:
                    data_random_null_filterNULL(csv_train,
                                                null_pct=generate_random_null_and_ratio,
                                                col=9, savefn=None, seed=1234)
                    data_random_null_filterNULL(csv_validate,
                                                null_pct=generate_random_null_and_ratio,
                                                col=9, savefn=None, seed=1234)
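# Usage sketch for generate_windows (the .out path is hypothetical). Note that
# generate_random_null_and_ratio doubles as the on/off switch and the NULL ratio
# (here 30% of positions are randomly masked in each split).
# generate_windows(out='shape.c200T2M0m0.out', window_len_ls=[50, 100],
#                  species='human', all_valid_reactivity=0, null_pct_max=0.9,
#                  split_train_validate=1, generate_random_null_and_ratio=0.3)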
def generate_dbn_react(shape_out=None, species=None, dots=None, savefn_prefix=None, min_len=0):
    dbn = savefn_prefix + '.dbn'
    react = savefn_prefix + '.react'
    shape_dict = util.read_icshape_out(shape_out, pureID=0)
    fa_dict = util.read_fa(fa=None, species=species, pureID=0)
    dot_dict = util.read_dots(dot=dots)
    DBN = open(dbn, 'w')
    REACT = open(react, 'w')
    for i in shape_dict:
        if i in fa_dict:
            seq = fa_dict[i][0:]
        else:
            continue
        if i in dot_dict:
            dot = dot_dict[i]['dotbracket']
        else:
            continue
        print(i, seq, dot, shape_dict[i]['reactivity_ls'])
        if len(seq) < min_len:
            continue
        if len(set([len(seq), len(dot), len(shape_dict[i]['reactivity_ls'])])) != 1:
            continue
        DBN.write('>' + i + '\n')
        DBN.write(seq + '\n')
        DBN.write(dot + '\n')
        DBN.write('\n')
        REACT.write('>' + i + '\n')
        for n, v in enumerate(shape_dict[i]['reactivity_ls']):
            REACT.write(str(n + 1) + '\t' + v.replace('NULL', 'NA') + '\n')
        REACT.write('\n')
    DBN.close()
    REACT.close()
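# Usage sketch for generate_dbn_react (hypothetical paths): pair known dot-bracket
# structures with icSHAPE reactivities, writing <prefix>.dbn and <prefix>.react
# (NULL reactivities become NA); entries whose sequence, structure, and reactivity
# lengths disagree are skipped.
# generate_dbn_react(shape_out='shape.out', species='human', dots='known.dot',
#                    savefn_prefix='known_structures', min_len=30)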
def extract_start_codon_shape(species, shape, savefn, max_null_pct=0.4, extend=50):
    trans_dict = util.loadTransGtfBed2(species=species)
    out_dict = util.read_icshape_out(out=shape, pureID=1)
    fa_dict = util.read_fa(fa=None, species=species, pureID=1)

    # start codon
    savefn1 = savefn.replace('.shape', '.start_codon.ok.shape')
    savefn2 = savefn.replace('.shape', '.start_codon.null.shape')
    SAVEFN1 = open(savefn1, 'w')
    SAVEFN2 = open(savefn2, 'w')
    tx_with_shape = []
    for i, j in out_dict.items():
        if i not in trans_dict:
            continue
        if int(trans_dict[i]['utr_5_end']) < extend:
            continue
        if int(trans_dict[i]['cds_end']) - int(trans_dict[i]['cds_start']) < extend:
            continue
        tx_with_shape.append(i)
        # 0-based
        start = int(trans_dict[i]['utr_5_end']) - 49 - 1
        end = int(trans_dict[i]['cds_start']) + 49
        shape = out_dict[i]['reactivity_ls'][start:end]
        seq = fa_dict[i][start:end]
        null_pct = shape.count('NULL') / len(shape)
        shape_str = ','.join(shape).replace('NULL', '-1')
        if null_pct == 1:
            continue
        if null_pct >= max_null_pct:
            state = 'null'
            SAVEFN2.write('\t'.join(map(str, [i, len(fa_dict[i][0:]), start, end, '.', '.',
                                              seq, shape_str, shape_str])) + '\n')
            # SAVEFN2.write('\t'.join(map(str, [i, start, end]))+'\n')
        else:
            state = 'ok'
            SAVEFN1.write('\t'.join(map(str, [i, len(fa_dict[i][0:]), start, end, '.', '.',
                                              seq, shape_str, shape_str])) + '\n')
            # SAVEFN1.write('\t'.join(map(str, [i, start, end]))+'\n')
    SAVEFN1.close()
    SAVEFN2.close()
    print('tx_with_shape: {}'.format(len(tx_with_shape)))

    savefn1_sort, _ = util.sort_two_shape(shape1=savefn1, value_col1=7, shape2=savefn1)
    savefn2_sort, _ = util.sort_two_shape(shape1=savefn2, value_col1=7, shape2=savefn2)
    df1 = util.plot_heatmap(fn=savefn1_sort, savefn=savefn1_sort + '.heatmap.pdf', value_col=7,
                            fig_size_x=10, fig_size_y=20, cmap='summer', facecolor='black')
    df2 = util.plot_heatmap(fn=savefn2_sort, savefn=savefn2_sort + '.heatmap.pdf', value_col=7,
                            fig_size_x=10, fig_size_y=20, cmap='summer', facecolor='black')
    df1_mean = list(df1.mean())
    df2_mean = list(df2.mean())
    fig, ax = plt.subplots(figsize=(16, 8))
    ax.plot(df2_mean, label="%s(n=%s)" % ('null', df2.shape[0]), marker='.')
    ax.plot(df1_mean, label="%s(n=%s)" % ('ok', df1.shape[0]), marker='.')
    ax.set_ylim(0, 0.6)
    plt.legend()
    plt.tight_layout()
    plt.savefig(savefn1_sort + '.meta.pdf')
    plt.close()

    # stop codon
    savefn1 = savefn.replace('.shape', '.stop_codon.ok.shape')
    savefn2 = savefn.replace('.shape', '.stop_codon.null.shape')
    SAVEFN1 = open(savefn1, 'w')
    SAVEFN2 = open(savefn2, 'w')
    tx_with_shape = []
    for i, j in out_dict.items():
        if i not in trans_dict:
            continue
        if int(trans_dict[i]['utr_3_end']) - int(trans_dict[i]['utr_3_start']) < extend:
            continue
        if int(trans_dict[i]['cds_end']) - int(trans_dict[i]['cds_start']) < extend:
            continue
        tx_with_shape.append(i)
        # 0-based
        start = int(trans_dict[i]['cds_end']) - 49 - 1
        end = int(trans_dict[i]['utr_3_start']) + 49
        shape = out_dict[i]['reactivity_ls'][start:end]
        null_pct = shape.count('NULL') / len(shape)
        seq = fa_dict[i][start:end]
        shape_str = ','.join(shape).replace('NULL', '-1')
        if null_pct == 1:
            continue
        if null_pct >= max_null_pct:
            state = 'null'
            SAVEFN2.write('\t'.join(map(str, [i, len(fa_dict[i][0:]), start, end, '.', '.',
                                              seq, shape_str, shape_str])) + '\n')
            # SAVEFN2.write('\t'.join(map(str, [i, start, end]))+'\n')
        else:
            state = 'ok'
            SAVEFN1.write('\t'.join(map(str, [i, len(fa_dict[i][0:]), start, end, '.', '.',
                                              seq, shape_str, shape_str])) + '\n')
            # SAVEFN1.write('\t'.join(map(str, [i, start, end]))+'\n')
    SAVEFN1.close()
    SAVEFN2.close()
    print('tx_with_shape: {}'.format(len(tx_with_shape)))

    savefn1_sort, _ = util.sort_two_shape(shape1=savefn1, value_col1=7, shape2=savefn1)
    savefn2_sort, _ = util.sort_two_shape(shape1=savefn2, value_col1=7, shape2=savefn2)
    df1 = util.plot_heatmap(fn=savefn1_sort, savefn=savefn1_sort + '.heatmap.pdf', value_col=7,
                            fig_size_x=10, fig_size_y=20, cmap='summer', facecolor='black')
    df2 = util.plot_heatmap(fn=savefn2_sort, savefn=savefn2_sort + '.heatmap.pdf', value_col=7,
                            fig_size_x=10, fig_size_y=20, cmap='summer', facecolor='black')
    df1_mean = list(df1.mean())
    df2_mean = list(df2.mean())
    fig, ax = plt.subplots(figsize=(16, 8))
    ax.plot(df2_mean, label="%s(n=%s)" % ('null', df2.shape[0]), marker='.')
    ax.plot(df1_mean, label="%s(n=%s)" % ('ok', df1.shape[0]), marker='.')
    ax.set_ylim(0, 0.6)
    plt.legend()
    plt.tight_layout()
    plt.savefig(savefn1_sort + '.meta.pdf')
    plt.close()
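# Usage sketch for extract_start_codon_shape (hypothetical paths): savefn must end
# in '.shape' because the start/stop-codon output names are derived from it.
# extract_start_codon_shape(species='human', shape='shape.out',
#                           savefn='codon_regions.shape', max_null_pct=0.4, extend=50)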
def complete_shape_out(icshape=None, species='human', predict_label=None, predict_model=None,
                       pct=0.5, window_len=100, sliding=50, output_dir=None, gpu_id=1):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    fa_dict = util.read_fa(fa=None, species=species, pureID=1)

    # fragment the whole .out file (null_pct_max=2 keeps every window)
    icshape_fragment_all = output_dir + '/' + 'allfragment.txt'
    icshape_fragment_all2 = icshape_fragment_all + '2'
    util.shape_fragmentation(out=icshape, fa_dict=fa_dict, savefn=icshape_fragment_all,
                             window_len=window_len, sliding=sliding,
                             all_valid_reactivity=0, null_pct_max=2)
    cmd = '''awk '{print $0"\\t"$NF}' ''' + " {} > {}; sed -i 's/NULL/-1/g' {}".format(
        icshape_fragment_all, icshape_fragment_all2, icshape_fragment_all2)
    # print(cmd)
    subprocess.call([cmd], shell=True)

    # fragment again, keeping only windows below the null-pct cutoff
    icshape_fragment_pct = output_dir + '/' + 'allfragment.{}.txt'.format(pct)
    util.shape_fragmentation(out=icshape, fa_dict=fa_dict, savefn=icshape_fragment_pct,
                             window_len=window_len, sliding=sliding,
                             all_valid_reactivity=0, null_pct_max=pct)
    icshape_fragment_pct2 = icshape_fragment_pct + '2'
    cmd = '''awk '{print $0"\\t"$NF}' ''' + " {} > {}; sed -i 's/NULL/-1/g' {}".format(
        icshape_fragment_pct, icshape_fragment_pct2, icshape_fragment_pct2)
    # print(cmd)
    subprocess.call([cmd], shell=True)

    # run the imputation model on the below-pct fragments
    predict = output_dir + '/' + 'predict.txt'
    cmd_predict = 'bash predict_new_icshape.sh {} {} {} {}'.format(
        gpu_id, icshape_fragment_pct2, predict, predict_model)
    subprocess.call([cmd_predict], shell=True)
    # predict = '/home/gongjing/project/shape_imputation/exper/{}/prediction.{}.txt'.format(predict_model, predict_label)
    predict_shape_out = predict.replace('.txt', '.out')
    util.predict_to_shape(validation=icshape_fragment_pct2, predict=predict,
                          shape_out=predict_shape_out)

    # subtract the below-pct fragments from the full set, leaving the above-pct
    # fragments that need no prediction
    icshape_fragment_exceed_pct2 = output_dir + '/' + 'allfragment.exceed{}.txt2'.format(pct)
    cmd = ''' awk 'NR==FNR{a[$1$3$4];next} !($1$3$4 in a){print $0}' ''' + '''{} {} > {}'''.format(
        icshape_fragment_pct2, icshape_fragment_all2, icshape_fragment_exceed_pct2)
    subprocess.call([cmd], shell=True)

    # for the above-pct fragments, generate a pseudo "prediction" file directly from
    # the original values (the last column); these are not truly predicted
    icshape_fragment_exceed_pct2_predict = icshape_fragment_exceed_pct2 + '.predict'
    cmd = ''' awk '{print $NF}' ''' + ''' {} > {} '''.format(
        icshape_fragment_exceed_pct2, icshape_fragment_exceed_pct2_predict)
    subprocess.call([cmd], shell=True)

    # re-merge the below-pct and above-pct fragment files (this serves as the validation file)
    icshape_fragment_pct_plus_exceed_pct2 = output_dir + '/' + 'allfragment.{}+exceed{}.txt2'.format(pct, pct)
    cmd = ''' cat {} {} > {}'''.format(icshape_fragment_pct2, icshape_fragment_exceed_pct2,
                                       icshape_fragment_pct_plus_exceed_pct2)
    subprocess.call([cmd], shell=True)

    # merge the prediction files
    icshape_fragment_pct_plus_exceed_predict = output_dir + '/' + 'allfragment.{}+exceed{}.txt2.predict'.format(pct, pct)
    cmd = ''' cat {} {} > {} '''.format(predict, icshape_fragment_exceed_pct2_predict,
                                        icshape_fragment_pct_plus_exceed_predict)
    subprocess.call([cmd], shell=True)

    # generate the .out file from the re-merged validation and prediction files
    icshape_fragment_pct_plus_exceed_predict_shapeout = icshape_fragment_pct_plus_exceed_predict + '.out'
    util.predict_to_shape(
        validation=icshape_fragment_pct_plus_exceed_pct2,
        predict=icshape_fragment_pct_plus_exceed_predict,
        shape_out=icshape_fragment_pct_plus_exceed_predict_shapeout)

    # scatter plot of true vs. predicted null pct
    savefn = icshape_fragment_pct_plus_exceed_predict_shapeout + '.scatter.pdf'
    stat1, stat2 = plot_two_shape_common_tx_pct.plot_shape_tx_null_pct(
        out1=icshape,
        out2=icshape_fragment_pct_plus_exceed_predict_shapeout,
        out1_label='True', out2_label='Predict',
        savefn=savefn, species=species)
    return icshape_fragment_pct_plus_exceed_predict_shapeout, stat1, stat2
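# Usage sketch for complete_shape_out (directory and model identifier are
# hypothetical; the model argument is forwarded to predict_new_icshape.sh):
# impute fragments with <50% NULL, carry the rest through unchanged, and
# rebuild a completed .out file plus a true-vs-predicted null-pct scatter.
# completed_out, stat_true, stat_pred = complete_shape_out(
#     icshape='shape.out', species='human', predict_model='model_b28',
#     pct=0.5, window_len=100, sliding=50, output_dir='./complete', gpu_id=0)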
def get_stat(coverage_ls=None, RT_ls=None, prefix=None):
    if coverage_ls is None:
        coverage_ls = [0, 50, 100, 150, 200, 250]
    if RT_ls is None:
        RT_ls = [0, 1, 2, 3]
    if prefix is None:
        prefix = '/Share2/home/zhangqf5/gongjing/RNA-structure-profile-imputation/data/hek_wc_vivo/3.shape'
    fa_dict = util.read_fa(species='human')
    trans_dict = util.loadTransGtfBed2(species='human')
    stat_dict = nested_dict(3, int)
    for coverage in coverage_ls:
        for RT in RT_ls:
            shape_out = '%s/shape.c%sT%sM0m0.out' % (prefix, coverage, RT)
            out_dict = util.read_icshape_out(shape_out)
            out_dict_stat = util.shape_dict_stat(shape_dict=out_dict, fa_dict=fa_dict,
                                                 trans_dict=trans_dict, RNA_type='all')
            print(out_dict_stat)
            stat_dict['tx_count'][coverage][RT] = len(out_dict)
            for key in ['total_bases', 'total_bases(NULL_pct)', 'A(NULL_pct)',
                        'T(NULL_pct)', 'C(NULL_pct)', 'G(NULL_pct)']:
                stat_dict[key][coverage][RT] = out_dict_stat[key]
    print(pd.DataFrame.from_dict(stat_dict['tx_count'], orient='index'))
    print(pd.DataFrame.from_dict(stat_dict['total_bases'], orient='index'))
    print(pd.DataFrame.from_dict(stat_dict['total_bases(NULL_pct)'], orient='index'))
    # fig,ax=plt.subplots()
    # sns.heatmap(tx_count_df,annot=True,fmt='d',linewidths=0.5)
    # ax.set_yticklabels(ax.yaxis.get_majorticklabels(), rotation=0)
    # ax.set_xlabel('Cutoff (average RT stop count)')
    # ax.set_ylabel('Cutoff (base density)')
    # savefn=prefix+'/shape.tx_num.pdf'
    # plt.tight_layout()
    # plt.savefig(savefn)
    # plt.close()
    for key, pdf, fmt in [('tx_count', 'shape.tx_num.pdf', 'd'),
                          ('total_bases', 'shape.total_bases.pdf', '.2g'),
                          ('total_bases(NULL_pct)', 'shape.total_null_pct.pdf', '.2g'),
                          ('A(NULL_pct)', 'shape.A_null_pct.pdf', '.2g'),
                          ('T(NULL_pct)', 'shape.T_null_pct.pdf', '.2g'),
                          ('C(NULL_pct)', 'shape.C_null_pct.pdf', '.2g'),
                          ('G(NULL_pct)', 'shape.G_null_pct.pdf', '.2g')]:
        util.heatmap(pd.DataFrame.from_dict(stat_dict[key], orient='index'),
                     xlabel='Cutoff (average RT stop count)',
                     ylabel='Cutoff (base density)',
                     savefn=prefix + '/' + pdf, fmt=fmt)
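# Usage sketch for get_stat (prefix is hypothetical; files named
# shape.c{coverage}T{RT}M0m0.out must already exist under it): tabulate transcript
# counts and NULL percentages across the coverage x RT cutoff grid and write heatmaps.
# get_stat(coverage_ls=[0, 50, 100, 150, 200, 250], RT_ls=[0, 1, 2, 3],
#          prefix='./3.shape')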