def run(Nutlin1, GRODMSO, GRONutlin, figuredir, filedir):
    """Compare GRO-seq coverage between Nutlin and DMSO conditions over a set
    of regions, plot the log2 fold-change histogram, and write two BED outputs.

    Parameters:
        Nutlin1:   path to the BED file of regions to evaluate.
        GRODMSO:   path to the DMSO-condition GRO-seq BED (signal in col 4).
        GRONutlin: path to the Nutlin-condition GRO-seq BED (signal in col 4).
        figuredir: directory (with trailing separator) for the histogram PNG.
        filedir:   directory (with trailing separator) for the BED outputs.

    Side effects: saves 'GRO_Analysis_Fold_Change_hist.png' and writes
    'false_positives_GRO-Seq_fold_change.bed' (fold change < 1) and
    'p53_txn_fold_change.bed' (all regions, sorted by descending fold change).
    """
    regions = BedTool(Nutlin1)
    # bedtools map: sum of column-4 signal from each condition over every region.
    dmso_sums = regions.map(BedTool(GRODMSO), c='4', o='sum')
    nutlin_sums = regions.map(BedTool(GRONutlin), c='4', o='sum')

    def _usable(m, n):
        # Skip regions with no overlap ('.' null marker) and zero sums.
        # The original compared the string field to the int 0, which is always
        # True, so zero sums crashed in log()/division; check the float value.
        if m[3] == '.' or n[3] == '.':
            return False
        return float(m[3]) != 0 and float(n[3]) != 0

    # Materialize valid (dmso, nutlin) row pairs once; reused for all outputs.
    pairs = [(m, n) for m, n in zip(dmso_sums, nutlin_sums) if _usable(m, n)]
    fold_changes = [math.log(float(n[3]) / float(m[3]), 2) for m, n in pairs]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title('Nutlin1hr vs. DMSO')
    ax.set_ylabel('Count')
    ax.set_xlabel('Log2 Fold Change (Nutlin/DMSO)')
    ax.hist(fold_changes, bins=100)
    ax.set_xlim([-20, 20])
    plt.axvline(0, color='red', linestyle='dashed')
    plt.savefig(figuredir + 'GRO_Analysis_Fold_Change_hist.png', dpi=1200)

    # Regions whose Nutlin/DMSO ratio is below 1: candidate false positives.
    with open(filedir + 'false_positives_GRO-Seq_fold_change.bed', 'w') as outfile:
        for m, n in pairs:
            if float(n[3]) / float(m[3]) < 1:
                outfile.write('\t'.join(m[:3]) + '\n')

    # All regions with their raw fold change, highest first.
    with open(filedir + 'p53_txn_fold_change.bed', 'w') as outfile:
        ranked = sorted(
            ((m[0], m[1], m[2], float(n[3]) / float(m[3])) for m, n in pairs),
            key=lambda row: row[-1], reverse=True)
        for row in ranked:
            outfile.write('\t'.join(row[:3]) + '\t' + str(row[-1]) + '\n')
def calc_signals(bam_filename, region_bed_filename, signal_colnum, region_type, normalize, verbose):
    '''Generator of per-region signal tuples computed from strand-separated
    coverage mapped onto the given BED regions.

    Yields (name, score, 'region-<strand>', region_type, 'signal-<strand>',
    operation, signal, signal_type) for every region, for each signal strand
    and each bedtools.map operation ('sum' and 'count').'''
    regions = BedTool(region_bed_filename)
    signal_type = 'norm' if normalize else 'raw'
    for signal_strand in STRANDS:
        coverage = load_coverage(bam_filename, strand=signal_strand,
                                 verbose=verbose)
        # bedtools.map operations
        for oper in ('sum', 'count'):
            mapped = regions.map(coverage, o=oper, c=signal_colnum, null=0)
            for region_row, signal_row in izip(regions, mapped):
                try:
                    region_name = region_row[3]
                    region_score = region_row[4]
                    region_strand = region_row[5]
                except IndexError:
                    # Region lacks name/score/strand fields: synthesize them.
                    region_name = '%s-%s-%d-%d' % (
                        region_type, region_row.chrom,
                        region_row.start, region_row.end)
                    region_score = 0  # default
                    region_strand = 'none'
                # Translate '+'/'-' to labels; anything else passes through.
                region_strand = {'+': 'pos', '-': 'neg'}.get(region_strand,
                                                             region_strand)
                # last field is the calculated signal
                signal = float(signal_row[-1])
                if normalize and signal != 0:
                    signal = signal / float(region_row.end - region_row.start)
                yield (region_name, region_score, 'region-' + region_strand,
                       region_type, 'signal-' + signal_strand, oper, signal,
                       signal_type)
def make_annot_files(args, df, binary):
    """Build per-SNP annotation files for one chromosome by overlapping a
    geneset BED (from `df`) with the SNP positions in the PLINK .bim file.

    Parameters:
        args:   namespace providing `chrom`, `bfile_chr`, and `prefix`.
        df:     DataFrame with CHR/START/END (and ANNOT when not binary).
        binary: True for a 0/1 membership annotation via intersect; False for
                a continuous annotation averaged via bedtools map.

    Side effects: writes '<prefix>.<chrom>.cont_bin.gz' and
    '<prefix>.<chrom>.annot.gz'; exits with status 1 on a SNP-count mismatch.
    """
    df = df.sort_values(by=['CHR', 'START'])
    # Normalize chromosome names to a single 'chr' prefix either way.
    if binary:
        iter_df = [['chr' + str(x1).lstrip('chr'), x2, x3]
                   for (x1, x2, x3) in np.array(df[['CHR', 'START', 'END']])]
        genesetbed = BedTool(iter_df).sort().merge()
    else:
        iter_df = [['chr' + str(x1).lstrip('chr'), int(x2), int(x3), 'annot', str(x4)]
                   for (x1, x2, x3, x4) in np.array(df[['CHR', 'START', 'END', 'ANNOT']])]
        genesetbed = BedTool(iter_df).sort()

    print('making annot file for chromosome {}'.format(args.chrom))
    df_bim = pd.read_csv(args.bfile_chr + str(args.chrom) + '.bim',
                         delim_whitespace=True, usecols=[0, 1, 2, 3],
                         names=['CHR', 'SNP', 'CM', 'BP'])
    # Represent each SNP as a zero-length interval at its base position.
    iter_bim = [['chr' + str(x1), int(x2), int(x2)]
                for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    bimbed = BedTool(iter_bim).sort()

    if binary:
        # SNPs inside the merged geneset get ANNOT=1, everything else 0.
        annotbed = bimbed.intersect(genesetbed)
        bp = [x.start for x in annotbed]
        df_int = pd.DataFrame({'BP': bp, 'ANNOT': 1})
        df_annot = pd.merge(df_bim, df_int, how='left', on='BP')
        df_annot.fillna(0, inplace=True)
        df_annot.drop_duplicates(inplace=True)
    else:
        # Mean of the continuous annotation (column 5) per SNP; 0 when none.
        annotbed = bimbed.map(genesetbed, c=5, o='mean', null=0).to_dataframe()
        df_int = pd.DataFrame({'BP': annotbed.start, 'ANNOT': annotbed.name})
        df_annot = pd.merge(df_bim, df_int, how='left', on='BP')
        df_annot.drop_duplicates(inplace=True)

    if df_annot.shape[0] != df_bim.shape[0]:
        print('{} SNPs in annotation df, whereas {} SNPs in bim file'.format(
            df_annot.shape[0], df_bim.shape[0]))
        sys.exit(1)

    df_annot.fillna(0, inplace=True)
    df_annot = df_annot[['ANNOT']].astype(float)
    df_bim['ANNOT'] = df_annot[['ANNOT']]
    cont_annot = df_bim[['SNP', 'ANNOT']]
    cont_annot_file = args.prefix + '.' + str(args.chrom) + '.cont_bin.gz'
    cont_annot.to_csv(cont_annot_file, sep="\t", index=False, header=None,
                      compression='gzip')
    annot_file = args.prefix + '.' + str(args.chrom) + '.annot.gz'
    df_annot.to_csv(annot_file, sep="\t", index=False, compression='gzip')
def calc_signals(bam_filename, region_bed_filename, signal_colnum, region_type, normalize, verbose):
    ''' generator to calculate signals from BED regions mapped onto positive
    and negative strand data.

    For each signal strand in STRANDS and each map operation ('sum', 'count'),
    maps strand-specific coverage onto the regions and yields one tuple per
    region: (name, score, 'region-<strand>', region_type, 'signal-<strand>',
    operation, signal, signal_type).

    NOTE(review): this appears to be an exact duplicate of another
    calc_signals definition in this file — confirm which one is live.'''
    region_bedtool = BedTool(region_bed_filename)
    # bedtools.map operations
    operations = ('sum','count')
    signal_type = 'raw'
    if normalize:
        signal_type = 'norm'
    for signal_strand in STRANDS:
        signal_bedtool = load_coverage(bam_filename, strand=signal_strand, verbose=verbose)
        for oper in operations:
            map_bedtool = region_bedtool.map(signal_bedtool, o=oper, c=signal_colnum, null=0)
            for region_row, signal_row in izip(region_bedtool, map_bedtool):
                try:
                    region_name = region_row[3]
                    region_score = region_row[4]
                    region_strand = region_row[5]
                except IndexError:
                    # Region has fewer than 6 BED fields: synthesize metadata.
                    region_name = '%s-%s-%d-%d' % (region_type, region_row.chrom, region_row.start, region_row.end)
                    region_score = 0 # default
                    region_strand = 'none'
                if region_strand == '+':
                    region_strand = 'pos'
                elif region_strand == '-':
                    region_strand = 'neg'
                # last field is the calculated signal
                signal = float(signal_row[-1])
                if normalize and signal != 0:
                    # Normalize to signal per base of region length.
                    region_size = float(region_row.end - region_row.start)
                    signal = signal / region_size
                result = (region_name, region_score, 'region-'+region_strand, region_type, 'signal-'+signal_strand, oper, signal, signal_type)
                yield result
def matching_svs(brass_bedpes, ascat_beds, offsets=(1e3, 1e4, 1e5)):
    """Align BRASS structural variants with ASCAT copy-number segments per
    sample and summarize CRISPR sgRNA signal over each aligned SV interval.

    Parameters:
        brass_bedpes: dict mapping sample name -> BRASS bedpe table.
        ascat_beds:   dict mapping sample name -> ASCAT segments DataFrame
                      (its 'sample' column is dropped before alignment).
        offsets:      alignment distance tolerances (bp) to try, one pass each.
                      (Default changed from a mutable list to an equivalent
                      tuple; values and behavior are identical.)

    Returns a DataFrame of SV/CN intervals annotated with fold-change and
    ratio mean/count columns, plus 'sample' and 'offset'.

    NOTE(review): relies on a module-level `crispr_beds` dict (sample ->
    BedTool of sgRNA scores) — consider passing it in explicitly. Raises if
    no SVs align at any offset (pd.concat of an empty list), as before.
    """
    names = ['chr', 'start_sv', 'end_sv', 'start_cn', 'end_cn', 'cn', 'sv']
    sv_frames = []
    for dist_offset in offsets:
        for sample in brass_bedpes:
            svs = sv_cn_aligned(brass_bedpes[sample],
                                ascat_beds[sample].drop(['sample'], axis=1),
                                offset=dist_offset)
            if svs.shape[0] == 0:
                continue
            # Keep only well-formed intervals (start strictly before end).
            svs = svs[svs['start_sv'] < svs['end_sv']]
            svs_bed = BedTool(svs[names].to_string(index=False, header=False),
                              from_string=True).sort()
            # sgRNA fold changes (col 4) and ratios (col 5) per SV interval.
            fc = svs_bed.map(crispr_beds[sample], c='4', o='mean,count') \
                .to_dataframe(names=names + ['fc_mean', 'fc_count'])
            ratios = svs_bed.map(crispr_beds[sample], c='5', o='mean,count') \
                .to_dataframe(names=names + ['ratio_mean', 'ratio_count'])
            aligned = pd.concat(
                [fc.set_index(names), ratios.set_index(names)], axis=1)
            sv_frames.append(aligned.reset_index()
                             .assign(sample=sample)
                             .assign(offset=dist_offset))
    # Drop intervals with no sgRNA overlap (bedtools null marker '.').
    sv_df = pd.concat(sv_frames).query("fc_mean != '.'").reset_index(drop=True)
    for col, dtype in (('fc_mean', float), ('fc_count', int),
                       ('ratio_mean', float), ('ratio_count', int)):
        sv_df[col] = sv_df[col].astype(dtype).values
    return sv_df
def interval_counts(bedtool, interval_size, chrom_size_filename, only_chroms, ignore_chroms, verbose): result = defaultdict() # make windows for analysis windows = BedTool().window_maker(w=interval_size, g=chrom_size_filename).sort() # collapse per inteval (comma delim counts, or '0') mapresult = windows.map(bedtool, o='collapse', c=4, null=0) total_intervals = 0 for idx, row in enumerate(mapresult): if (only_chroms and row.chrom not in only_chroms) or \ (ignore_chroms and row.chrom in ignore_chroms): continue if row.end - row.start < interval_size: continue nums = [int(i) for i in row.name.split(',')] counts = Counter(nums) # find number of non-zero counts total_counts = sum([i for i in counts.values() if i > 0]) total_size = int(row.end - row.start) num_zeros = total_size - total_counts # change the 0 counts to the calculated number counts[0] = num_zeros result[idx] = counts total_intervals += 1 if verbose: print >>sys.stderr, ">> seen %d intervals of obs data" \ % total_intervals return (result, total_intervals)
def calc_signals(bam_filename, region_bed_filename, signal_colnum, verbose):
    '''Generator of signal tuples from BED regions mapped onto positive and
    negative strand coverage.

    Yields (name, score, region_strand, signal_strand, operation, signal)
    once per region, for each strand in STRANDS and for each bedtools.map
    operation ('sum' and 'count').'''
    regions = BedTool(region_bed_filename)
    for signal_strand in STRANDS:
        coverage = load_coverage(bam_filename, strand=signal_strand,
                                 verbose=verbose)
        # bedtools.map aggregations reported per region
        for operation in ('sum', 'count'):
            mapped = regions.map(coverage, o=operation, c=signal_colnum,
                                 null=0)
            for bed_row, mapped_row in izip(regions, mapped):
                # Any strand other than '+' is reported as 'neg'.
                strand_label = 'pos' if bed_row[5] == '+' else 'neg'
                yield (bed_row[3], bed_row[4], strand_label,
                       signal_strand, operation, mapped_row[6])
def overlay_resources_score_motifs(motif_sites_input_file, motifs_overlapping_tracks_output_dir, chromatin_tracks_dir_path, chromatin_tracks_files):
    """intersect motifs with chromatin tracks, sort and group the tracks per motif
    Input: moitf instances file (motif pos, name_id, scorePval, strand)
    chromatin data collection file in bed4 format; track pos, track cell#assaytype#value or cell#TFname in case of chip-seq
    Return a file in bed7 format (motif info (6cols), overlapping_tracks.
    """
    #for motif_sites_input_file in motif_sites_input_files:
    # Determine which per-chromosome track file matches this motif file by
    # reading the chromosome name from the first motif record.
    with open(motif_sites_input_file) as f:
        chr_n_file = f.readline().strip().split('\t')[0].strip()+'.bed'
    if (chr_n_file in chromatin_tracks_files):#it is assumed for every motif file name there exists a matching file name in the chromatin_tracks_input_dir
        motifs_overlapping_tracks_file = motifs_overlapping_tracks_output_dir+'/' + '.'.join(motif_sites_input_file.split('/')[-1].split('.')[0:-1])+'_overlapping_tracks' + '.bed7'
        motifs_overlapping_tracks_file_tmp = motifs_overlapping_tracks_file + '_tmp'
        print("in overlay_resources_score_motifs: " + motifs_overlapping_tracks_file)
        # Only (re)compute when the output does not already exist.
        if not os.path.exists(motifs_overlapping_tracks_file):
            motif_sites_input_file_sorted = motif_sites_input_file + '_sorted'
            chromatin_tracks_input_file = chromatin_tracks_dir_path +'/'+ chr_n_file
            chromatin_tracks_input_file_sorted = chromatin_tracks_input_file + '_sorted'
            print("intersecting: " + motif_sites_input_file + ' and ' + chromatin_tracks_input_file)
            # Coordinate-sort both inputs so bedtools map can stream them.
            os.system("""sort -k1,1 -k2,2n -k3,3n {} > {}""".format(motif_sites_input_file, motif_sites_input_file_sorted))
            os.system("""sort -k1,1 -k2,2n -k3,3n {} > {}""".format(chromatin_tracks_input_file, chromatin_tracks_input_file_sorted))
            motif_sites_file_obj = BedTool(motif_sites_input_file_sorted)
            # Collapse the overlapping track labels (col 4) per motif into the tmp file.
            motif_sites_file_obj.map(BedTool(chromatin_tracks_input_file_sorted), c=4, o=['collapse']).saveas(motifs_overlapping_tracks_file_tmp)
            with open(motifs_overlapping_tracks_file_tmp, 'r') as infile, \
                open(motifs_overlapping_tracks_file, 'w') as outfile:
                line = infile.readline()
                while line:
                    sline = line.split('\t')
                    # NOTE(review): sline[7] is read after only checking
                    # len(sline) > 6 — an exactly-7-field line would raise
                    # IndexError; confirm motif records carry 7 columns
                    # before the collapsed-track column.
                    if(len(sline)>6):
                        if(sline[7]!='.'):
                            my_list=sline[7].split(',')
                            # Per-assay accumulators keyed by cell type.
                            cell_assay_values_dict_ChromHMM = {}
                            cell_assay_values_dict_cCRE = {}
                            cell_assay_values_dict_IndexDHS = {}
                            cell_assay_values_dict_RegElem = {}
                            cell_assay_values_dict_DNaseq = {}
                            elem_list =[]
                            for elem in my_list:
                                # Track labels look like cell#assay[#state].
                                cell_value=elem.split('#')[0]
                                assay_value = elem.split('#')[1]
                                if(len(elem.split('#'))>2):
                                    state_value = elem.split('#')[2].rstrip("\n")
                                    if assay_value== "ChromHMM":
                                        if cell_value not in cell_assay_values_dict_ChromHMM.keys():
                                            cell_assay_values_dict_ChromHMM[cell_value] = []
                                        cell_assay_values_dict_ChromHMM[cell_value].append(state_value)
                                    elif assay_value== "cCRE":
                                        if cell_value not in cell_assay_values_dict_cCRE.keys():
                                            cell_assay_values_dict_cCRE[cell_value] = []
                                        cell_assay_values_dict_cCRE[cell_value].append(state_value)
                                    elif assay_value== "IndexDHS":
                                        if cell_value not in cell_assay_values_dict_IndexDHS.keys():
                                            cell_assay_values_dict_IndexDHS[cell_value] = []
                                        cell_assay_values_dict_IndexDHS[cell_value].append(state_value)
                                    elif assay_value== "RegElem":
                                        if cell_value not in cell_assay_values_dict_RegElem.keys():
                                            cell_assay_values_dict_RegElem[cell_value] = []
                                        cell_assay_values_dict_RegElem[cell_value].append(state_value)
                                    elif assay_value== "DNase-seq":
                                        if cell_value not in cell_assay_values_dict_DNaseq.keys():
                                            cell_assay_values_dict_DNaseq[cell_value] = []
                                        cell_assay_values_dict_DNaseq[cell_value].append(float(state_value))
                                else:
                                    # Two-part labels (e.g. ChIP-seq cell#TF) pass through.
                                    elem_list.append(elem.rstrip("\n"))
                            # Summarize per cell: majority state for categorical
                            # assays, maximum value for DNase-seq signal.
                            for cell in cell_assay_values_dict_ChromHMM:
                                elem_list.append(cell+"#ChromHMM#"+Counter(cell_assay_values_dict_ChromHMM[cell]).most_common(1)[0][0])
                            for cell in cell_assay_values_dict_cCRE.keys():
                                elem_list.append(cell+"#cCRE#"+Counter(cell_assay_values_dict_cCRE[cell]).most_common(1)[0][0])
                            for cell in cell_assay_values_dict_IndexDHS.keys():
                                elem_list.append(cell+"#IndexDHS#"+Counter(cell_assay_values_dict_IndexDHS[cell]).most_common(1)[0][0])
                            for cell in cell_assay_values_dict_RegElem.keys():
                                elem_list.append(cell+"#RegElem#"+Counter(cell_assay_values_dict_RegElem[cell]).most_common(1)[0][0])
                            for cell in cell_assay_values_dict_DNaseq.keys():
                                elem_list.append(cell+"#DNase-seq#"+str(max(cell_assay_values_dict_DNaseq[cell])))
                            # motif info (7 cols) + comma-joined summarized tracks.
                            outfile.write('\t'.join(sline[0:7])+'\t'+','.join(elem_list)+'\n')
                    line = infile.readline()
            # Remove intermediate sorted/tmp files.
            os.remove(motif_sites_input_file_sorted)
            os.remove(chromatin_tracks_input_file_sorted)
            os.remove(motifs_overlapping_tracks_file_tmp)
        # pybedtools session cleanup of its temp files.
        cleanup()
        return motifs_overlapping_tracks_file