def __yield_score_dfm(self, snp_dfm):
    """Yield one averaged-score frame per configured source BED file.

    ``snp_dfm`` is rendered as plain text (no header, no index) and loaded as
    a BedTool.  For each ``key -> file name`` entry of ``self.src_data_fn``
    the file is resolved inside ``self.src_data_dir`` and ``closest`` is run
    twice against it, once with ``iu=True`` and once with ``id=True``.  The
    two scores found for every SNP name are averaged.

    Yields:
        A DataFrame with a ``name`` column and a column named after ``key``
        that holds the mean of the ``iu`` and ``id`` scores.
    """
    closest_columns = [
        'snpChrom', 'snpChromStart', 'snpChromEnd', 'snpName',
        'repChrom', 'repChromStart', 'repChromEnd', 'repScore',
        'distance',
    ]

    def _flank_scores(snps, reps, score_label, **direction):
        # Run one directional `closest` and keep only (name, score).
        hits = snps.closest(reps, D='ref', **direction)
        frame = pd.read_table(StringIO(str(hits)),
                              header=None,
                              names=closest_columns,
                              usecols=['snpName', 'repScore'])
        return frame.rename(columns={'snpName': 'name',
                                     'repScore': score_label})

    snp_text = snp_dfm.to_string(index=False, header=False, index_names=False)
    snp_bed_obj = BedTool(snp_text, from_string=True)

    for key, bed_fn in self.src_data_fn.items():
        rep_bed_obj = BedTool(os.path.join(self.src_data_dir, bed_fn))

        # Downstream scores (upstream features ignored)
        downstream_dfm = _flank_scores(snp_bed_obj, rep_bed_obj,
                                       'iu_score', iu=True)
        # Upstream scores (downstream features ignored)
        upstream_dfm = _flank_scores(snp_bed_obj, rep_bed_obj,
                                     'id_score', id=True)

        paired = downstream_dfm.merge(upstream_dfm, on='name')
        mean_score = (paired['iu_score'] + paired['id_score']) / 2.0
        averaged = paired.assign(avg_score=mean_score)
        averaged = averaged.drop(['iu_score', 'id_score'], axis=1)
        yield averaged.rename(columns={'avg_score': key})
def find_closest_genes(peaks, annotation, annotationFeature, filteredoutput, referencePoint, filename):
    """
    Find the closest gene using bedtools.closest

    The peaks are sorted and matched against one of three site sets:

    * ``annotationFeature`` set: the result of ``__filter_annotation``;
    * otherwise ``referencePoint`` set: one reference coordinate per feature
      of ``annotation`` (via ``__get_reference_coordinate``);
    * otherwise: the sorted ``annotation`` itself.

    Ties are resolved with ``t="first"``.  When ``filename`` is truthy the
    result is also saved there.  Returns the resulting BedTool.
    """
    sorted_peaks = BedTool(peaks).sort()

    if annotationFeature:
        filtered_annotation = __filter_annotation(filteredoutput, annotationFeature, annotation, referencePoint)
        sites = BedTool(filtered_annotation).sort()
    elif referencePoint:
        filtered_annotation = [
            str(__get_reference_coordinate(feature, referencePoint))
            for feature in gffutils.DataIterator(annotation)
        ]
        sites = BedTool(filtered_annotation).sort()
    else:
        # Only sort the full annotation when it is actually used; previously
        # it was sorted unconditionally and the result thrown away whenever
        # one of the branches above replaced it.
        sites = BedTool(annotation).sort()

    mapped = sorted_peaks.closest(sites, t="first")
    if filename:
        mapped.saveas(filename)
    return mapped
def process(self):
    """Attach the nearest per-chromosome record to every site and store it.

    Reads the ``chr``/``coordinate`` columns of ``self.sites_file``, passes
    them through ``get_winid.convert_chr_to_num`` and then, chromosome by
    chromosome, runs ``closest`` (with distance reporting) against
    ``<self.data_dir>chr<chrom>.tsv``.  If that file is missing,
    ``self.split_by_chr()`` is called first.  Fields 0, 1, 6, 7 and 8 of each
    output row are collected, aggregated per ``(chr, coordinate)`` with
    ``self.mean_max`` and written to the ``Eigen`` key of the HDF store at
    ``self.additional_feature_file``.
    """
    all_sites = pd.read_csv(self.sites_file, usecols=['chr', 'coordinate'])
    all_sites = get_winid.convert_chr_to_num(all_sites)
    chroms = np.sort(all_sites['chr'].unique())
    closest_rows = []
    for chrom in chroms:
        print('processing sites on chr ' + str(chrom))
        # str() keeps the path build working when the chromosome labels are
        # numeric rather than strings.
        chr_file = self.data_dir + 'chr' + str(chrom) + '.tsv'
        if not os.path.exists(chr_file):
            self.split_by_chr()
        # .copy() so the column assignments below act on an independent frame
        # and not on a possible view of `all_sites`.
        chr_sites = all_sites.query('chr==@chrom').copy()
        chr_sites['coordinate'] = chr_sites['coordinate'].astype('i8')
        chr_sites['end'] = chr_sites['coordinate'] + 1
        chr_sites = BedTool([tuple(x[1]) for x in chr_sites.iterrows()])
        chr_sites_closest = chr_sites.closest(chr_file,
                                              d=True,
                                              nonamecheck=True)
        closest_rows.extend(
            [row[0], row[1], row[6], row[7], row[8]]
            for row in chr_sites_closest)
        # Drop the per-chromosome BedTools before moving on to keep peak
        # memory low.
        del chr_sites_closest
        del chr_sites
        gc.collect()
    # NOTE: the spelling 'distiance_to_nearest_eigen' is kept on purpose;
    # it is the column name that ends up in the stored table.
    all_sites_closest = pd.DataFrame(closest_rows,
                                     columns=[
                                         'chr', 'coordinate', 'eigen_phred',
                                         'eigen_pc_phred',
                                         'distiance_to_nearest_eigen'
                                     ])
    all_sites_closest = all_sites_closest.groupby(
        ['chr', 'coordinate']).apply(self.mean_max).reset_index()
    with pd.HDFStore(self.additional_feature_file, 'a') as h5s:
        h5s['Eigen'] = all_sites_closest
def feat_dist(vf, af, name):
    """Return a Series named ``name`` built from ``closest`` output.

    ``vf`` and ``af`` are loaded as BedTools and ``vf.closest(af, D="b")`` is
    run.  Each output row contributes one dict entry whose value is the last
    field of the row converted to ``int`` (presumably the distance column
    that ``D`` appends -- confirm against the bedtools version in use).
    """
    print "inside feat_dist"
    v = BedTool(vf)
    a = BedTool(af)
    closest = v.closest(a, D="b")
    # NOTE(review): the key expression of the (key, value) pair below is
    # missing in this copy of the source, so the statement does not parse.
    # It cannot be recovered from the surrounding code; restore it from
    # version control before using this function.
    results = dict([ (, int(r[len(r.fields)-1])) for r in closest ])
    print "exiting feat_dist"
    return Series(results, name=name)
def main():
    """
    Runs Python example from the manuscript

    Prints the name of every gene whose closest intergenic SNP is reported
    less than 5000 bp away.  Input files are resolved relative to the
    directory containing this script.
    """
    bedtools_dir = path.split(__file__)[0]
    snps = BedTool(path.join(bedtools_dir, '../test/data/snps.bed.gz'))
    genes = BedTool(path.join(bedtools_dir, '../test/data/hg19.gff'))
    intergenic_snps = (snps - genes)
    # stream=True: iterate over the result without writing a temp file.
    nearby = genes.closest(intergenic_snps, d=True, stream=True)
    for gene in nearby:
        # The last field is the distance column added by d=True.
        # NOTE(review): bedtools reports -1 when no feature is found on the
        # chromosome; such rows pass this filter too -- confirm intended.
        if int(gene[-1]) < 5000:
            # A bare `print` emitted nothing useful; print the gene name.
            print(gene.name)
def computeFromBed(vcfrecord,bedtoolObj,loc_simple,loc_complex):
    """Annotate ``vcfrecord`` with location labels from its closest feature.

    For every ALT allele a one-interval BedTool is built (a single base for
    SNPs and insertions, the deleted span for deletions) and matched against
    ``bedtoolObj`` with ``closest(D="b")``.  The last whitespace-separated
    token of the result is mapped through ``getLocation`` with
    ``loc_simple`` and ``loc_complex``; the detailed label is passed to
    ``updateglobalsLocation`` and both labels are stored in
    ``vcfrecord.INFO`` under ``LOC`` and ``LOC_DETAIL``.  Later ALT alleles
    overwrite earlier ones.

    Returns the same ``vcfrecord`` object.
    """
    # NOTE(review): an ALT entry that has no length (e.g. None) would fail at
    # len(alt) below -- confirm such records are filtered upstream.
    for alt in vcfrecord.ALT:
        ref_len = len(vcfrecord.REF)
        if ref_len <= len(alt):
            # SNP or insertion: one base at POS
            interval_end = vcfrecord.POS
        else:
            # Deletion: extend over the removed bases
            interval_end = vcfrecord.POS + ref_len - len(alt)
        record_bed = BedTool(
            '{} {} {}'.format(vcfrecord.CHROM, vcfrecord.POS - 1, interval_end),
            from_string=True)

        isec = record_bed.closest(bedtoolObj, D="b")
        last_token = str(isec).rstrip().split()[-1]
        simple = getLocation(last_token, loc_simple)
        detailed = getLocation(last_token, loc_complex)
        updateglobalsLocation(detailed)
        vcfrecord.INFO["LOC"] = simple
        vcfrecord.INFO["LOC_DETAIL"] = detailed
    return vcfrecord
def annotatePAS(DB_file, pas_generator, chromosome, strand):
    """Turn poly(A)-site positions into BED-like tuples, optionally annotated.

    Args:
        DB_file: annotation passed to ``closest`` (strand-aware, ``D='b'``),
            or None to skip annotation.
        pas_generator: source of positions, converted with
            ``generator_to_dict``; None means "nothing to do".
        chromosome: chromosome name used for every record.
        strand: strand used for every record.

    Returns:
        None when ``pas_generator`` is None.  Without ``DB_file``: a list of
        ``(chromosome, pos - 1, pos, value, "<chromosome>:<strand>:<n>",
        strand)`` tuples, ``n`` counting from 1.  With ``DB_file``: a list of
        tuples holding fields 0-5, 9, 10 and 12 of each ``closest`` row.
    """
    if pas_generator is None:
        return None

    pas_dict = generator_to_dict(pas_generator)

    def _record(index, pos):
        # One BED6-style record for a single position.
        return (chromosome, int(pos) - 1, pos, pas_dict[pos],
                chromosome + ":" + strand + ":" + str(index), strand)

    records = [_record(i, pos) for i, pos in enumerate(pas_dict, start=1)]

    if DB_file is None:
        # The original read an undefined name (`sub_pas`) here; the values
        # live in `pas_dict`, exactly as in the annotated branch.
        return records

    if not records:
        # Nothing to annotate; avoid handing an empty string to bedtools.
        return []

    long_bed_str = ''.join(
        '\t'.join(str(e) for e in record) + '\n' for record in records)
    pas_bed = BedTool(long_bed_str, from_string=True).sort()
    anno_pas_bed = pas_bed.closest(DB_file, s=True, D='b')
    return [
        (_apb[0], _apb[1], _apb[2], _apb[3], _apb[4], _apb[5],
         _apb[9], _apb[10], _apb[12])
        for _apb in anno_pas_bed
    ]
def add_closest(aname, bname):
    """Append the name(s) and distance of the closest ``b`` feature to ``a``.

    Runs ``a.closest(b, d=True)``, groups the output rows by the original
    ``a`` fields and writes one line per ``a`` feature: the original fields,
    the comma-joined sorted unique names obtained from ``gen_get_name`` and
    the distance.

    Returns a BedTool over the temporary file that was written.

    Raises:
        AssertionError: if tied features report different distances, or if
            the output does not have as many lines as ``a``.
    """
    a, b = BedTool(aname), BedTool(bname)
    afields = a.field_count()
    c = a.closest(b, d=True)
    get_name = gen_get_name(b, afields)
    tmp_path = BedTool._tmp()

    # keep the name and distance
    seen_by_line = collections.defaultdict(list)
    for feat in c:
        key = "\t".join(feat[:afields])
        seen_by_line[key].append([feat[-1], get_name(feat)])

    with open(tmp_path, "w") as dbed:
        for key, dist_names in seen_by_line.items():
            if len(dist_names) > 0:
                # All ties for one feature must share the same distance.
                assert len(set(pair[0] for pair in dist_names)) == 1
                names = ",".join(sorted(set(pair[1] for pair in dist_names)))
                new_line = "\t".join([key] + [names] + [dist_names[0][0]])
                dbed.write(new_line + "\n")

    # NOTE(review): the constructor argument was missing in the source; the
    # temp file written above is the only candidate -- confirm.
    d = BedTool(tmp_path)
    assert len(d) == len(a)
    return d
def add_closest(aname, bname):
    """Append the name(s) and distance of the closest ``b`` feature to ``a``.

    Runs ``a.closest(b, d=True)``, groups the output rows by the original
    ``a`` fields and writes one line per ``a`` feature: the original fields,
    the comma-joined sorted unique names obtained from ``gen_get_name`` and
    the distance.

    Returns a BedTool over the temporary file that was written.

    Raises:
        AssertionError: if tied features report different distances, or if
            the output does not have as many lines as ``a``.
    """
    a, b = BedTool(aname), BedTool(bname)
    afields = a.field_count()
    c = a.closest(b, d=True)
    get_name = gen_get_name(b, afields)
    tmp_path = BedTool._tmp()

    # keep the name and distance
    seen_by_line = collections.defaultdict(list)
    for feat in c:
        key = "\t".join(feat[:afields])
        seen_by_line[key].append([feat[-1], get_name(feat)])

    with open(tmp_path, "w") as dbed:
        # .items() instead of the Python-2-only .iteritems(); it works on
        # both major versions.
        for key, dist_names in seen_by_line.items():
            if len(dist_names) > 0:
                # All ties for one feature must share the same distance.
                assert len(set(pair[0] for pair in dist_names)) == 1
                names = ",".join(sorted(set(pair[1] for pair in dist_names)))
                new_line = "\t".join([key] + [names] + [dist_names[0][0]])
                dbed.write(new_line + "\n")

    # NOTE(review): the constructor argument was missing in the source; the
    # temp file written above is the only candidate -- confirm.
    d = BedTool(tmp_path)
    assert len(d) == len(a)
    return d
def main():
    """Annotate the regions of ``merged_dmrs_bed`` with gene-model features.

    Steps, in order:

    1. Read the GTF at ``gencode19_gtf`` and write a window around the TSS
       of every ``transcript`` row (100 kb upstream / 5 kb downstream,
       strand-aware) to a BED file.
    2. Intersect the regions with those windows and with the remaining GTF
       feature rows.
    3. Label TSS-window hits as ``Promoter`` or ``UCRD`` from the distance
       between region center and TSS, pick one annotation per region and
       gene by precedence, and label the remaining regions ``intergenic``.
    4. Write the result as pickle and TSV, and plot the distribution of
       distances to the closest principal-transcript TSS.

    NOTE(review): several expressions are missing in this copy of the source
    (marked below), so the function does not parse as it stands.  It also
    relies on names not defined in it (``project_temp_dir``,
    ``merged_dmrs_bed``, ``merged_dmrs_p``, ``results_dir``,
    ``gencode_anno_p``, ``gencode_anno_tsv``, ``expand_gtf_attributes``),
    presumably module-level -- verify.
    """
    tmpdir_obj = TemporaryDirectory(dir=project_temp_dir)
    # NOTE(review): the argument of this Path( call is missing in the source.
    tmpdir_path = Path(
    gencode_dir = Path(
        "/icgc/dkfzlsdf/analysis/hs_ontogeny/databases/gene_annotations")
    # NOTE(review): the file-name part of the next two paths appears to be
    # missing; as written both evaluate to the directory plus "/".
    gencode19_gtf_appris_principal = (
        "/icgc/dkfzlsdf/analysis/hs_ontogeny/databases/gene_annotations"
        "/")
    gencode19_gtf = (
        "/icgc/dkfzlsdf/analysis/hs_ontogeny/databases/gene_annotations"
        "/")
    all_tss_area_bed = gencode_dir.joinpath('all-tss_slop-100000-5000.bed')
    strand_dtype = CategoricalDtype(['+', '-'], ordered=True)

    # %% compute tss area intersects, ~ 30s
    gencode_df = pd.read_csv(gencode19_gtf,
                             sep='\t',
                             header=None,
                             comment='#',
                             names=[
                                 'feat_chrom', 'source', 'feature', 'Start',
                                 'End', 'score', 'feat_strand', 'frame',
                                 'attribute'
                             ],
                             dtype={
                                 'feat_chrom': str,
                                 'Start': 'i8',
                                 'End': 'i8',
                                 'feat_strand': strand_dtype
                             })
    tss_n_upstream = 100_000
    tss_n_downstream = 5000
    transcripts = gencode_df.query('feature == "transcript"').copy()
    on_plus_strand = transcripts['feat_strand'] == '+'
    # -1 placeholders; every row is overwritten by the strand-specific
    # assignments under "custom slop" below.
    transcripts['TSS'] = -1
    transcripts['feat_start'] = -1
    transcripts['feat_end'] = -1
    transcripts['feat_class'] = 'TSS_area'
    transcripts = expand_gtf_attributes(transcripts)

    # custom slop
    # The TSS is 'Start' on the plus strand and 'End' on the minus strand;
    # the window reaches tss_n_upstream against and tss_n_downstream along
    # the direction of transcription.
    transcripts.loc[on_plus_strand, 'TSS'] = transcripts.loc[on_plus_strand,
                                                             'Start']
    transcripts.loc[~on_plus_strand, 'TSS'] = transcripts.loc[~on_plus_strand,
                                                              'End']
    transcripts.loc[on_plus_strand, 'feat_start'] = transcripts.loc[
        on_plus_strand, 'Start'] - tss_n_upstream
    transcripts.loc[on_plus_strand, 'feat_end'] = transcripts.loc[
        on_plus_strand, 'Start'] + tss_n_downstream
    transcripts.loc[~on_plus_strand, 'feat_start'] = transcripts.loc[
        ~on_plus_strand, 'End'] - tss_n_downstream
    transcripts.loc[~on_plus_strand, 'feat_end'] = transcripts.loc[
        ~on_plus_strand, 'End'] + tss_n_upstream
    transcripts = transcripts.sort_values(
        ['feat_chrom', 'feat_start', 'feat_end', 'TSS'])
    # Clip windows that would start before position 0.
    transcripts.loc[transcripts['feat_start'].lt(0), 'feat_start'] = 0
    transcripts_cols = [
        'feat_chrom', 'feat_start', 'feat_end', 'TSS', 'feat_strand',
        'feat_class', 'gene_name', 'gene_id', 'transcript_id',
        'appris_principal_score'
    ]
    transcripts[transcripts_cols].to_csv(all_tss_area_bed,
                                         sep='\t',
                                         header=False,
                                         index=False)
    all_tss_area_bt = BedTool(str(all_tss_area_bed))
    merged_dmrs_bt = BedTool(str(merged_dmrs_bed))
    # wa/wb: every output row holds the region columns followed by the
    # columns of the overlapping TSS window.
    tss_intersect_bt = merged_dmrs_bt.intersect(all_tss_area_bt,
                                                wa=True,
                                                wb=True)
    tss_intersect_df = pd.read_csv(
        tss_intersect_bt.fn,
        sep='\t',
        names=['Chromosome', 'Start', 'End', 'region_id'] + transcripts_cols)
    tss_intersect_df['perc_feature'] = np.nan
    tss_intersect_df['perc_region'] = np.nan
    # -1e8 is a sentinel; the assert below checks that no row keeps it.
    tss_intersect_df['distance'] = -1e8
    tss_intersect_df['center'] = tss_intersect_df.eval(
        'Start + (End - Start)/2')
    tss_intersect_df['feat_center'] = np.nan
    tss_intersect_df['has_center'] = False
    # Distance of the region center from the TSS (negative when the center
    # has the smaller coordinate).
    tss_intersect_df['distance'] = tss_intersect_df.eval('center - TSS')
    assert tss_intersect_df['distance'].ne(-1e8).all()
    # tss_intersect_df.loc[tss_intersect_df.eval('Start <= TSS <= End'), 'distance'] = 0
    # tss_intersect_df.loc[tss_intersect_df.eval('End < TSS'), 'distance'] = tss_intersect_df.eval('End - TSS')
    # tss_intersect_df.loc[tss_intersect_df.eval('Start > TSS'), 'distance'] = tss_intersect_df.eval('Start - TSS')
    # Common column layout shared by the TSS-window and transcript-part
    # tables so that they can be concatenated later.
    full_cols = [
        'Chromosome', 'Start', 'End', 'region_id', 'center', 'feat_class',
        'perc_feature', 'perc_region', 'distance', 'has_center', 'gene_name',
        'gene_id', 'transcript_id', 'appris_principal_score', 'feat_chrom',
        'feat_start', 'feat_end', 'feat_center', 'feat_strand'
    ]
    tss_intersect_df_full = tss_intersect_df[full_cols]

    # %% compute exon, intron overlap, ~45s
    # Every GTF row except gene / start_codon / stop_codon features.
    transcript_parts = gencode_df.loc[
        ~gencode_df['feature'].isin(['gene', 'start_codon', 'stop_codon']), :]
    transcript_parts_fp = tmpdir_path.joinpath('transcript_parths.gtf')
    transcript_parts.to_csv(transcript_parts_fp,
                            sep='\t',
                            header=False,
                            index=False)
    transcript_parts_bt = BedTool(str(transcript_parts_fp))
    transcript_parts_anno = merged_dmrs_bt.intersect(transcript_parts_bt,
                                                     wa=True,
                                                     wb=True)
    transcript_parts_anno.head()
    transcript_parts_df = pd.read_csv(transcript_parts_anno.fn,
                                      sep='\t',
                                      header=None)
    transcript_parts_df.columns = [
        'Chromosome', 'Start', 'End', 'region_id'
    ] + [
        'feat_chrom', 'source', 'feat_class', 'feat_start', 'feat_end',
        'score', 'feat_strand', 'frame', 'attribute'
    ]
    # NOTE(review): the condition of both lambdas below is missing in the
    # source.  From the fallback values (0 and feat_size) these presumably
    # clip the region to the feature bounds -- confirm and restore.
    start = transcript_parts_df.eval('Start - feat_start').where(
        lambda ser:, 0)
    feat_size = transcript_parts_df.eval('feat_end - feat_start')
    end = transcript_parts_df.eval('End - feat_start').where(
        lambda ser:, feat_size)
    overlap_size = end - start
    region_size = transcript_parts_df.eval('End - Start')
    transcript_parts_df['center'] = transcript_parts_df.eval(
        'Start + (End - Start)/2')
    transcript_parts_df['feat_center'] = transcript_parts_df.eval(
        'feat_start + (feat_end - feat_start)/2')
    transcript_parts_df['distance'] = transcript_parts_df.eval(
        'center - feat_center')
    # NOTE(review): 'distance' is signed here, so every negative distance
    # satisfies lt(feat_size / 2) -- confirm whether abs() was intended.
    transcript_parts_df['has_center'] = transcript_parts_df['distance'].lt(
        feat_size / 2)
    transcript_parts_df['perc_feature'] = overlap_size / feat_size
    transcript_parts_df['perc_region'] = overlap_size / region_size
    # 'distance' was only needed for 'has_center'; it is blanked for
    # transcript parts.
    transcript_parts_df['distance'] = np.nan
    transcript_parts_df = expand_gtf_attributes(transcript_parts_df)
    transcript_parts_df_full = transcript_parts_df[full_cols]

    # %% classify into proximal and distal cis regulatory regions
    promoter_anno = tss_intersect_df_full.copy()
    is_proximal_promoter = promoter_anno.eval('-5000 <= distance <= 1000')
    is_distant_cis_regulatory_domain = promoter_anno.eval(
        '-20000 <= distance < -5000')
    promoter_anno['feat_class'] = np.nan
    promoter_anno.loc[is_proximal_promoter, 'feat_class'] = 'Promoter'
    promoter_anno.loc[is_proximal_promoter, 'has_center'] = True
    promoter_anno.loc[is_distant_cis_regulatory_domain, 'feat_class'] = 'UCRD'
    promoter_anno.loc[is_distant_cis_regulatory_domain, 'has_center'] = True
    # Rows in neither distance band keep feat_class NaN and are dropped.
    promoter_anno = promoter_anno.loc[~promoter_anno['feat_class'].isna(), :]

    # %% concatenate and type casts
    full_annos = (pd.concat([promoter_anno, transcript_parts_df_full],
                            axis=0).sort_values(['Chromosome', 'Start',
                                                 'End']))
    # Feature classes in the order they are preferred (first = preferred).
    precedence = pd.Series([
        'start_codon',
        'stop_codon',
        'Promoter',
        'UTR',
        'exon',
        'CDS',
        'UCRD',
        'transcript',
    ])

    # %% Filter according to precedence
    def filter_annotations(group_df):
        # Reduce the annotations of one region to its best-ranked class,
        # returning one row per gene name within that class.
        # idxmax() on the boolean mask gives the first class in `precedence`
        # that occurs in this group.
        highest_class = precedence.iloc[precedence.isin(
            group_df['feat_class']).idxmax()]
        class_df = group_df.query('feat_class == @highest_class').sort_values(
            ['appris_principal_score', 'perc_region'])
        # TODO: sort by appris score, then by overlap
        # A hit on nothing better than the transcript row is reported as
        # 'intron'.
        if highest_class == 'transcript':
            class_df['feat_class'] = 'intron'
        if class_df['gene_name'].nunique() == 1:
            return class_df.iloc[[0], :]
        else:
            return class_df.groupby('gene_name', as_index=False).nth(0)

    # Only annotations flagged 'has_center' take part in the filtering.
    center_annos = full_annos.loc[full_annos['has_center'], :]
    # filter, takes ~
    # could maybe be sped up by removing more introns, perhaps with better intron annotation?
    t1 = time.time()
    filtered_annos = center_annos.groupby(
        'region_id', group_keys=False).apply(filter_annotations)
    print(time.time() - t1)
    # cores = 24
    # filtered_annos_l = Parallel(cores)(delayed(filter_annotations)(group_df) for unused_name, group_df in grouped)
    # filtered_annos = pd.concat(filtered_annos_l, axis=0)
    filtered_annos.to_pickle(
        results_dir.joinpath('filtered-annos_no-intergenic.p'))
    # filtered_annos = pd.read_pickle(results_dir.joinpath('filtered-annos_no-intergenic.p'))

    # Regions without any surviving annotation are labelled 'intergenic'.
    ids_annotated_regions = filtered_annos['region_id'].unique()
    merged_dmrs_df = pd.read_pickle(merged_dmrs_p)
    intergenic_regions = merged_dmrs_df.loc[
        ~merged_dmrs_df['region_id'].isin(ids_annotated_regions), :].copy()
    intergenic_regions['feat_class'] = 'intergenic'
    # error: chromosome dtypes are different
    all_regions_annotated = (pd.concat([filtered_annos, intergenic_regions],
                                       sort=False,
                                       axis=0))
    all_regions_annotated['Chromosome'] = all_regions_annotated[
        'Chromosome'].astype(str)
    all_regions_annotated.sort_values(['Chromosome', 'Start', 'End'],
                                      inplace=True)
    # NOTE(review): hard-coded check that the unique region ids are exactly
    # 0..53230 in this order; it is tied to one specific input data set.
    assert (
        all_regions_annotated['region_id'].unique() == np.arange(53231)).all()
    # The bare value_counts() calls below discard their results
    # (presumably left over from interactive use).
    all_regions_annotated['region_id'].value_counts().value_counts()
    all_regions_annotated['feat_class'].value_counts()
    all_regions_annotated.to_pickle(gencode_anno_p)
    all_regions_annotated.to_csv(gencode_anno_tsv,
                                 sep='\t',
                                 header=True,
                                 index=False)
    filtered_annos['feat_class'].value_counts()

    gencode_df_w_attributes = expand_gtf_attributes(gencode_df)
    principal_transcripts = gencode_df_w_attributes.query(
        'appris_principal_score > 0 and feature == "transcript"').copy()
    # get TSS
    # Shrink each principal transcript to one base at its TSS: Start..Start+1
    # on the plus strand, End-1..End on the minus strand.
    tss_on_plus_strand = principal_transcripts['feat_strand'].eq('+')
    principal_transcripts.loc[
        tss_on_plus_strand,
        'End'] = principal_transcripts.loc[tss_on_plus_strand, 'Start'] + 1
    principal_transcripts.loc[
        ~tss_on_plus_strand,
        'Start'] = principal_transcripts.loc[~tss_on_plus_strand, 'End'] - 1
    principal_transcripts = principal_transcripts.sort_values(
        ['feat_chrom', 'Start', 'End'])
    principal_transcripts_fp = tmpdir_path / 'principal-transcripts.gtf'
    # Only the first nine columns are written (the GTF columns named when
    # gencode_df was read), assuming expand_gtf_attributes appends its
    # columns after them -- TODO confirm.
    principal_transcripts.iloc[:, 0:9].to_csv(principal_transcripts_fp,
                                              sep='\t',
                                              header=False,
                                              index=False)
    # pybedtools.featurefuncs.TSS has a bug
    gtf_princ_tss_bt = BedTool(str(principal_transcripts_fp))
    closest_tss_bt = merged_dmrs_bt.closest(gtf_princ_tss_bt,
                                            D='b',
                                            fu=True,
                                            t='first')
    closest_tss_df = pd.read_csv(closest_tss_bt.fn, sep='\t', header=None)
    # The last output column is taken as the distance; only values strictly
    # within +/- 100 kb are plotted.
    distances = closest_tss_df.iloc[:, -1]
    distances = distances.loc[(distances < 100_000) & (distances > -100_000)]

    import matplotlib as mpl
    from matplotlib.axes import Axes  # for autocompletion in pycharm
    from matplotlib.figure import Figure  # for autocompletion in pycharm
    mpl.use('Agg')  # import before pyplot import!
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    import seaborn as sns
    fig, ax = plt.subplots(1, 1)
    sns.distplot(distances.values, bins=1000, kde=False, ax=ax)
    fig.savefig(results_dir.joinpath('tss-distance-dist.png'))
    fig.savefig(results_dir.joinpath('tss-distance-dist.pdf'))
def get_transition_matrix_cluster(h3k27ac, h3k4me3, h3k27me3, diff_dict, cluster_index):
    """Write the control -> DMSO chromatin-state transition matrix of a cluster.

    The intervals of ``<cluster_index>.bed`` (in the hard-coded
    ``dpgp_diff_peaks_fold`` directory) are matched against the three mark
    files with ``closest``.  For every interval a three-element 0/1 state
    (h3k27ac, h3k4me3, h3k27me3) is built for ``dmso`` and for ``control``:

    * the matched peak is in ``diff_dict['<mark>_up']``   -> DMSO only
    * the matched peak is in ``diff_dict['<mark>_down']`` -> control only
    * otherwise                                           -> both

    A mark is only considered when field 3 of its ``closest`` row starts
    with ``'c'``.  Counts of (control state -> DMSO state) transitions are
    written as a tab-separated table to
    ``chipseq_transition_matrix_<cluster_index>.txt`` in the working
    directory.  Nothing is returned.
    """
    from itertools import islice

    intervals = BedTool(
        "/srv/scratch/annashch/dmso/dp_gp/dpgp_diff_peaks_fold/" +
        str(cluster_index) + ".bed")
    # Order defines the position of each mark inside the state vectors.
    mark_names = ['h3k27ac', 'h3k4me3', 'h3k27me3']
    mark_hits = [
        intervals.closest(h3k27ac, wao=True),
        intervals.closest(h3k4me3, wao=True),
        intervals.closest(h3k27me3, wao=True),
    ]

    results = dict()
    # Walk the three results in lockstep instead of indexing them: integer
    # indexing of a BedTool re-reads it from the start, which made the
    # original loop quadratic.  The rows visited are the same
    # (row i of each result, for i < len(intervals)).
    lockstep_rows = islice(zip(*mark_hits), len(intervals))
    for i, rows in enumerate(lockstep_rows):
        if i % 100 == 0:
            print(i)
        cur_interval = tuple(rows[0][0:3])
        state = {'dmso': [0, 0, 0], 'control': [0, 0, 0]}
        results[cur_interval] = state
        #determine which of 6 possible mark combinations is represented for DMSO & Control
        for column, (mark, row) in enumerate(zip(mark_names, rows)):
            if not row[3].startswith('c'):
                continue
            #is the mark differential or stable?
            overlap = tuple(row[3:6])
            if overlap in diff_dict[mark + '_up']:
                #the peak is up in dmso
                state['dmso'][column] = 1
            elif overlap in diff_dict[mark + '_down']:
                #the peak is up in control
                state['control'][column] = 1
            else:
                state['dmso'][column] = 1
                state['control'][column] = 1
    print("completed interval labels")

    #aggregate results into matrix
    transition_mat = dict()
    start_states = set()
    end_states = set()
    for state in results.values():
        start_state = tuple(state['control'])
        end_state = tuple(state['dmso'])
        start_states.add(start_state)
        end_states.add(end_state)
        row_counts = transition_mat.setdefault(start_state, dict())
        row_counts[end_state] = row_counts.get(end_state, 0) + 1

    start_states = list(start_states)
    end_states = list(end_states)
    # `with` guarantees the matrix is flushed and the handle closed; the
    # original never closed the file.
    with open("chipseq_transition_matrix_" + str(cluster_index) + ".txt",
              'w') as outf:
        outf.write('\t' + '\t'.join([str(i) for i in end_states]) + '\n')
        for s in start_states:
            outf.write(str(s))
            for e in end_states:
                outf.write('\t' + str(transition_mat[s].get(e, 0)))
            outf.write('\n')
def proximal(path1, path2, window_min, window_max, upstream=False, downstream=False, bins=None):
    """
    Find proximal feature pairs between two files and summarise their
    relative strand orientation.

    Uses the pybedtools ``closest`` function (``D='ref'``) to pair every
    feature of ``path1`` with its nearest feature in ``path2``, keeps the
    pairs whose absolute distance lies in ``[window_min, window_max]`` and
    passes their strands to ``orientation``.

    Args:
        path1, path2: inputs accepted by ``BedTool``; both are sorted first.
            Strands are read from columns 5 and 11 of the ``closest`` output
            and the distance from the last column.
        window_min, window_max: inclusive bounds on the absolute distance.
        upstream: if True (and ``downstream`` is not), run ``closest`` with
            ``iu=True``.
        downstream: if True (and ``upstream`` is not), run ``closest`` with
            ``id=True``.  With both or neither set, no direction filter is
            applied.
        bins: if not None, forwarded to ``binner`` and the orientation counts
            are additionally computed per distance bin.

    Returns:
        A pair of tuples: ``(Distances_orientations, p_p, m_m, p_m, m_p,
        same_strand, opposite_strand, convergent, divergent)`` for all kept
        pairs, and ``(Bins, ...)`` with one list per count holding the
        per-bin values (all empty when ``bins`` is None).
    """
    # Finds the occurrences within the proximity limits and saves their pairwise orientation.
    DataL1 = BedTool(path1).sort()
    DataL2 = BedTool(path2).sort()

    if upstream == downstream and upstream == True:
        direction = {}
    elif upstream is True:
        direction = {'id': False, 'iu': True}
    elif downstream is True:
        direction = {'iu': False, 'id': True}
    else:
        direction = {}
    closest = DataL1.closest(DataL2, D='ref', **direction)

    closest_df = closest.to_dataframe()
    Strand1_init = list(closest_df.iloc[:, 5])
    Strand2_init = list(closest_df.iloc[:, 11])
    Distance_init = list(closest_df.iloc[:, -1])

    # Pairs with a non-negative distance keep their strand order; pairs with
    # a negative distance have the two strands swapped.  Plain lists are
    # filled here because the previous `zip(*generator)` unpacking raised
    # ValueError whenever one of the two groups was empty -- which is the
    # normal case for upstream-only / downstream-only runs.
    non_negative = []
    negative = []
    for dist, strand1, strand2 in zip(Distance_init, Strand1_init,
                                      Strand2_init):
        if not (abs(dist) <= window_max and abs(dist) >= window_min):
            continue
        if dist >= 0:
            non_negative.append((dist, strand1, strand2))
        else:
            negative.append((dist, strand2, strand1))
    kept = non_negative + negative
    Distance = [dist for dist, _, _ in kept]
    Strand1 = [first for _, first, _ in kept]
    Strand2 = [second for _, _, second in kept]

    p_p, m_m, p_m, m_p, same_strand, opposite_strand, convergent, divergent = orientation(
        Strand1, Strand2)

    # Calculate the distance distributions for all orientations
    Distances_orientations = get_distance_orientations(Distance, Strand1,
                                                       Strand2, window_min,
                                                       window_max)

    p_pL_bin = []
    m_mL_bin = []  # Same orientation
    p_mL_bin = []
    m_pL_bin = []  # Opposite orientation
    same_strandL_bin = []
    opposite_strandL_bin = []  # Combined same / opposite orientations
    convergentL_bin = []
    divergentL_bin = []
    Bins = []
    if bins is not None:
        # Performs the same analysis for each bin.
        Bins = binner(window_min, window_max, bins)
        for min_bin, max_bin in Bins:
            # NOTE(review): distances keep their sign here, so negative ones
            # only fall into bins that cover negative values -- confirm
            # against the range `binner` produces.
            in_bin = [(strand1, strand2)
                      for dist, strand1, strand2 in kept
                      if dist >= min_bin and dist < max_bin]
            Strand1Bin = [strand1 for strand1, _ in in_bin]
            Strand2Bin = [strand2 for _, strand2 in in_bin]
            p_p_bin, m_m_bin, p_m_bin, m_p_bin, same_strand_bin, opposite_strand_bin, convergent_bin, divergent_bin = orientation(
                Strand1Bin, Strand2Bin)
            p_pL_bin.append(p_p_bin)
            m_mL_bin.append(m_m_bin)  # Same orientation, per bin
            p_mL_bin.append(p_m_bin)
            m_pL_bin.append(m_p_bin)  # Opposite orientation per bin
            same_strandL_bin.append(same_strand_bin)
            opposite_strandL_bin.append(opposite_strand_bin)
            convergentL_bin.append(convergent_bin)
            divergentL_bin.append(divergent_bin)

    return (Distances_orientations, p_p, m_m, p_m, m_p, same_strand,
            opposite_strand, convergent,
            divergent), (Bins, p_pL_bin, m_mL_bin, p_mL_bin, m_pL_bin,
                         same_strandL_bin, opposite_strandL_bin,
                         convergentL_bin, divergentL_bin)
#!/usr/bin/python
"""
Example from the manuscript to print the names of genes that are <5000 bp away
from intergenic SNPs.

See the accompanying shell script for the equivalent command-line version.
"""
from pybedtools import BedTool

snps = BedTool('../test/data/snps.bed.gz')
genes = BedTool('../test/data/hg19.gff')
intergenic_snps = (snps - genes)
# stream=True: iterate over the result without writing a temp file.
nearby = genes.closest(intergenic_snps, d=True, stream=True)
for gene in nearby:
    # The last field is the distance column added by d=True.
    # NOTE(review): bedtools reports -1 when no feature is found on the
    # chromosome; such rows pass this filter too -- confirm intended.
    if int(gene[-1]) < 5000:
        # A bare `print` never printed the gene name the docstring promises.
        print(gene.name)
grch38gff='/home/drew/Desktop/IPyNB-Variant-Analysis/data/cuffcmp.combined.gtf'
#snps = BedTool('snps.bed.gz') # [1]
genes = BedTool(grch38gff) # [1]


# In[ ]:


get_ipython().run_cell_magic('bash', '', 'ln -P /home/drew/Desktop/IPyNB-Variant-Analysis/data\nln -P /media/drew/easystore/ReferenceGenomes/GCA_000001405.15_GRCh38_no_alt_analysis_set/\nln -P /media/drew/easystore/ReferenceGenomes/GRCh38/')


# In[ ]:


# NOTE(review): `snps` is only assigned in the commented-out line above, so
# this cell raises NameError unless `snps` was defined earlier in the
# session.  The SNP file to use cannot be determined from this code.
intergenic_snps = snps.subtract(genes) # [2]
nearby = genes.closest(intergenic_snps, d=True, stream=True) # [2, 3]
for gene in nearby: # [4]
    # The last field is the distance column added by d=True.
    if int(gene[-1]) < 5000: # [4]
        # A bare `print` printed nothing useful; print the gene name.
        print(gene.name) # [4]


# In[ ]:


get_ipython().run_cell_magic('bash', '', 'cd /media/drew/easystore/GoodCell-Resources/AnalysisBaseDir/GSA_Data/2018_07\npwd\nls -l */*vcf')


# In[ ]:
def run(chipdir, refseq, filedir, DMSO, CA):
    """Plot a DMSO-vs-CA pausing-index scatter for genes near ChIP peaks.

    Steps:

    1. Run ``closest`` (with distance) for ``chipdir`` against ``refseq``,
       keep columns 9-14 and 21, and de-duplicate by coordinates.
    2. For every record longer than 2000 bp whose distance exceeds 10000,
       write a TSS window (-200..+1000 relative to the strand-aware start)
       and the remaining gene body to ``SRF.TSS.bed`` / ``SRF.gene.bed``.
    3. Sum column 4 of ``DMSO`` and ``CA`` over both interval sets with
       ``bedtools map``.
    4. Plot TSS/gene-body ratios (DMSO on x, CA on y), coloured by a
       Gaussian kernel density estimate, to ``PausingIndex.png``.

    All intermediate files are written into ``filedir``.

    Raises:
        RuntimeError: if a ``bedtools map`` command exits with a non-zero
            status.
    """
    tss_window = (-200, 1000)
    a = BedTool(chipdir)
    b = a.closest(refseq, d=True)
    b.cut([9, 10, 11, 12, 13, 14, 21]).saveas(filedir + "/SRF_closest.bed")

    # De-duplicate by (chrom, start, stop); a later line replaces an earlier
    # one with the same coordinates.
    by_coords = dict()
    with open(filedir + "/SRF_closest.bed") as F:
        for line in F:
            fields = line.strip().split()
            chrom, start, stop = fields[0:3]
            by_coords[chrom + "\t" + start + "\t" + stop + "\t"] = "\t".join(fields[3:])
    with open(filedir + "/SRF_closest.rmdup.bed", "w") as outfile:
        for key in by_coords:
            # Skip records with "." as any of the three coordinate fields.
            if "." not in key.split():
                outfile.write(key + by_coords[key] + "\n")

    a = BedTool(filedir + "/SRF_closest.rmdup.bed")
    a.sort().saveas(filedir + "/SRF_closest.rmdup.sorted.bed")

    with open(filedir + "/SRF.TSS.bed", "w") as outfile, \
            open(filedir + "/SRF.gene.bed", "w") as outfile2, \
            open(filedir + "/SRF_closest.rmdup.sorted.bed") as F:
        for line in F:
            chrom, start, stop, gene, number, strand, distance = line.strip().split()
            if int(stop) - int(start) > 2000 and int(distance) > 10000:
                # `==`, not `is`: string identity is an implementation detail.
                if strand == "+":
                    outfile.write(chrom + "\t" + str(int(start) + tss_window[0]) + "\t" + str(int(start) + tss_window[1]) + "\n")
                    outfile2.write(chrom + "\t" + str(int(start) + tss_window[1]) + "\t" + stop + "\n")
                else:
                    outfile.write(chrom + "\t" + str(int(stop) - tss_window[1]) + "\t" + str(int(stop) - tss_window[0]) + "\n")
                    outfile2.write(chrom + "\t" + start + "\t" + str(int(stop) - tss_window[1]) + "\n")

    tss_bed = filedir + "/SRF.TSS.bed"
    genes_bed = filedir + "/SRF.gene.bed"
    a = BedTool(tss_bed)
    a.sort().saveas(tss_bed)
    a = BedTool(genes_bed)
    a.sort().saveas(genes_bed)

    map_jobs = [
        (genes_bed, DMSO, "/DMSO.genes.bed"),
        (tss_bed, DMSO, "/DMSO.TSS.bed"),
        (genes_bed, CA, "/CA.genes.bed"),
        (tss_bed, CA, "/CA.TSS.bed"),
    ]
    for a_bed, b_bed, out_name in map_jobs:
        # NOTE(review): the command goes through the shell unquoted; paths
        # containing spaces or shell metacharacters will break it.
        command = "bedtools map -a " + a_bed + " -b " + b_bed + " -c 4 -o sum > " + filedir + out_name
        status = os.system(command)
        if status != 0:
            # Fail loudly instead of parsing a truncated output file below.
            raise RuntimeError("command failed with status %s: %s" % (status, command))

    def _last_value(line):
        # Last column of a `bedtools map` line; "." (no overlap) counts as 0.
        value = line.strip().split()[-1]
        return 0.0 if value == "." else float(value)

    TRx = list()
    TRy = list()
    with open(filedir + "/DMSO.genes.bed") as dmso_genes, \
            open(filedir + "/DMSO.TSS.bed") as dmso_tss, \
            open(filedir + "/CA.genes.bed") as ca_genes, \
            open(filedir + "/CA.TSS.bed") as ca_tss:
        # The four files are read in lockstep, one line each per record.
        for line in dmso_genes:
            DMSOTSS = _last_value(dmso_tss.readline())
            CAgene = _last_value(ca_genes.readline())
            CATSS = _last_value(ca_tss.readline())
            DMSOgene = _last_value(line)
            # Ratio of TSS signal to gene-body signal; 0 when the body is empty.
            TRx.append(0.0 if DMSOgene == 0.0 else DMSOTSS / DMSOgene)
            TRy.append(0.0 if CAgene == 0.0 else CATSS / CAgene)

    F6 = plt.figure()
    ax1 = F6.add_subplot(111)
    xy = np.vstack([TRx, TRy])
    z = gaussian_kde(xy)(xy)
    ax1.scatter(TRx, TRy, c=z, edgecolor="")
    ax1.set_title("Pausing Index")
    ax1.set_ylabel("CA")
    ax1.set_xlabel("DMSO")
    ax1.get_xaxis().tick_bottom()
    ax1.get_yaxis().tick_left()
    ax1.set_xlim([0, 20])
    ax1.set_ylim([0, 20])
    ax1.plot([0, 50.0], [0, 50.0], color="k")
    # NOTE(review): `figuredir` is not a parameter of this function; it must
    # exist at module level for this line to work -- verify.
    plt.savefig(figuredir + "/PausingIndex.png")
peak_background = peak_background.sort() gene_background = gene_background.sort() return peak_background, gene_background import pdb for cluster in range(1, 7): #get the peaks & genes for the current cluster peak_bed = BedTool(str(cluster) + ".peaks.bed") gene_bed = BedTool(str(cluster) + ".genes.bed") #get the background peak_background, gene_background = get_background(cluster) #peak to gene closest, cur cluster peak_to_gene_foreground = [ int(str(i).strip().split('\t')[-1]) for i in peak_bed.closest(gene_bed, wao=True, d=True, t="first") ] #gene to peak closest, cur cluster gene_to_peak_foreground = [ int(str(i).strip().split('\t')[-1]) for i in gene_bed.closest(peak_bed, wao=True, d=True, t="first") ] #peak to gene closest, background peak_to_gene_background = [ int(str(i).strip().split('\t')[-1]) for i in peak_bed.closest(gene_background, wao=True, d=True, t="first") ] #gene to peak closest, background gene_to_peak_background = [ int(str(i).strip().split('\t')[-1]) for i in gene_bed.closest(peak_background, wao=True, d=True, t="first")
csv_writer.writerow(out) print("\n\nFile(s) generated:\n\t", fileOut) def chrom_format(gencode): return BedTool([list(j.replace('chr', '') for j in i) for i in gencode]) ### ANNOTATE ##### if not delly_format and gencode and len(bedpe_list) > 0: bedpe_bed = BedTool(bedpe_list) if not 'chr' in bedpe_list[0][0]: gencode = chrom_format(gencode) bedpe_gencode = bedpe_bed.closest(gencode, d=True) bedpe_bed2 = BedTool([i[3:6] + i[:] for i in bedpe_gencode[:]]) del (bedpe_gencode) bedpe_gencode = bedpe_bed2.closest(gencode, d=True) bedpe_list2 = [i[3:] for i in bedpe_gencode[:]] bedpe_header = header + [ 'chrom_gene1', 'start_gene1', 'end_gene1', 'name_gene1', 'strand_gene1', 'dist_gene1', 'chrom_gene2', 'start_gene2', 'end_gene2', 'name_gene2', 'strand_gene2', 'dist_gene2', 'fusion_gene' ] with open(fileOutAnno, 'wb') as wout: bedpe_writer = csv.writer(wout, delimiter="\t") max_distance = 50000 gene_list = list() for r in bedpe_list2: r = [i.replace('\r', '') for i in r]
def run(DMSO, Nutlin1, Nutlin3, P53, figuredir, file2dir):
    """Compare distances between successive "waves" with a random expectation.

    Waves are built from the inputs with BedTool ``+`` / ``-``:

    * wave 1: ``DMSO + P53``
    * wave 2: ``Nutlin1 + P53 - DMSO``
    * wave 3: ``Nutlin3 + P53 - Nutlin1 - DMSO``

    and saved as ``Wave1.bed`` .. ``Wave3.bed`` under ``file2dir``.  For each
    wave an equally sized random sample (with replacement) of the P53
    intervals serves as the expected set.  The log10 distance from each
    wave-2 feature to the closest wave-1 feature, and from wave 3 to wave 2,
    is histogrammed next to its random counterpart together with a
    cumulative curve; the two-sample KS p-value is shown in the histogram
    titles.  The figure is saved as ``Cluster_analysis.png`` under
    ``figuredir``.

    ``figuredir`` and ``file2dir`` are used as string prefixes, so they need
    a trailing path separator.
    """
    D = BedTool(DMSO)
    N1 = BedTool(Nutlin1)
    N3 = BedTool(Nutlin3)
    P = BedTool(P53).cut([0, 1, 2])

    start = time.time()
    w1 = (D + P).saveas(file2dir + 'Wave1.bed')
    w1rand = BedTool([P[i] for i in rn.randint(0, len(P), len(w1))]).sort()
    w2 = (N1 + P - D).saveas(file2dir + 'Wave2.bed')
    w2rand = BedTool([P[i] for i in rn.randint(0, len(P), len(w2))]).sort()
    w3 = (N3 + P - N1 - D).saveas(file2dir + 'Wave3.bed')
    w3rand = BedTool([P[i] for i in rn.randint(0, len(P), len(w3))]).sort()
    end = time.time()
    print(end - start)

    def _log10_distances(hits):
        # log10 of the last field (distance) of every row; 0 where the log
        # is undefined (distance <= 0) or the field is not numeric.
        values = []
        for hit in hits:
            try:
                values.append(math.log(float(hit[-1]), 10))
            except ValueError:
                # Narrowed from a bare `except:` so unrelated errors are no
                # longer silently turned into zeros.
                values.append(0)
        return values

    w21 = _log10_distances(w2.closest(w1, d=True))
    w2r1r = _log10_distances(w2rand.closest(w1rand, d=True))
    w32 = _log10_distances(w3.closest(w2, d=True))
    w3r2r = _log10_distances(w3rand.closest(w2rand, d=True))
    # Single formatted string: same output under Python 2 and 3 (the
    # original was a Python 2 print statement).
    print("%d %d %d %d" % (len(w21), len(w2r1r), len(w32), len(w3r2r)))

    hist_bins = np.arange(0, 8 + 0.2, 0.2)
    cdf_bins = np.arange(0, 18 + 0.2, 0.2)

    def _cumulative(values):
        # `density=True` replaces the `normed=True` keyword that newer NumPy
        # no longer accepts; for these equal-width bins the result is the
        # same.
        # NOTE(review): the cumulative sum is not scaled by the bin width
        # (0.2), so the curve ends at 5 rather than 1 -- confirm intended.
        counts, bin_edges = np.histogram(values, bins=cdf_bins, density=True)
        return bin_edges[1:], np.cumsum(counts)

    def _draw(ax_hist, ax_cdf, title, observed, expected):
        # One row of the figure: overlaid histograms + cumulative curves.
        pval = stats.ks_2samp(observed, expected)[1]
        ax_hist.set_title(title + ' (pval: ' + str(pval) + ')')
        ax_hist.set_ylabel('Count')
        ax_hist.set_xlabel('Log 10 Distance (bp)')
        ax_hist.hist(observed, bins=hist_bins, alpha=0.5, color='green')
        ax_hist.hist(expected, bins=hist_bins, alpha=0.5, color='red')
        ax_hist.legend(['Observed', 'Expected'], loc='upper left')

        ax_cdf.set_title('Cumulative distribution function')
        ax_cdf.set_ylabel('CDF')
        ax_cdf.set_xlabel('Log 10 Distance (bp)')
        edges, cdf = _cumulative(observed)
        ax_cdf.plot(edges, cdf, color='green')
        edges_r, cdf_r = _cumulative(expected)
        ax_cdf.plot(edges_r, cdf_r, color='red')
        ax_cdf.legend(['Observed', 'Expected'], loc='upper left')

    F = plt.figure()
    ax1 = F.add_subplot(221)
    ax2 = F.add_subplot(222)
    ax3 = F.add_subplot(223)
    ax4 = F.add_subplot(224)
    _draw(ax1, ax2, 'Wave2 to Wave1', w21, w2r1r)
    _draw(ax3, ax4, 'Wave3 to Wave2', w32, w3r2r)
    plt.savefig(figuredir + 'Cluster_analysis.png', dpi=1200)