def merge_overlapped_peaks(self, debug=False):
    """Merge overlapping peaks in self.binding_df and remap each peak's
    max position into the coordinates of its merged interval.

    Replaces self.binding_df with a frame of the same row count whose
    'start'/'end' come from the merged intervals and whose peak-max
    column ('peak_max_new') is re-expressed relative to the merged start.

    Raises AssertionError if the intersect step changes the row count or
    yields a non-positive remapped peak max.
    """
    if f_judge_debug(debug):
        import ipdb
        ipdb.set_trace()
    # Merge overlapping intervals (counting how many originals each merged
    # interval absorbed), then map every original peak back onto its
    # merged interval with intersect(wao=True).
    bed_obj = my.f_pd_to_bed_based_on_file(self.binding_df)
    bed_merge = bed_obj.merge(c=1, o='count')
    merged_bed = bed_merge.intersect(bed_obj, wao=True)
    merged_df = my.f_bed_to_pd2(merged_bed)
    merged_df.columns = ['chr', 'start', 'end', 'overlap_count',
                         'chr2', 'start2', 'end2', 'name', 'col4', 'col5',
                         'col6', 'col7', 'col8', 'peakMax', 'overlap']
    # peakMax is positioned relative to the original interval (start2);
    # shift it so it is relative to the merged interval's start.
    merged_df['peak_max_new'] = merged_df.start2 + merged_df.peakMax - merged_df.start
    # Debug aid: show a sample of intervals built from two merged peaks.
    print(merged_df.ix[merged_df.overlap_count == 2].head())
    assert merged_df.shape[0] == self.binding_df.shape[0], 'Merge error '
    assert all(merged_df.peak_max_new > 0), 'Merge error'
    self.binding_df = merged_df.ix[:, ['chr', 'start', 'end', 'name',
                                       'col4', 'col5', 'col6', 'col7',
                                       'col8', 'peak_max_new']]
def overlap_with_other_bed(self, other_bed, debug=False):
    """Intersect self.binding_df with *other_bed* and return the overlaps.

    Fix: the debug flag previously defaulted to True, so every normal
    call dropped into the ipdb debugger; it now defaults to False like
    the sibling methods.

    Returns a DataFrame with columns
    ['chr', 'start', 'end', 'info', 'overlap_name'] — one row per
    overlap, where 'overlap_name' is column 4 of *other_bed*.

    Raises Exception when there is no overlap at all, and AssertionError
    when 'overlap_name' contains duplicates.
    """
    if f_judge_debug(debug):
        import ipdb
        ipdb.set_trace()
    bed_data = my.f_pd_to_bed_based_on_file(self.binding_df)
    overlap_regions = bed_data.intersect(other_bed, wo=True)
    if overlap_regions.count() == 0:
        raise Exception('Empty overlap between features and vcf file')
    # Keep only chrom/start/end/name of the feature plus the name field
    # (column index 7) contributed by other_bed.
    overlap_db = my.f_bed_to_pd2(overlap_regions).ix[:, [0, 1, 2, 3, 7]]
    print(overlap_db.head())
    overlap_db.columns = ['chr', 'start', 'end', 'info', 'overlap_name']
    assert overlap_db.duplicated(
        'overlap_name').sum() == 0, 'Duplicated vcf positions'
    return overlap_db
def merge_single_file(self, debug=False):
    """Collapse overlapping intervals of self.binding_df into merged
    regions, keeping a per-region count of how many intervals merged.

    Replaces self.binding_df with a frame of columns
    ['chr', 'start', 'end', 'overlap_count'].
    """
    if f_judge_debug(debug):
        import ipdb
        ipdb.set_trace()
    peaks_bed = my.f_pd_to_bed_based_on_file(self.binding_df)
    collapsed = peaks_bed.merge(c=1, o='count')
    collapsed_df = my.f_bed_to_pd2(collapsed)
    collapsed_df.columns = ['chr', 'start', 'end', 'overlap_count']
    self.binding_df = collapsed_df
def bed_to_fastq(self, overlap_db, tmp_dir, debug=False, flank=550):
    """Extract reference sequence windows around each overlap position
    and write them to a fasta file in *tmp_dir*.

    overlap_db: DataFrame with columns 'chr', 'start', 'info',
        'overlap_name' (shape produced by overlap_with_other_bed; 'info'
        is assumed to be an integer offset — TODO confirm with caller).
        Fix: the frame is no longer mutated in place; a copy is used.
    tmp_dir: directory the fasta file is written into.
    flank: bases taken on each side of start+info. Default 550 preserves
        the original hard-coded window (and the 'wt1100' filename).

    Returns the path of the written fasta file.
    """
    if f_judge_debug(debug):
        import ipdb
        ipdb.set_trace()
    # Work on a copy so the caller's DataFrame keeps its original columns.
    overlap_db = overlap_db.copy()
    center = overlap_db['start'].astype(int) + overlap_db['info'].astype(int)
    overlap_db['fastq_start'] = center - flank
    overlap_db['fastq_end'] = center + flank
    # Region name embeds the window start so sequences stay identifiable.
    overlap_db['name'] = overlap_db['overlap_name'] + '_' + overlap_db[
        'fastq_start'].map(str)
    fastq_bed = my.f_pd_to_bed_based_on_file(
        overlap_db.ix[:, ['chr', 'fastq_start', 'fastq_end', 'name']])
    hg19_file = my.f_get_reference_genome()
    fasta = pybedtools.example_filename(hg19_file)
    seqs = fastq_bed.sequence(fi=fasta, name=True)
    # Filename encodes the total window width (2*flank): 'wt1100' at the
    # default flank of 550, byte-identical to the original behavior.
    fasta_file = os.path.join(tmp_dir, 'infile.vcf.wt%d.fasta' % (2 * flank))
    import shutil
    shutil.copyfile(seqs.seqfn, fasta_file)
    return fasta_file