예제 #1
0
    def merge_overlapped_peaks(self, debug=False):
        """Re-anchor every peak in ``self.binding_df`` onto its merged interval.

        Overlapping intervals are merged, then each merged interval is
        intersected back against the original peaks so that every original
        peak is paired with the merged interval that contains it. The peak
        summit offset is then re-expressed relative to the merged interval's
        start.

        The column labels assigned below imply that ``self.binding_df``
        converts to a 10-column BED-like table
        (chr, start, end, name, col4..col8, peakMax), with ``peakMax`` used
        as an offset from the peak's own start.

        On return ``self.binding_df`` holds one row per original peak with
        the columns ``chr, start, end, name, col4, col5, col6, col7, col8,
        peak_max_new``, where ``start``/``end`` are the merged interval's
        coordinates.

        Args:
            debug: Forwarded to ``f_judge_debug``; when that returns a true
                value an ``ipdb`` session is opened before any work is done.

        Raises:
            AssertionError: If the intersect output does not have exactly one
                row per original peak, or if any recomputed summit offset is
                not strictly positive.
        """
        if f_judge_debug(debug):
            import ipdb
            ipdb.set_trace()

        bed_obj = my.f_pd_to_bed_based_on_file(self.binding_df)
        # Fourth column of the merge output is labelled 'overlap_count' below.
        bed_merge = bed_obj.merge(c=1, o='count')

        # Pair each merged interval with the original peaks it came from.
        merged_bed = bed_merge.intersect(bed_obj, wao=True)

        merged_df = my.f_bed_to_pd2(merged_bed)
        merged_df.columns = [
            'chr', 'start', 'end', 'overlap_count', 'chr2', 'start2', 'end2',
            'name', 'col4', 'col5', 'col6', 'col7', 'col8', 'peakMax',
            'overlap'
        ]
        # Absolute summit position (start2 + peakMax) expressed as an offset
        # from the merged interval's start.
        merged_df['peak_max_new'] = (
            merged_df.start2 + merged_df.peakMax - merged_df.start)
        # `.ix` was removed from pandas; a boolean mask selects rows via
        # `.loc`. The single-argument parenthesized print behaves the same
        # on Python 2 and Python 3.
        print(merged_df.loc[merged_df.overlap_count == 2].head())

        assert merged_df.shape[0] == self.binding_df.shape[0], 'Merge error '
        assert all(merged_df.peak_max_new > 0), 'Merge error'
        self.binding_df = merged_df.loc[:, [
            'chr', 'start', 'end', 'name', 'col4', 'col5', 'col6', 'col7',
            'col8', 'peak_max_new'
        ]]
예제 #2
0
    def overlap_with_other_bed(self, other_bed, debug=True):
        """Intersect ``self.binding_df`` with ``other_bed`` and tabulate hits.

        Args:
            other_bed: Second operand handed straight to ``intersect``
                (presumably a BedTool or a path to a BED/VCF file, judging
                by the error message -- confirm against callers).
            debug: Forwarded to ``f_judge_debug``; when that returns a true
                value an ``ipdb`` session is opened first.
                NOTE(review): this defaults to True while the sibling
                methods default to False; left unchanged to preserve the
                interface -- confirm whether that is intentional.

        Returns:
            A DataFrame built from the columns at positions 0, 1, 2, 3 and 7
            of the intersect output, relabelled
            ``chr, start, end, info, overlap_name``.

        Raises:
            Exception: If the intersection is empty.
            AssertionError: If any ``overlap_name`` value occurs more than
                once.
        """
        if f_judge_debug(debug):
            import ipdb
            ipdb.set_trace()
        bed_data = my.f_pd_to_bed_based_on_file(self.binding_df)
        overlap_regions = bed_data.intersect(other_bed, wo=True)
        if overlap_regions.count() == 0:
            raise Exception('Empty overlap between features and vcf file')

        # `.ix` was removed from pandas; the integers here are column
        # positions of the raw intersect output, so `.iloc` is the explicit
        # replacement.
        overlap_db = my.f_bed_to_pd2(overlap_regions).iloc[:, [0, 1, 2, 3, 7]]
        # The single-argument parenthesized print behaves the same on
        # Python 2 and Python 3.
        print(overlap_db.head())
        overlap_db.columns = ['chr', 'start', 'end', 'info', 'overlap_name']
        assert overlap_db.duplicated(
            'overlap_name').sum() == 0, 'Duplicated vcf positions'
        return overlap_db
예제 #3
0
    def merge_single_file(self, debug=False):
        """Replace ``self.binding_df`` with its merged-interval summary.

        The current ``self.binding_df`` is converted to a BED object, its
        intervals are merged with a per-interval count, and the result is
        stored back on ``self.binding_df`` with the columns
        ``chr, start, end, overlap_count``.

        Args:
            debug: Forwarded to ``f_judge_debug``; when that returns a true
                value an ``ipdb`` session is opened before any work is done.
        """
        if f_judge_debug(debug):
            import ipdb
            ipdb.set_trace()

        # Convert and merge in one step; the merge output's fourth column is
        # labelled 'overlap_count' below.
        collapsed_bed = my.f_pd_to_bed_based_on_file(
            self.binding_df).merge(c=1, o='count')

        collapsed = my.f_bed_to_pd2(collapsed_bed)
        collapsed.columns = ['chr', 'start', 'end', 'overlap_count']
        self.binding_df = collapsed
예제 #4
0
    def bed_to_fastq(self, overlap_db, tmp_dir, debug=False, flank=550):
        """Extract reference sequence around each row and save it as FASTA.

        For every row the window ``start + info - flank`` to
        ``start + info + flank`` is computed, the reference sequence for
        that window is fetched, and the resulting FASTA file is copied into
        ``tmp_dir``. Despite the method name, the output is FASTA.

        ``overlap_db`` is modified in place: the columns ``fastq_start``,
        ``fastq_end`` and ``name`` are added (or overwritten).

        Args:
            overlap_db: DataFrame with at least the columns ``chr``,
                ``start``, ``info`` and ``overlap_name``; ``start`` and
                ``info`` must be castable to int.
            tmp_dir: Directory that receives the FASTA file.
            debug: Forwarded to ``f_judge_debug``; when that returns a true
                value an ``ipdb`` session is opened first.
            flank: Number of bases taken on each side of ``start + info``.
                The default of 550 reproduces the previous fixed window.

        Returns:
            Path of the written file,
            ``<tmp_dir>/infile.vcf.wt<2*flank>.fasta``
            (``infile.vcf.wt1100.fasta`` with the default flank).
        """
        if f_judge_debug(debug):
            import ipdb
            ipdb.set_trace()

        # Window centre, computed once and shared by both boundaries.
        center = (overlap_db['start'].astype(int) +
                  overlap_db['info'].astype(int))
        # NOTE(review): fastq_start is not clipped, so it goes negative when
        # the centre lies within `flank` bases of the chromosome start.
        overlap_db['fastq_start'] = center - flank
        overlap_db['fastq_end'] = center + flank
        overlap_db['name'] = (overlap_db['overlap_name'] + '_' +
                              overlap_db['fastq_start'].map(str))
        # `.ix` was removed from pandas; label-based column selection is
        # spelled `.loc`.
        fastq_bed = my.f_pd_to_bed_based_on_file(
            overlap_db.loc[:, ['chr', 'fastq_start', 'fastq_end', 'name']])

        hg19_file = my.f_get_reference_genome()
        fasta = pybedtools.example_filename(hg19_file)
        a = fastq_bed.sequence(fi=fasta, name=True)
        fasta_file = os.path.join(
            tmp_dir, 'infile.vcf.wt%d.fasta' % (2 * flank))
        import shutil
        shutil.copyfile(a.seqfn, fasta_file)
        return fasta_file