Example #1
0
File: boxplot.py  Project: eggduzao/reg-gen
 def bedCoverage(self):
     """ Return coverage matrix of multiple reads on one bed.
     bed --> GenomicRegionSet
     """
     columns = []
     for read_path in self.reads:
         # only the tail of the path is shown so long paths stay readable
         print("    processing: ..." + read_path[-45:])
         # resolve the (possibly relative) path to an absolute one
         abs_path = os.path.abspath(read_path)
         cov_set = CoverageSet(abs_path, self.all_bed)
         cov_set.coverage_from_genomicset(abs_path)
         cov_set.normRPM()
         columns.append(cov_set.coverage)
     self.all_table = numpy.transpose(columns)
Example #2
0
def norm_gene_level(bams, bed, name, verbose, folder, report):
    """Normalize bam files on a gene level. Give out list of normalization factors.

    Builds a sample x housekeeping-gene signal matrix and derives one
    normalization factor per sample from it.

    Parameters
    ----------
    bams : list of BAM file paths
    bed : experimental-matrix description containing the housekeeping genes
    name : experiment name (kept for interface compatibility; unused here)
    verbose : if True, print the signal matrix to stderr
    folder : output folder handed through to get_factor_matrix
    report : if True, also produce the get_factor_matrix report

    Returns
    -------
    (factors, samples) : normalization factors and BAM base names.
    """
    m = get_experimental_matrix(bams, bed)
    # names of the matrix entries typed as 'reads' -- these are the samples.
    # Use a list comprehension: in Python 3, zip/filter/map are one-shot
    # iterators and would be exhausted after a single pass.
    read_names = [n for t, n in zip(m.types, m.names) if t == 'reads']

    # GenomicRegionSet containing housekeeping genes
    regions = m.objectsDict['housekeep']

    covs = []
    for cond in read_names:
        bam_path = m.files[cond]
        c = CoverageSet(cond, regions)
        c.coverage_from_bam(bam_file=bam_path)
        c.genomicRegions.sort()
        covs.append(c)

    assert len(covs) > 0
    # matrix sample x gene of (signal + 1) pseudocounts
    signals = [[sum(cov.coverage[i]) + 1
                for i in range(len(cov.genomicRegions))] for cov in covs]
    gene_names = [covs[0].genomicRegions[i].name
                  for i in range(len(covs[0].genomicRegions))]

    colnames = gene_names
    d = np.matrix(signals, dtype=float)
    # materialize as a list: a map() iterator would be exhausted after the
    # first use, and `samples` is both passed to get_factor_matrix below
    # and returned to the caller
    samples = [os.path.splitext(os.path.basename(b))[0] for b in bams]
    if verbose:
        print("-Housekeeping gene matrix (columns-genes, rows-samples)",
              file=sys.stderr)
        print(d, file=sys.stderr)
        print("", file=sys.stderr)

    if verbose or report:
        # output R code to check whether genes give the same signal
        get_factor_matrix(d, colnames, folder, samples, verbose, report)

    return get_factors(d), samples
Example #3
0
def norm_gene_level(bams, bed, name, verbose):
    """Normalize bam files on a gene level. Give out list of normalization factors.

    Builds a sample x housekeeping-gene signal matrix and derives one
    normalization factor per sample from it.

    Parameters
    ----------
    bams : list of BAM file paths
    bed : experimental-matrix description containing the housekeeping genes
    name : experiment name (kept for interface compatibility; unused here)
    verbose : if True, print the signal matrix and R check code

    Returns
    -------
    (factors, samples) : normalization factors and BAM base names.
    """
    m = get_experimental_matrix(bams, bed)

    # names of the matrix entries typed as 'reads' -- these are the samples.
    # Use a list comprehension: in Python 3, zip/filter/map are one-shot
    # iterators and would be exhausted after a single pass.
    read_names = [n for t, n in zip(m.types, m.names) if t == 'reads']

    # GenomicRegionSet containing housekeeping genes
    regions = m.objectsDict['housekeep']

    covs = []
    for cond in read_names:
        bam_path = m.files[cond]
        c = CoverageSet(cond, regions)
        c.coverage_from_bam(bam_file=bam_path)
        c.genomicRegions.sort()
        covs.append(c)

    assert len(covs) > 0
    # matrix sample x gene of (signal + 1) pseudocounts
    signals = [[sum(cov.coverage[i]) + 1
                for i in range(len(cov.genomicRegions))] for cov in covs]
    gene_names = [covs[0].genomicRegions[i].name
                  for i in range(len(covs[0].genomicRegions))]

    colnames = gene_names
    d = np.matrix(signals, dtype=float)
    # materialize as a list: a Python-3 map() iterator returned to the
    # caller could only be consumed once
    samples = [os.path.splitext(os.path.basename(b))[0] for b in bams]
    if verbose:
        print("Housekeeping gene matrix (columns-genes, rows-samples)")
        print(d)
        print("")

    if verbose:
        # output R code to check whether genes give the same signal
        get_factor_matrix(d, colnames)

    return get_factors(d), samples
Example #4
0
 def bedCoverage(self):
     """ Return coverage matrix of multiple reads on one bed.
     bed --> GenomicRegionSet
     """
     # one entry (coverage column) per read file
     c = []
     for rp in self.reads:
         # show only the tail of the path so long paths stay readable
         print("    processing: ..." + rp[-45:])
         r = os.path.abspath(
             rp)  # Here change the relative path into absolute path
         cov = CoverageSet(r, self.all_bed)
         cov.coverage_from_genomicset(r)
         # normRPM: presumably reads-per-million scaling -- confirm in CoverageSet
         cov.normRPM()
         c.append(cov.coverage)
     # transpose so samples become columns -- TODO confirm orientation
     self.all_table = numpy.transpose(c)
Example #5
0
    def _help_init(self, path_bamfiles, exts, rmdup, binsize, stepsize,
                   path_inputs, exts_inputs, dim, regions, norm_regionset,
                   strand_cov):
        """Return self.covs and self.inputs as CoverageSet"""
        # extension sizes, one per signal BAM file
        self.exts = exts
        # one CoverageSet per signal BAM, loaded over <regions>
        self.covs = [CoverageSet('file' + str(i), regions) for i in range(dim)]
        for i, c in enumerate(self.covs):
            c.coverage_from_bam(bam_file=path_bamfiles[i], extension_size=exts[i], rmdup=rmdup, binsize=binsize,\
                                stepsize=stepsize, get_strand_info = strand_cov)
        # empty placeholders for two averaged signal tracks (filled elsewhere)
        self.covs_avg = [
            CoverageSet('cov_avg' + str(i), regions) for i in range(2)
        ]
        if path_inputs:
            # one CoverageSet per input-DNA BAM
            self.inputs = [
                CoverageSet('input' + str(i), regions)
                for i in range(len(path_inputs))
            ]
            for i, c in enumerate(self.inputs):
                c.coverage_from_bam(bam_file=path_inputs[i], extension_size=exts_inputs[i], rmdup=rmdup, binsize=binsize,\
                                stepsize=stepsize, get_strand_info = strand_cov)
            # placeholders for two averaged input tracks
            self.input_avg = [
                CoverageSet('input_avg' + str(i), regions) for i in range(2)
            ]
        else:
            self.inputs = []

        if norm_regionset:
            # signal coverage restricted to the normalization region set
            self.norm_regions = [
                CoverageSet('norm_region' + str(i), norm_regionset)
                for i in range(dim)
            ]
            for i, c in enumerate(self.norm_regions):
                c.coverage_from_bam(bam_file=path_bamfiles[i], extension_size=exts[i], rmdup=rmdup, binsize=binsize,\
                                    stepsize=stepsize, get_strand_info = strand_cov)
            # NOTE(review): this re-assigns self.input_avg (already set above
            # when path_inputs is given) and uses the 'input_avg' name/regions
            # again -- looks like a copy-paste of the input-average block;
            # possibly norm-region averages were intended. Confirm upstream.
            self.input_avg = [
                CoverageSet('input_avg' + str(i), regions) for i in range(2)
            ]
        else:
            self.norm_regions = None
Example #6
0
    def __init__(self, name, region, genome_path, binsize, stepsize, rmdup, file_1, ext_1, file_2, ext_2,\
                 input_1, ext_input_1, input_factor_1, input_2, ext_input_2, input_factor_2, chrom_sizes, verbose, norm_strategy, no_gc_content, deadzones,\
                 factor_input_1, factor_input_2, chrom_sizes_dict, debug, tracker):
        """Load coverage for two signal BAMs, optionally GC-correct with
        input DNA, normalize both signals, and flatten the per-region
        coverage into two overall arrays used downstream.
        """
        self.genomicRegions = region
        self.binsize = binsize
        self.stepsize = stepsize
        self.name = name
        self.cov1 = CoverageSet('first file', region)
        self.cov2 = CoverageSet('second file', region)

        print("Loading reads...", file=sys.stderr)
        # deadzones are passed as a mask so masked regions are excluded
        self.cov1.coverage_from_bam(bam_file=file_1,
                                    read_size=ext_1,
                                    rmdup=rmdup,
                                    binsize=binsize,
                                    stepsize=stepsize,
                                    mask_file=deadzones)
        self.cov2.coverage_from_bam(bam_file=file_2,
                                    read_size=ext_2,
                                    rmdup=rmdup,
                                    binsize=binsize,
                                    stepsize=stepsize,
                                    mask_file=deadzones)

        # per-signal bookkeeping: input BAM, its scale factor, read extension,
        # the signal CoverageSet and the IP BAM path
        map_input = {
            1: {
                'input': input_1,
                'input_factor': input_factor_1,
                'ext': ext_input_1,
                'cov-ip': self.cov1,
                'ip': file_1
            },
            2: {
                'input': input_2,
                'input_factor': input_factor_2,
                'ext': ext_input_2,
                'cov-ip': self.cov2,
                'ip': file_2
            }
        }

        # GC correction requires both input tracks to be present
        if not no_gc_content and input_1 is not None and input_2 is not None:
            print("Computing GC content", file=sys.stderr)
        else:
            print("Do not compute GC content", file=sys.stderr)

        for i in [1, 2]:
            input = map_input[i]
            name_bam, name_input = self._get_BAM_names(input['input'],
                                                       input['ip'])

            if debug:  #0: output raw IP
                input['cov-ip'].write_bigwig(
                    name + '-debug-0-' + name_bam + '.bw', chrom_sizes)

            if input['input'] is not None:
                # load the input-DNA coverage for this signal
                input['cov-input'] = CoverageSet('%s file' % input['input'],
                                                 region)
                input['cov-input'].coverage_from_bam(bam_file=input['input'],
                                                     read_size=input['ext'],
                                                     rmdup=rmdup,
                                                     binsize=binsize,
                                                     stepsize=stepsize)
                map_input[i]['cov-input'] = input['cov-input']

            if not no_gc_content and input['input'] is not None:
                # GC-correct both IP and input track, in place
                gc_content_cov, avg_gc_content, gc_hist = get_gc_context(
                    stepsize, binsize, genome_path,
                    input['cov-input'].coverage, chrom_sizes_dict)

                self._norm_gc_content(input['cov-ip'].coverage, gc_content_cov,
                                      avg_gc_content)
                self._norm_gc_content(input['cov-input'].coverage,
                                      gc_content_cov, avg_gc_content)

                if debug:  #1: output after GC
                    self.print_gc_hist(name + '-' + name_input,
                                       gc_hist)  #print hist data
                    input['cov-input'].write_bigwig(
                        name + '-debug-1-' + name_input + '.bw', chrom_sizes)
                    input['cov-ip'].write_bigwig(
                        name + '-debug-1-' + name_bam + '.bw', chrom_sizes)

        norm_done = False
        print("Normalizing signals", file=sys.stderr)
        for i in [1, 2]:
            input = map_input[i]
            name_bam, name_input = self._get_BAM_names(input['input'],
                                                       input['ip'])

            #TODO: uncomment here!
            norm_done = self.normalization(map_input, i, norm_strategy,
                                           norm_done, name, debug,
                                           factor_input_1, factor_input_2,
                                           chrom_sizes_dict, tracker)

            if input['input'] is not None:
                input['cov-input'].write_bigwig(
                    name + '-' + name_input + '-normalized.bw', chrom_sizes)
            input['cov-ip'].write_bigwig(
                name + '-' + name_bam + '-normalized.bw', chrom_sizes)

        #make one array for the coverage
        self.first_overall_coverage = reduce(
            lambda x, y: np.concatenate((x, y)), [
                self.cov1.coverage[i]
                for i in range(len(self.cov1.genomicRegions))
            ])
        self.second_overall_coverage = reduce(
            lambda x, y: np.concatenate((x, y)), [
                self.cov2.coverage[i]
                for i in range(len(self.cov2.genomicRegions))
            ])
        assert (len(self.first_overall_coverage) == len(
            self.second_overall_coverage))

        self.scores = np.zeros(len(self.first_overall_coverage))
        self.indices_of_interest = []
Example #7
0
class DualCoverageSet():
    """Hold and preprocess the coverage of two BAM signals for
    differential peak calling: loading, optional input-DNA and GC-content
    correction, normalization, scoring and HMM training-set selection.
    """

    def _get_BAM_names(self, input, ip):
        """Return the base names (directory and extension stripped) of the
        IP BAM and input BAM; the input name is None if no input is given.
        """
        #get names of BAM files for bw files
        name_bam = path.splitext(path.basename(ip))[0]
        if input is not None:
            name_input = path.splitext(path.basename(input))[0]
        else:
            name_input = None

        return name_bam, name_input

    def __init__(self, name, region, genome_path, binsize, stepsize, rmdup, file_1, ext_1, file_2, ext_2,\
                 input_1, ext_input_1, input_factor_1, input_2, ext_input_2, input_factor_2, chrom_sizes, verbose, norm_strategy, no_gc_content, deadzones,\
                 factor_input_1, factor_input_2, chrom_sizes_dict, debug, tracker):
        """Load coverage for both signal BAMs, optionally GC-correct with
        input DNA, normalize the signals and flatten the per-region
        coverage into first_overall_coverage / second_overall_coverage.
        """
        self.genomicRegions = region
        self.binsize = binsize
        self.stepsize = stepsize
        self.name = name
        self.cov1 = CoverageSet('first file', region)
        self.cov2 = CoverageSet('second file', region)

        print("Loading reads...", file=sys.stderr)
        # deadzones are applied as a mask during coverage computation
        self.cov1.coverage_from_bam(bam_file=file_1,
                                    read_size=ext_1,
                                    rmdup=rmdup,
                                    binsize=binsize,
                                    stepsize=stepsize,
                                    mask_file=deadzones)
        self.cov2.coverage_from_bam(bam_file=file_2,
                                    read_size=ext_2,
                                    rmdup=rmdup,
                                    binsize=binsize,
                                    stepsize=stepsize,
                                    mask_file=deadzones)

        # per-signal bookkeeping: input BAM, its scale factor, read
        # extension, the signal CoverageSet and the IP BAM path
        map_input = {
            1: {
                'input': input_1,
                'input_factor': input_factor_1,
                'ext': ext_input_1,
                'cov-ip': self.cov1,
                'ip': file_1
            },
            2: {
                'input': input_2,
                'input_factor': input_factor_2,
                'ext': ext_input_2,
                'cov-ip': self.cov2,
                'ip': file_2
            }
        }

        # GC correction requires both input-DNA tracks
        if not no_gc_content and input_1 is not None and input_2 is not None:
            print("Computing GC content", file=sys.stderr)
        else:
            print("Do not compute GC content", file=sys.stderr)

        for i in [1, 2]:
            input = map_input[i]
            name_bam, name_input = self._get_BAM_names(input['input'],
                                                       input['ip'])

            if debug:  #0: output raw IP
                input['cov-ip'].write_bigwig(
                    name + '-debug-0-' + name_bam + '.bw', chrom_sizes)

            if input['input'] is not None:
                # load the input-DNA coverage for this signal
                input['cov-input'] = CoverageSet('%s file' % input['input'],
                                                 region)
                input['cov-input'].coverage_from_bam(bam_file=input['input'],
                                                     read_size=input['ext'],
                                                     rmdup=rmdup,
                                                     binsize=binsize,
                                                     stepsize=stepsize)
                map_input[i]['cov-input'] = input['cov-input']

            if not no_gc_content and input['input'] is not None:
                gc_content_cov, avg_gc_content, gc_hist = get_gc_context(
                    stepsize, binsize, genome_path,
                    input['cov-input'].coverage, chrom_sizes_dict)

                # GC-correct both the IP and its input track, in place
                self._norm_gc_content(input['cov-ip'].coverage, gc_content_cov,
                                      avg_gc_content)
                self._norm_gc_content(input['cov-input'].coverage,
                                      gc_content_cov, avg_gc_content)

                if debug:  #1: output after GC
                    self.print_gc_hist(name + '-' + name_input,
                                       gc_hist)  #print hist data
                    input['cov-input'].write_bigwig(
                        name + '-debug-1-' + name_input + '.bw', chrom_sizes)
                    input['cov-ip'].write_bigwig(
                        name + '-debug-1-' + name_bam + '.bw', chrom_sizes)

        norm_done = False
        print("Normalizing signals", file=sys.stderr)
        for i in [1, 2]:
            input = map_input[i]
            name_bam, name_input = self._get_BAM_names(input['input'],
                                                       input['ip'])

            norm_done = self.normalization(map_input, i, norm_strategy,
                                           norm_done, name, debug,
                                           factor_input_1, factor_input_2,
                                           chrom_sizes_dict, tracker)

            if input['input'] is not None:
                input['cov-input'].write_bigwig(
                    name + '-' + name_input + '-normalized.bw', chrom_sizes)
            input['cov-ip'].write_bigwig(
                name + '-' + name_bam + '-normalized.bw', chrom_sizes)

        # flatten the per-region coverage into one array per signal
        self.first_overall_coverage = reduce(
            lambda x, y: np.concatenate((x, y)), [
                self.cov1.coverage[i]
                for i in range(len(self.cov1.genomicRegions))
            ])
        self.second_overall_coverage = reduce(
            lambda x, y: np.concatenate((x, y)), [
                self.cov2.coverage[i]
                for i in range(len(self.cov2.genomicRegions))
            ])
        assert (len(self.first_overall_coverage) == len(
            self.second_overall_coverage))

        self.scores = np.zeros(len(self.first_overall_coverage))
        self.indices_of_interest = []

    def normalization(self, map_input, i, norm_strategy, norm_done, name,
                      debug, factor_input_1, factor_input_2, chrom_sizes_dict,
                      tracker):
        """Apply the selected normalization strategy to both signals.

        Handles pre-defined input factors, naive signal-sum scaling
        (norm_strategy == 1) and Diaz input correction plus naive scaling
        (norm_strategy == 5).  Returns the updated norm_done flag.
        """
        input = map_input[i]

        #compute normalization factor
        #pre-defined values (applied once, on the second pass)
        if input['input_factor'] is not None and i != 1:
            print("Normalize by Diaz and pre-defined values...",
                  input['input_factor'],
                  file=sys.stderr)
            print("Normalize file 1 with input normalization factor %s" %
                  (map_input[1]['input_factor']),
                  file=sys.stderr)
            print("Normalize file 2 with input normalization factor %s" %
                  (map_input[2]['input_factor']),
                  file=sys.stderr)
            tracker.write(text=str(map_input[1]['input_factor']) + ',' +
                          str(map_input[2]['input_factor']),
                          header="Predefined Normalization factor of Input")

            map_input[1]['cov-input'].scale(map_input[1]['input_factor'])
            map_input[2]['cov-input'].scale(map_input[2]['input_factor'])
            map_input[1]['cov-ip'].subtract(map_input[1]['cov-input'])
            map_input[2]['cov-ip'].subtract(map_input[2]['cov-input'])

        #naive norm.: scale the weaker signal up to the stronger one
        if not norm_done and norm_strategy == 1:
            if factor_input_1 is None or factor_input_2 is None:
                s1 = sum([
                    sum(map_input[1]['cov-ip'].coverage[i])
                    for i in range(len(map_input[1]['cov-ip'].genomicRegions))
                ])
                s2 = sum([
                    sum(map_input[2]['cov-ip'].coverage[i])
                    for i in range(len(map_input[2]['cov-ip'].genomicRegions))
                ])
                if s1 > s2:
                    map_input[2]['cov-ip'].scale(s1 / float(s2))
                    print(
                        "Normalize file 2 by signal with estimated factor %s "
                        % (round(s1 / float(s2), 3)),
                        file=sys.stderr)
                    tracker.write(text=str(round(s1 / float(s2), 3)),
                                  header="Normalization factor of signal 2")
                elif s2 >= s1:
                    print(
                        "Normalize file 1 by signal with estimated factor %s "
                        % (round(s2 / float(s1), 3)),
                        file=sys.stderr)
                    tracker.write(text=str(round(s2 / float(s1), 3)),
                                  header="Normalization factor of signal 1")
                    map_input[1]['cov-ip'].scale(s2 / float(s1))

                norm_done = True
            else:
                # both factors given: apply them directly
                map_input[1]['cov-ip'].scale(factor_input_1)
                print("Normalize file 1 by signal with given factor %s " %
                      round(factor_input_1, 3),
                      file=sys.stderr)
                tracker.write(
                    text=str(round(factor_input_1, 3)),
                    header="Predefined Normalization factor of signal 1")

                map_input[2]['cov-ip'].scale(factor_input_2)
                print("Normalize file 2 by signal with given factor %s " %
                      round(factor_input_2, 3),
                      file=sys.stderr)
                tracker.write(
                    text=str(round(factor_input_2, 3)),
                    header="Predefined Normalization factor of signal 2")
                norm_done = True

        #diaz and naive
        if i != 1 and norm_strategy == 5:
            #apply diaz: estimate input factors from IP vs. input
            _, map_input[1]['input_factor'] = get_normalization_factor(map_input[1]['ip'], map_input[1]['input'], step_width=1000, zero_counts=0, \
                                                              filename=name + '-norm' + str(i), debug=debug, chrom_sizes_dict=chrom_sizes_dict, two_sample=False)
            _, map_input[2]['input_factor'] = get_normalization_factor(map_input[2]['ip'], map_input[2]['input'], step_width=1000, zero_counts=0, \
                                                              filename=name + '-norm' + str(i), debug=debug, chrom_sizes_dict=chrom_sizes_dict, two_sample=False)

            print("Normalize input with factor %s and %s" %
                  (round(map_input[1]['input_factor'],
                         3), round(map_input[2]['input_factor'], 3)),
                  file=sys.stderr)
            tracker.write(text=str(round(map_input[1]['input_factor'], 3)) +
                          ',' + str(round(map_input[2]['input_factor'], 3)),
                          header="Input Normalization factors")

            map_input[1]['cov-input'].scale(map_input[1]['input_factor'])
            map_input[2]['cov-input'].scale(map_input[2]['input_factor'])

            map_input[1]['cov-ip'].subtract(map_input[1]['cov-input'])
            map_input[2]['cov-ip'].subtract(map_input[2]['cov-input'])

            if factor_input_1 is None or factor_input_2 is None:
                #apply naive method on the input-corrected signals
                s1 = sum([
                    sum(map_input[1]['cov-ip'].coverage[i])
                    for i in range(len(map_input[1]['cov-ip'].genomicRegions))
                ])
                s2 = sum([
                    sum(map_input[2]['cov-ip'].coverage[i])
                    for i in range(len(map_input[2]['cov-ip'].genomicRegions))
                ])

                if s1 > s2:
                    map_input[2]['cov-ip'].scale(s1 / float(s2))
                    print(
                        "Normalize file 2 by signal with estimated factor %s "
                        % (round(s1 / float(s2), 3)),
                        file=sys.stderr)
                    tracker.write(text=str(round(s1 / float(s2), 3)),
                                  header="Normalization factor of signal 2")
                elif s2 >= s1:
                    print(
                        "Normalize file 1 by signal with estimated factor %s "
                        % (round(s2 / float(s1), 3)),
                        file=sys.stderr)
                    map_input[1]['cov-ip'].scale(s2 / float(s1))
                    tracker.write(text=str(round(s2 / float(s1), 3)),
                                  header="Normalization factor of signal 1")
            else:
                map_input[1]['cov-ip'].scale(factor_input_1)
                print("Normalize file 1 by signal with given factor %s " %
                      round(factor_input_1, 3),
                      file=sys.stderr)
                tracker.write(text=str(round(factor_input_1, 3)),
                              header="Normalization factor of signal 1")
                map_input[2]['cov-ip'].scale(factor_input_2)
                print("Normalize file 2 by signal with given factor %s " %
                      round(factor_input_2, 3),
                      file=sys.stderr)
                tracker.write(text=str(round(factor_input_2, 3)),
                              header="Normalization factor of signal 2")
        return norm_done

    def print_gc_hist(self, name, gc_hist):
        """Write the GC-content histogram to <name>-gc-content.data."""
        # with-statement guarantees the file is closed even on error
        with open(name + '-gc-content.data', 'w') as f:
            for i in range(len(gc_hist)):
                print(i, gc_hist[i], file=f)

    def _norm_gc_content(self, cov, gc_cov, gc_avg):
        """Rescale each coverage track by average/observed GC content, in place."""
        for i in range(len(cov)):
            assert len(cov[i]) == len(gc_cov[i])
            cov[i] = np.array(cov[i])
            gc_cov[i] = np.array(gc_cov[i])
            # zeros occur occasionally; replace them by the average so the
            # division below stays finite
            gc_cov[i][gc_cov[i] < EPSILON] = gc_avg
            cov[i] = cov[i] * gc_avg / gc_cov[i]
            cov[i] = cov[i].clip(0, max(max(cov[i]), 0))  #neg. values to 0
            cov[i] = cov[i].astype(int)

    def _index2coordinates(self, index):
        """Translate index within coverage array to genomic coordinates."""
        region_iter = iter(self.genomicRegions)
        # Python 3 fix: the iterator protocol uses next(it), the old
        # it.next() method no longer exists
        r = next(region_iter)
        covered = r.final
        last = 0
        i = 0
        while covered <= index * self.stepsize:
            last += len(self.cov1.coverage[i])
            try:
                r = next(region_iter)
            except StopIteration:
                covered += r.final
                i += 1
                break
            covered += r.final
            i += 1

        return r.chrom, (index-last) * self.stepsize, \
            min((index-last) * self.stepsize + self.stepsize, r.final)

    def __len__(self):
        """Return number of observations."""
        return len(self.indices_of_interest)

    def get_observation(self, mask=np.array([])):
        """Return indices of observations. Do not consider indices contained in <mask> array"""
        if not mask.size:
            mask = np.array([True] * len(self.first_overall_coverage))
        return np.array([
            self.first_overall_coverage[mask],
            self.second_overall_coverage[mask]
        ]).T

    def _compute_score(self):
        """Compute score for each observation (based on Xu et al.)"""
        self.scores = self.first_overall_coverage / float(sum(self.first_overall_coverage)) + \
                        self.second_overall_coverage / float(sum(self.second_overall_coverage))

    def compute_putative_region_index(self, l=5):
        """Compute putative differential peak regions as follows: 
        - score must be > 2/(m*n) (m=#obs, n=0.9 (default) )
        - overall coverage in library 1 and 2 must be > 3
        - extend resulting sites by l steps in both directions. """
        m = len(self.first_overall_coverage)
        n = 0.9
        self._compute_score()
        # sites whose score clears the threshold ...
        self.indices_of_interest = np.where(self.scores > 2 / (m * n))[0]
        # ... and whose combined coverage exceeds 3
        tmp = np.where(
            self.first_overall_coverage + self.second_overall_coverage > 3)[0]
        self.indices_of_interest = np.intersect1d(self.indices_of_interest,
                                                  tmp)
        #extend regions by l steps
        extended = set()
        for i in self.indices_of_interest:
            for j in range(max(0, i - l), i + l + 1):
                extended.add(j)
        self.indices_of_interest = np.array(sorted(extended))

    def get_initial_dist(self, filename):
        """Write BED file with initial state distribution"""
        states = []
        threshold = 2.0
        for i in self.indices_of_interest:
            c1 = self.first_overall_coverage[i]
            c2 = self.second_overall_coverage[i]

            # state 1: signal 1 dominates, state 2: signal 2 dominates,
            # state 0: background / no clear difference
            if c1 + c2 <= 3:
                state = 0
            elif c1 / max(float(c2), 1) > threshold or c1 - c2 > 10:
                state = 1
            elif c1 / max(float(c2), 1) < 1 / threshold or c2 - c1 > 10:
                state = 2
            else:
                state = 0

            states.append(state)

        with open(filename, 'w') as f:
            for j in range(len(states)):
                i = self.indices_of_interest[j]
                chrom, start, end = self._index2coordinates(i)
                s = states[j]
                print(chrom,
                      start,
                      end,
                      s,
                      self.first_overall_coverage[i],
                      self.second_overall_coverage[i],
                      sep='\t',
                      file=f)

    def write_putative_regions(self, path):
        """Write putative regions (defined by criteria mentioned in method) as BED file."""
        with open(path, 'w') as f:
            for i in self.indices_of_interest:
                chrom, start, end = self._index2coordinates(i)
                print(chrom, start, end, file=f)

    def get_training_set(self, exp_data, x, verbose, name, debug,
                         constraint_chrom):
        """Return linked genomic positions (at least <x> positions) to train HMM.
        Grep randomly a position within a putative region, and take then the entire region."""
        training_set = set()
        ts1 = set()
        ts2 = set()
        threshold = 2.0
        diff_cov = 10

        if constraint_chrom is not None:
            print("HMM training set based on %s" % constraint_chrom,
                  file=sys.stderr)

        for i in range(len(self.indices_of_interest)):
            # NOTE(review): the loop index i (not self.indices_of_interest[i])
            # is translated to coordinates here, while the coverage lookups
            # below use self.indices_of_interest[i] -- confirm this asymmetry
            # is intentional.
            chrom, start, end = self._index2coordinates(i)
            if constraint_chrom is not None and chrom != constraint_chrom:
                continue
            cov1 = exp_data.first_overall_coverage[self.indices_of_interest[i]]
            cov2 = exp_data.second_overall_coverage[
                self.indices_of_interest[i]]

            if cov1 / max(float(cov2),
                          1) > threshold or cov1 - cov2 > diff_cov:
                ts1.add(i)
            if cov1 / max(float(cov2),
                          1) < 1 / threshold or cov2 - cov1 > diff_cov:
                ts2.add(i)

        l = min(min(len(ts1), len(ts2)), x)
        # random.sample() no longer accepts sets (removed in Python 3.11),
        # so materialize the sets first
        tmp = set(sample(list(ts1), l)) | set(sample(list(ts2), l))

        for i in tmp:
            training_set.add(self.indices_of_interest[i])
            # search up: pull in the whole run of consecutive indices
            while i + 1 < len(
                    self.indices_of_interest) and self.indices_of_interest[
                        i + 1] == self.indices_of_interest[i] + 1:
                training_set.add(self.indices_of_interest[i + 1])
                i += 1
            # search down
            while i - 1 > 0 and self.indices_of_interest[
                    i - 1] == self.indices_of_interest[i] - 1:
                training_set.add(self.indices_of_interest[i - 1])
                i -= 1

        training_set = sorted(training_set)
        if debug:
            with open(name + '-trainingset.bed', 'w') as f:
                for idx in training_set:
                    chrom, s, e = self._index2coordinates(idx)
                    print(chrom, s, e, sep='\t', file=f)

        return np.array(training_set)
예제 #8
0
 def __init__(self, name, region, genome_path, binsize, stepsize, rmdup, file_1, ext_1, file_2, ext_2,\
              input_1, ext_input_1, input_factor_1, input_2, ext_input_2, input_factor_2, chrom_sizes, verbose, norm_strategy, no_gc_content, deadzones,\
              input_1, ext_input_1, input_factor_1, input_2, ext_input_2, input_factor_2, chrom_sizes, verbose, norm_strategy, no_gc_content, deadzones,\
              factor_input_1, factor_input_2, chrom_sizes_dict, debug, tracker):
     """Load, optionally GC-correct, and normalize the coverage of two BAM signals.

     name -- prefix used for all files written (bigwig, debug output)
     region -- GenomicRegionSet the coverage is computed over
     file_1/file_2 -- BAM files of the two signals; ext_1/ext_2 their read extension sizes
     input_1/input_2 -- optional control (input) BAM files with their
         extension sizes and optional pre-defined normalization factors
     deadzones -- mask file of regions to ignore while counting signal reads
     tracker -- bookkeeping object that records the normalization factors used
     """
     self.genomicRegions = region
     self.binsize = binsize
     self.stepsize = stepsize
     self.name = name
     self.cov1 = CoverageSet('first file', region)
     self.cov2 = CoverageSet('second file', region)
     
     print("Loading reads...", file=sys.stderr)
     self.cov1.coverage_from_bam(bam_file=file_1, read_size=ext_1, rmdup=rmdup, binsize=binsize, stepsize=stepsize, mask_file=deadzones)
     self.cov2.coverage_from_bam(bam_file=file_2, read_size=ext_2, rmdup=rmdup, binsize=binsize, stepsize=stepsize, mask_file=deadzones)
     
     # Per-sample configuration; entries gain a 'cov-input' key below when a
     # control file is present.
     map_input = {1: {'input': input_1, 'input_factor': input_factor_1, 'ext': ext_input_1, 'cov-ip': self.cov1, 'ip': file_1}, 
                  2: {'input': input_2, 'input_factor': input_factor_2, 'ext': ext_input_2, 'cov-ip': self.cov2, 'ip': file_2}}
     
     # GC correction requires a control file for *both* samples.
     if not no_gc_content and input_1 is not None and input_2 is not None:
         print("Computing GC content", file=sys.stderr)
     else:
         print("Do not compute GC content", file=sys.stderr)
     
     for i in [1, 2]:
         input = map_input[i]  # NOTE(review): shadows the builtin 'input'
         name_bam, name_input = self._get_BAM_names(input['input'], input['ip'])
         
         if debug: #0: output raw IP
             input['cov-ip'].write_bigwig(name + '-debug-0-' + name_bam + '.bw', chrom_sizes)
         
         if input['input'] is not None:
             # Load the control coverage (no deadzone mask, unlike the IP).
             input['cov-input'] = CoverageSet('%s file' %input['input'], region)
             input['cov-input'].coverage_from_bam(bam_file=input['input'], read_size=input['ext'], rmdup=rmdup, binsize=binsize, stepsize=stepsize)
             map_input[i]['cov-input'] = input['cov-input']
             
         
         if not no_gc_content and input['input'] is not None:
             # Estimate per-bin GC content from the control, then rescale
             # both IP and control towards the genome-average GC content.
             gc_content_cov, avg_gc_content, gc_hist = get_gc_context(stepsize, binsize, genome_path, input['cov-input'].coverage, chrom_sizes_dict)
             
             self._norm_gc_content(input['cov-ip'].coverage, gc_content_cov, avg_gc_content)
             self._norm_gc_content(input['cov-input'].coverage, gc_content_cov, avg_gc_content)
             
             if debug: #1: output after GC
                 self.print_gc_hist(name + '-' + name_input, gc_hist) #print hist data
                 input['cov-input'].write_bigwig(name + '-debug-1-' + name_input + '.bw', chrom_sizes)
                 input['cov-ip'].write_bigwig(name + '-debug-1-' + name_bam + '.bw', chrom_sizes)
     
     
     norm_done = False
     print("Normalizing signals", file=sys.stderr)
     for i in [1, 2]:
         input = map_input[i]
         name_bam, name_input = self._get_BAM_names(input['input'], input['ip'])
         
         # normalization() mutates the CoverageSets in place; norm_done
         # ensures one-shot strategies only run on a single pass.
         norm_done = self.normalization(map_input, i, norm_strategy, norm_done, name, debug, factor_input_1, factor_input_2, chrom_sizes_dict, tracker)
         
         if input['input'] is not None:
             input['cov-input'].write_bigwig(name + '-' + name_input + '-normalized.bw', chrom_sizes)
         input['cov-ip'].write_bigwig(name + '-' + name_bam + '-normalized.bw', chrom_sizes)
             
     #make one array for the coverage
     self.first_overall_coverage = reduce(lambda x,y: np.concatenate((x,y)), [self.cov1.coverage[i] for i in range(len(self.cov1.genomicRegions))])
     self.second_overall_coverage = reduce(lambda x,y: np.concatenate((x,y)), [self.cov2.coverage[i] for i in range(len(self.cov2.genomicRegions))])
     assert(len(self.first_overall_coverage) == len(self.second_overall_coverage))
     
     self.scores = np.zeros(len(self.first_overall_coverage))
     self.indices_of_interest = []
예제 #9
0
class DualCoverageSet():
    """Paired, normalized coverage of two ChIP-seq signal files.

    After construction, <first_overall_coverage> and <second_overall_coverage>
    hold the genome-wide, bin-wise signal of the two samples as flat numpy
    arrays of equal length.  The remaining methods derive putative
    differential regions and an HMM training set from these arrays.
    """

    def _get_BAM_names(self, input, ip):
        """Return (signal name, input name) basenames (without extension)
        for use in output bigwig file names; the input name is None when
        no control file was given."""
        name_bam = path.splitext(path.basename(ip))[0]
        if input is not None:
            name_input = path.splitext(path.basename(input))[0]
        else:
            name_input = None

        return name_bam, name_input

    def __init__(self, name, region, genome_path, binsize, stepsize, rmdup, file_1, ext_1, file_2, ext_2,\
                 input_1, ext_input_1, input_factor_1, input_2, ext_input_2, input_factor_2, chrom_sizes, verbose, norm_strategy, no_gc_content, deadzones,\
                 factor_input_1, factor_input_2, chrom_sizes_dict, debug, tracker):
        """Load, optionally GC-correct, and normalize the coverage of both samples.

        name -- prefix used for all files written (bigwig, debug output)
        region -- GenomicRegionSet the coverage is computed over
        file_1/file_2 -- BAM files of the two signals; ext_1/ext_2 their
            read extension sizes
        input_1/input_2 -- optional control (input) BAM files with their
            extension sizes and optional pre-defined normalization factors
        deadzones -- mask file of regions to ignore while counting signal reads
        tracker -- bookkeeping object that records the normalization factors
        """
        self.genomicRegions = region
        self.binsize = binsize
        self.stepsize = stepsize
        self.name = name
        self.cov1 = CoverageSet('first file', region)
        self.cov2 = CoverageSet('second file', region)

        print("Loading reads...", file=sys.stderr)
        self.cov1.coverage_from_bam(bam_file=file_1, read_size=ext_1, rmdup=rmdup, binsize=binsize, stepsize=stepsize, mask_file=deadzones)
        self.cov2.coverage_from_bam(bam_file=file_2, read_size=ext_2, rmdup=rmdup, binsize=binsize, stepsize=stepsize, mask_file=deadzones)

        # Per-sample configuration; entries gain a 'cov-input' key below when
        # a control file is present.
        map_input = {1: {'input': input_1, 'input_factor': input_factor_1, 'ext': ext_input_1, 'cov-ip': self.cov1, 'ip': file_1}, 
                     2: {'input': input_2, 'input_factor': input_factor_2, 'ext': ext_input_2, 'cov-ip': self.cov2, 'ip': file_2}}

        # GC correction requires a control file for *both* samples.
        if not no_gc_content and input_1 is not None and input_2 is not None:
            print("Computing GC content", file=sys.stderr)
        else:
            print("Do not compute GC content", file=sys.stderr)

        for i in [1, 2]:
            entry = map_input[i]  # renamed from 'input' to avoid shadowing the builtin
            name_bam, name_input = self._get_BAM_names(entry['input'], entry['ip'])

            if debug:  # 0: output raw IP
                entry['cov-ip'].write_bigwig(name + '-debug-0-' + name_bam + '.bw', chrom_sizes)

            if entry['input'] is not None:
                # Load the control coverage (no deadzone mask, unlike the IP).
                entry['cov-input'] = CoverageSet('%s file' %entry['input'], region)
                entry['cov-input'].coverage_from_bam(bam_file=entry['input'], read_size=entry['ext'], rmdup=rmdup, binsize=binsize, stepsize=stepsize)
                map_input[i]['cov-input'] = entry['cov-input']

            if not no_gc_content and entry['input'] is not None:
                # Estimate per-bin GC content from the control, then rescale
                # both IP and control towards the genome-average GC content.
                gc_content_cov, avg_gc_content, gc_hist = get_gc_context(stepsize, binsize, genome_path, entry['cov-input'].coverage, chrom_sizes_dict)

                self._norm_gc_content(entry['cov-ip'].coverage, gc_content_cov, avg_gc_content)
                self._norm_gc_content(entry['cov-input'].coverage, gc_content_cov, avg_gc_content)

                if debug:  # 1: output after GC
                    self.print_gc_hist(name + '-' + name_input, gc_hist)  # print hist data
                    entry['cov-input'].write_bigwig(name + '-debug-1-' + name_input + '.bw', chrom_sizes)
                    entry['cov-ip'].write_bigwig(name + '-debug-1-' + name_bam + '.bw', chrom_sizes)

        norm_done = False
        print("Normalizing signals", file=sys.stderr)
        for i in [1, 2]:
            entry = map_input[i]
            name_bam, name_input = self._get_BAM_names(entry['input'], entry['ip'])

            # normalization() mutates the CoverageSets in place; norm_done
            # ensures one-shot strategies only run on a single pass.
            norm_done = self.normalization(map_input, i, norm_strategy, norm_done, name, debug, factor_input_1, factor_input_2, chrom_sizes_dict, tracker)

            if entry['input'] is not None:
                entry['cov-input'].write_bigwig(name + '-' + name_input + '-normalized.bw', chrom_sizes)
            entry['cov-ip'].write_bigwig(name + '-' + name_bam + '-normalized.bw', chrom_sizes)

        # Concatenate the per-region coverage into one flat array per sample.
        self.first_overall_coverage = reduce(lambda x, y: np.concatenate((x, y)), [self.cov1.coverage[j] for j in range(len(self.cov1.genomicRegions))])
        self.second_overall_coverage = reduce(lambda x, y: np.concatenate((x, y)), [self.cov2.coverage[j] for j in range(len(self.cov2.genomicRegions))])
        assert(len(self.first_overall_coverage) == len(self.second_overall_coverage))

        self.scores = np.zeros(len(self.first_overall_coverage))
        self.indices_of_interest = []

    def normalization(self, map_input, i, norm_strategy, norm_done, name, debug, factor_input_1, factor_input_2, chrom_sizes_dict, tracker):
        """Apply the requested normalization strategy, in place.

        map_input -- per-sample configuration dict built in __init__
        i -- sample number (1 or 2); strategies that act on both samples at
             once deliberately fire only on one of the two passes
        norm_done -- True if the naive signal scaling was already applied
        Returns the (possibly updated) norm_done flag.
        """
        entry = map_input[i]  # renamed from 'input' to avoid shadowing the builtin

        # Pre-defined input factors: on the second pass, scale both controls
        # and subtract them from their IP signals.
        if entry['input_factor'] is not None and i != 1:
            print("Normalize by Diaz and pre-defined values...", entry['input_factor'], file=sys.stderr)
            print("Normalize file 1 with input normalization factor %s" %(map_input[1]['input_factor']), file=sys.stderr)
            print("Normalize file 2 with input normalization factor %s" %(map_input[2]['input_factor']), file=sys.stderr)
            tracker.write(text=str(map_input[1]['input_factor']) + ',' + str(map_input[2]['input_factor']), header="Predefined Normalization factor of Input")

            map_input[1]['cov-input'].scale(map_input[1]['input_factor'])
            map_input[2]['cov-input'].scale(map_input[2]['input_factor'])
            map_input[1]['cov-ip'].subtract(map_input[1]['cov-input'])
            map_input[2]['cov-ip'].subtract(map_input[2]['cov-input'])

        # Naive normalization: scale the weaker signal up to the stronger one.
        if not norm_done and norm_strategy == 1:
            if factor_input_1 is None or factor_input_2 is None:
                # Comprehension indices renamed to 'j': under Python 2 a list
                # comprehension leaks its variable and would clobber the
                # method parameter 'i'.
                s1 = sum([sum(map_input[1]['cov-ip'].coverage[j]) for j in range(len(map_input[1]['cov-ip'].genomicRegions))])
                s2 = sum([sum(map_input[2]['cov-ip'].coverage[j]) for j in range(len(map_input[2]['cov-ip'].genomicRegions))])
                if s1 > s2:
                    map_input[2]['cov-ip'].scale(s1/float(s2))
                    print("Normalize file 2 by signal with estimated factor %s " %(round(s1/float(s2),3)), file=sys.stderr)
                    tracker.write(text=str(round(s1/float(s2), 3)), header="Normalization factor of signal 2")
                elif s2 >= s1:
                    print("Normalize file 1 by signal with estimated factor %s " %(round(s2/float(s1),3)), file=sys.stderr)
                    tracker.write(text=str(round(s2/float(s1), 3)), header="Normalization factor of signal 1")
                    map_input[1]['cov-ip'].scale(s2/float(s1))

                norm_done = True
            else:
                map_input[1]['cov-ip'].scale(factor_input_1)
                print("Normalize file 1 by signal with given factor %s " %round(factor_input_1, 3), file=sys.stderr)
                tracker.write(text=str(round(factor_input_1, 3)), header="Predefined Normalization factor of signal 1")

                map_input[2]['cov-ip'].scale(factor_input_2)
                print("Normalize file 2 by signal with given factor %s " %round(factor_input_2, 3), file=sys.stderr)
                tracker.write(text=str(round(factor_input_2, 3)), header="Predefined Normalization factor of signal 2")
                norm_done = True

        # Diaz + naive: estimate input factors (Diaz), subtract the scaled
        # controls, then apply the naive signal scaling.  Runs once, on the
        # second pass.
        if i != 1 and norm_strategy == 5:
            # apply diaz
            _, map_input[1]['input_factor'] = get_normalization_factor(map_input[1]['ip'], map_input[1]['input'], step_width=1000, zero_counts=0, \
                                                              filename=name + '-norm' + str(i), debug=debug, chrom_sizes_dict=chrom_sizes_dict, two_sample=False)
            _, map_input[2]['input_factor'] = get_normalization_factor(map_input[2]['ip'], map_input[2]['input'], step_width=1000, zero_counts=0, \
                                                              filename=name + '-norm' + str(i), debug=debug, chrom_sizes_dict=chrom_sizes_dict, two_sample=False)

            print("Normalize input with factor %s and %s" %(round(map_input[1]['input_factor'], 3), round(map_input[2]['input_factor'], 3)), file=sys.stderr)
            tracker.write(text=str(round(map_input[1]['input_factor'], 3)) + ',' + str(round(map_input[2]['input_factor'], 3)), header="Input Normalization factors")

            map_input[1]['cov-input'].scale(map_input[1]['input_factor'])
            map_input[2]['cov-input'].scale(map_input[2]['input_factor'])

            map_input[1]['cov-ip'].subtract(map_input[1]['cov-input'])
            map_input[2]['cov-ip'].subtract(map_input[2]['cov-input'])

            if factor_input_1 is None or factor_input_2 is None:
                # apply naive method
                s1 = sum([sum(map_input[1]['cov-ip'].coverage[j]) for j in range(len(map_input[1]['cov-ip'].genomicRegions))])
                s2 = sum([sum(map_input[2]['cov-ip'].coverage[j]) for j in range(len(map_input[2]['cov-ip'].genomicRegions))])

                if s1 > s2:
                    map_input[2]['cov-ip'].scale(s1/float(s2))
                    print("Normalize file 2 by signal with estimated factor %s " %(round(s1/float(s2), 3)), file=sys.stderr)
                    tracker.write(text=str(round(s1/float(s2), 3)), header="Normalization factor of signal 2")
                elif s2 >= s1:
                    print("Normalize file 1 by signal with estimated factor %s " %(round(s2/float(s1), 3)), file=sys.stderr)
                    map_input[1]['cov-ip'].scale(s2/float(s1))
                    tracker.write(text=str(round(s2/float(s1), 3)), header="Normalization factor of signal 1")
            else:
                map_input[1]['cov-ip'].scale(factor_input_1)
                print("Normalize file 1 by signal with given factor %s " %round(factor_input_1, 3), file=sys.stderr)
                tracker.write(text=str(round(factor_input_1, 3)), header="Normalization factor of signal 1")
                map_input[2]['cov-ip'].scale(factor_input_2)
                print("Normalize file 2 by signal with given factor %s " %round(factor_input_2, 3), file=sys.stderr)
                tracker.write(text=str(round(factor_input_2, 3)), header="Normalization factor of signal 2")
        return norm_done

    def print_gc_hist(self, name, gc_hist):
        """Write the GC-content histogram to <name>-gc-content.data, one
        '<bin> <count>' pair per line."""
        # 'with' guarantees the handle is closed even if a write fails.
        with open(name + '-gc-content.data', 'w') as f:
            for i in range(len(gc_hist)):
                print(i, gc_hist[i], file=f)

    def _norm_gc_content(self, cov, gc_cov, gc_avg):
        """Rescale per-bin coverage towards the average GC content, in place.

        cov -- list of per-region coverage arrays (modified in place)
        gc_cov -- matching list of per-bin GC-content values
        gc_avg -- average GC content used as the scaling target
        """
        for i in range(len(cov)):
            assert len(cov[i]) == len(gc_cov[i])
            cov[i] = np.array(cov[i])
            gc_cov[i] = np.array(gc_cov[i])
            # Near-zero GC values would blow up the ratio; treat them as average.
            gc_cov[i][gc_cov[i] < EPSILON] = gc_avg
            cov[i] = cov[i] * gc_avg / gc_cov[i]
            cov[i] = cov[i].clip(0, max(max(cov[i]), 0))  # neg. values to 0
            cov[i] = cov[i].astype(int)

    def _index2coordinates(self, index):
        """Translate an index within the flat coverage array to genomic
        coordinates (chrom, start, end)."""
        region_iter = self.genomicRegions.__iter__()
        # BUGFIX: use the next() builtin instead of the Python-2-only
        # iterator.next() method (works on Python 2.6+ and 3.x).
        r = next(region_iter)
        total = r.final  # cumulative genomic length seen so far
        last = 0         # number of bins preceding the current region
        i = 0
        while total <= index * self.stepsize:
            last += len(self.cov1.coverage[i])
            try:
                r = next(region_iter)
            except StopIteration:
                total += r.final
                i += 1
                break
            total += r.final
            i += 1

        return r.chrom, (index - last) * self.stepsize, \
            min((index - last) * self.stepsize + self.stepsize, r.final)

    def __len__(self):
        """Return number of observations."""
        return len(self.indices_of_interest)

    def get_observation(self, mask=np.array([])):
        """Return the (cov1, cov2) observation pairs as an (n, 2) array.
        Indices excluded by the boolean <mask> array are not considered."""
        if not mask.size:
            mask = np.array([True]*len(self.first_overall_coverage))
        return np.array([self.first_overall_coverage[mask], self.second_overall_coverage[mask]]).T

    def _compute_score(self):
        """Compute score for each observation (based on Xu et al.):
        each sample's coverage normalized by its library total, summed."""
        self.scores = self.first_overall_coverage / float(sum(self.first_overall_coverage)) + \
                        self.second_overall_coverage / float(sum(self.second_overall_coverage))

    def compute_putative_region_index(self, l=5):
        """Compute putative differential peak regions as follows:
        - score must be > 2/(m*n) (m=#obs, n=0.9 (default) )
        - summed coverage of library 1 and 2 must be > 3
        - extend resulting sites by l steps in both directions. """
        m = len(self.first_overall_coverage)
        n = 0.9
        self._compute_score()
        self.indices_of_interest = np.where(self.scores > 2 / (m * n))[0]
        # Require a minimal total coverage over both libraries.
        covered = np.where(self.first_overall_coverage + self.second_overall_coverage > 3)[0]
        self.indices_of_interest = np.intersect1d(self.indices_of_interest, covered)
        # Extend every surviving site by l steps in both directions.
        extended = set()
        for i in self.indices_of_interest:
            for j in range(max(0, i - l), i + l + 1):
                extended.add(j)
        self.indices_of_interest = np.array(sorted(extended))

    def get_initial_dist(self, filename):
        """Write a BED file with the initial state of each index of interest.

        State 1: sample 1 dominates, state 2: sample 2 dominates,
        state 0: background or too little coverage."""
        states = []
        threshold = 2.0
        for i in self.indices_of_interest:
            c1 = self.first_overall_coverage[i]
            c2 = self.second_overall_coverage[i]

            if c1 + c2 <= 3:
                state = 0  # too little signal overall
            elif c1 / max(float(c2), 1) > threshold or c1 - c2 > 10:
                state = 1
            elif c1 / max(float(c2), 1) < 1 / threshold or c2 - c1 > 10:
                state = 2
            else:
                state = 0

            states.append(state)

        with open(filename, 'w') as f:
            for j in range(len(states)):
                i = self.indices_of_interest[j]
                chrom, start, end = self._index2coordinates(i)
                print(chrom, start, end, states[j], self.first_overall_coverage[i], self.second_overall_coverage[i], sep='\t', file=f)

    def write_putative_regions(self, path):
        """Write putative regions (defined by criteria mentioned in method) as BED file."""
        with open(path, 'w') as f:
            for i in self.indices_of_interest:
                chrom, start, end = self._index2coordinates(i)
                print(chrom, start, end, file=f)

    def get_training_set(self, exp_data, x, verbose, name, debug, constraint_chrom):
        """Return linked genomic positions (at least <x> positions) to train HMM.
        Grep randomly a position within a putative region, and take then the
        entire run of consecutive indices around it."""
        training_set = set()
        ts1 = set()  # positions where sample 1 dominates
        ts2 = set()  # positions where sample 2 dominates
        threshold = 2.0
        diff_cov = 10

        if constraint_chrom is not None:
            print("HMM training set based on %s" %constraint_chrom, file=sys.stderr)

        for i in range(len(self.indices_of_interest)):
            # BUGFIX: translate the genomic bin index (not the position within
            # indices_of_interest) to coordinates; otherwise the
            # constraint_chrom filter tested the wrong chromosome.
            chrom, start, end = self._index2coordinates(self.indices_of_interest[i])
            if constraint_chrom is not None and chrom != constraint_chrom:
                continue
            cov1 = exp_data.first_overall_coverage[self.indices_of_interest[i]]
            cov2 = exp_data.second_overall_coverage[self.indices_of_interest[i]]

            if cov1 / max(float(cov2), 1) > threshold or cov1 - cov2 > diff_cov:
                ts1.add(i)
            if cov1 / max(float(cov2), 1) < 1 / threshold or cov2 - cov1 > diff_cov:
                ts2.add(i)

        n_picks = min(min(len(ts1), len(ts2)), x)
        # BUGFIX: random.sample() requires a sequence; sampling directly from
        # a set is an error on Python >= 3.11.
        picked = set(sample(list(ts1), n_picks)) | set(sample(list(ts2), n_picks))

        for i in picked:
            training_set.add(self.indices_of_interest[i])
            # search up
            while i + 1 < len(self.indices_of_interest) and self.indices_of_interest[i + 1] == self.indices_of_interest[i] + 1:
                training_set.add(self.indices_of_interest[i + 1])
                i += 1
            # search down (BUGFIX: >= 0 instead of > 0 so index 0 is reachable)
            while i - 1 >= 0 and self.indices_of_interest[i - 1] == self.indices_of_interest[i] - 1:
                training_set.add(self.indices_of_interest[i - 1])
                i -= 1

        training_set = sorted(training_set)
        if debug:
            with open(name + '-trainingset.bed', 'w') as f:
                for idx in training_set:
                    chrom, s, e = self._index2coordinates(idx)
                    print(chrom, s, e, sep='\t', file=f)

        return np.array(training_set)
예제 #10
0
import unittest
from rgt.GenomicRegionSet import *
from rgt.CoverageSet import CoverageSet

# Test fixtures: two 1 kb regions on chr1 and a CoverageSet built on them.
regions = GenomicRegionSet("test")
regions.add(GenomicRegion("chr1", 10000, 11000, "+"))
regions.add(GenomicRegion("chr1", 20000, 21000, "-"))

cov = CoverageSet("coverage", regions)

# NOTE(review): machine-specific hard-coded paths -- this test can only run
# where these files exist; bedfile is defined but unused in the visible test.
bamfile = "/projects/lncRNA/local/cardio/total_rna/bam/d4_1.bam"
bedfile = "~/rgtdata/hg38/genes_hg38.bed"

class CoverageSet_Test(unittest.TestCase):
    """Smoke test for CoverageSet.coverage_from_genomicset on a real BAM file."""

    def test_coverage_from_genomicset(self):
        # BUGFIX: renamed from 'coverage_from_genomicset' -- without the
        # 'test_' prefix, unittest discovery silently never ran this method.
        cov.coverage_from_genomicset(bamfile)
        print(cov.coverage)
        # NOTE(review): cov.coverage is set per region by
        # coverage_from_genomicset, so comparing the whole structure to the
        # scalar 4 looks wrong -- confirm the intended expected value.
        self.assertEqual(cov.coverage, 4)
예제 #11
0
def call_peaks(bam, csizes, pval, min_reads, binsize, cfile=None):
    '''
    Call peaks on bam file using pvalue and binomial model.
    Returns GenomeRegionSet with peaks, and CoverageSet with signal.

    bam -- path to the signal BAM file
    csizes -- chromosome-sizes file used to build the genome region set
    pval -- NOTE(review): currently unused in this function; confirm whether
            filter_bins was meant to receive it
    min_reads -- minimum read count for a bin to be considered
    binsize -- bin size for the smoothed output coverage (cov2)
    cfile -- optional control BAM used to normalize the signal
    '''
    # Make a region set covering the whole genome from the chromosome sizes.
    rs = get_chrom_sizes_as_genomicregionset(csizes)

    print("calculating extension sizes...")
    ext, _ = get_extension_size(bam, start=0, end=300, stepsize=5)

    print("calculating coverage...")
    # Signal coverage used for the actual peak calling.
    cov = CoverageSet('coverageset', rs)
    cov.coverage_from_bam(bam_file=bam, extension_size=ext, paired_reads=True)

    # Binned coverage used only for the output bigwig.
    cov2 = CoverageSet('coverageset2', rs)
    cov2.coverage_from_bam(bam_file=bam,
                           extension_size=ext,
                           paired_reads=True,
                           binsize=binsize,
                           stepsize=binsize // 2)

    if cfile is not None:
        print(f"Using control file: {cfile}")
        control = CoverageSet('control', rs)  # BUGFIX: label typo 'contorl'
        control.coverage_from_bam(bam_file=cfile, extension_size=ext)
        # Normalization divides by possibly-zero control bins; silence the
        # resulting numpy warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            norm_igg(cov, control)

        # Recalculate the flattened genome-wide coverage after normalization.
        cov.overall_cov = reduce(lambda x, y: np.concatenate(
            (x, y)), [cov.coverage[i] for i in range(len(cov.genomicRegions))])

    # Total read signal over all bins (renamed from 's', which was silently
    # clobbered by the coordinate unpacking in the loop below).
    total = np.sum(cov.overall_cov)
    # Probability of a read landing in a bin: (avg reads per nonzero bin)/libsize.
    p = np.mean(cov.overall_cov[cov.overall_cov > 0]) / total

    maxcov = np.max(cov.overall_cov)

    # Pre-compute the binomial p-value for every possible bin count.
    mc = np.arange(0, maxcov + 1, dtype="object")
    d = {count: binom_test((count, total - count), p=p) for count in mc}

    res = GenomicRegionSet('identified_peaks')

    print("calculating peaks...")
    # Iterate through bins in genome; keep bins passing the read filter.
    for i, c in enumerate(cov.overall_cov):
        if filter_bins(c, d, min_reads):
            chrom, start, end = cov.index2coordinates(i, rs)
            res.add(GenomicRegion(chrom, start, end + 1, data=d[c]))

    # Merge overlapping peaks.
    res.merge()

    # Merge peaks within ext distance.
    rc = res.cluster(ext)

    return rc, cov, cov2