Пример #1
0
    def __init__(self, chrom_sizes, regions=None):
        """Collect the genomic regions in which DPs are called.

        *Keyword arguments:*

            - chrom_sizes -- Path to a tab-separated ``chrom<TAB>size`` file;
              read only when ``regions`` is None, in which case each
              chromosome is added as a region from 0 to its size.
            - regions -- Path to a whitespace-separated ``chrom start end``
              file restricting the analysis (Default: None)

        Every region is added to ``self.regionset`` and its end coordinate is
        stored in ``self.chrom_sizes_dict`` (a later line for the same
        chromosome overwrites an earlier one). Blank lines are skipped.
        The process exits with status 2 when no region was collected.

        NOTE(review): ``self.regionset`` and ``self.chrom_sizes_dict`` are not
        created in this method; presumably the class provides them -- confirm.
        """
        self.counter = 0
        if regions is not None:
            print("Call DPs on specified regions.", file=sys.stderr)
            with open(regions) as f:
                for line in f:
                    # A raw line still carries its newline and is therefore
                    # always truthy; test the split columns instead so that
                    # blank lines are skipped rather than raising IndexError.
                    fields = line.split()
                    if not fields:
                        continue
                    c, s, e = fields[0], int(fields[1]), int(fields[2])
                    self.regionset.add(
                        GenomicRegion(chrom=c, initial=s, final=e))
                    self.chrom_sizes_dict[c] = e
        else:
            print("Call DPs on whole genome.", file=sys.stderr)
            with open(chrom_sizes) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # e.g. a trailing empty line at the end of the file
                        continue
                    fields = line.split('\t')
                    chrom, end = fields[0], int(fields[1])
                    self.regionset.add(
                        GenomicRegion(chrom=chrom, initial=0, final=end))
                    self.chrom_sizes_dict[chrom] = end

        if not self.regionset.sequences:
            print('something wrong here', file=sys.stderr)
            sys.exit(2)
Пример #2
0
 def __init__(self, chrom, initial, final, name=None, score=None, errors_bp=None, motif=None,
              strand=None, orientation=None, guanine_rate=None, seq=None):
     """Create a binding site on top of a GenomicRegion.

     *Keyword arguments:*

         - chrom -- Chromosome passed on to GenomicRegion
         - initial -- Binding start position
         - final -- Binding end position
         - name -- The name of this binding site (Default: None)
         - score -- Score of the binding pattern (Default: None)
         - errors_bp -- Error base pair in this binding (Default: None)
         - motif -- The motif for this binding (Default: None)
         - strand -- Accepted but not stored by this constructor (Default: None)
         - orientation -- Parallel or antiparallel (Default: None)
         - guanine_rate -- Value kept as ``self.guanine_rate`` when no ``seq``
           is given (Default: None)
         - seq -- Sequence object of this region; ``seq.seq`` must be the
           letter string and ``len(seq)`` its length (Default: None)
     """
     GenomicRegion.__init__(self, chrom=chrom, initial=initial, final=final)

     self.name = name                      # RNA name
     self.score = score                    # Score for pattern matching
     self.errors_bp = errors_bp
     self.motif = motif
     self.orientation = orientation
     self.seq = seq                        # An object (Sequence) not just a string
     # Always define the attribute: it used to exist only when ``seq`` was
     # given, and the ``guanine_rate`` argument was silently discarded.
     self.guanine_rate = guanine_rate
     if seq:
         # Fraction of "G" letters, kept as a string with two decimals.
         self.guanine_rate = "{0:.2f}".format(float(seq.seq.count("G")) / len(seq))
Пример #3
0
    def __init__(self,
                 chrom,
                 pos,
                 ref,
                 alt,
                 qual,
                 filter=None,
                 id=None,
                 info=None,
                 format=None,
                 genotype=None,
                 samples=None):
        """Create a variant record on top of a GenomicRegion.

        ``chrom`` is stored as ``str`` and ``pos`` as ``int``; the underlying
        region is initialised with ``pos`` and ``pos + 1`` as its bounds.
        The remaining arguments are stored unchanged under attributes of the
        same name, and their ``str()`` forms are joined with ``"_$_"`` into
        ``self.data`` in the order id, ref, alt, qual, filter, info, format,
        genotype, samples.
        """
        # Normalise first: ``pos + 1`` raises TypeError for a string position,
        # although the int() conversion below shows such input is expected.
        chrom = str(chrom)
        pos = int(pos)
        GenomicRegion.__init__(self, chrom, pos, pos + 1)

        self.chrom = chrom
        self.pos = pos
        self.id = id
        self.ref = ref
        self.alt = alt
        self.qual = qual
        self.filter = filter
        self.info = info
        self.format = format
        self.genotype = genotype
        self.samples = samples

        # NOTE(review): this stores the bound ``__str__`` method itself, not
        # the string it returns -- confirm that this is intended.
        self.name = self.__str__
        self.data = "_$_".join(
            map(str, [
                self.id, self.ref, self.alt, self.qual, self.filter, self.info,
                self.format, self.genotype, self.samples
            ]))
Пример #4
0
    def __init__(self, chrom, initial, final, name=None, score=None, errors_bp=None, motif=None,
                 strand=None, orientation=None, guanine_rate=None, seq=None):
        """Initialize a binding site on top of a GenomicRegion.

        chrom            Chromosome passed on to GenomicRegion
        initial          Binding start position
        final            Binding end position
        name             The name of this binding site (Default: None)
        score            Score of the binding pattern (Default: None)
        errors_bp        Error base pair in this binding (Default: None)
        motif            The motif for this binding (Default: None)
        strand           Accepted but not stored by this constructor (Default: None)
        orientation      Parallel or antiparallel (Default: None)
        guanine_rate     Kept as self.guanine_rate when no seq is given (Default: None)
        seq              Sequence object of this region; seq.seq must be the
                         letter string and len(seq) its length (Default: None)

        """
        GenomicRegion.__init__(self, chrom=chrom, initial=initial, final=final)

        self.name = name                      # RNA name
        self.score = score                    # Score for pattern matching
        self.errors_bp = errors_bp
        self.motif = motif
        self.orientation = orientation
        self.seq = seq                        # An object (Sequence) not just a string
        # Always define the attribute: it used to exist only when ``seq`` was
        # given, and the ``guanine_rate`` argument was silently discarded.
        self.guanine_rate = guanine_rate
        if seq:
            # Fraction of "G" letters, kept as a string with two decimals.
            self.guanine_rate = "{0:.2f}".format(float(seq.seq.count("G")) / len(seq))
Пример #5
0
    def test_match_multiple(self):
        """Scan chr1:0-5022 with the CTCF motif and compare the reported hits."""
        here = os.path.dirname(__file__)
        jaspar_dir = "../../data/motifs/jaspar_vertebrates/"

        scanner = scan.Scanner(7)

        ctcf = Motif(os.path.join(here, jaspar_dir, "MA0139.1.CTCF.pwm"), 1, 0.0001, None)

        # One entry per matrix: first ``pssm``, then ``pssm_rc``.
        thresholds = [ctcf.threshold, ctcf.threshold_rc]
        pssm_list = [ctcf.pssm, ctcf.pssm_rc]

        background = tools.flat_bg(4)
        scanner.set_motifs(pssm_list, background, thresholds)

        region = GenomicRegion("chr1", 0, 5022)

        # Sequence of the region, taken from the test's genome file
        sequence = str(self.genome_file.fetch(region.chrom, region.initial, region.final))

        hits = match_multiple(scanner, [ctcf], sequence, region)

        self.assertSequenceEqual(
            hits.sequences,
            [
                GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
                GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
            ])
Пример #6
0
    def test_match_multiple(self):
        """Scan chr1:0-5022 with the preloaded CTCF motif and compare the hits."""
        motif_set = MotifSet(preload_motifs="default")
        motif_set = motif_set.filter(
            {'database': ["jaspar_vertebrates"], 'name': ["MA0139.1.CTCF"]},
            search="inexact")

        # The filter is expected to leave exactly one motif
        self.assertEqual(len(motif_set), 1)

        ctcf = motif_set.get_motif_list(1, 0.0001)[0]

        scanner = scan.Scanner(7)

        # The same threshold is supplied for ``pssm`` and for ``pssm_rc``.
        thresholds = [ctcf.threshold, ctcf.threshold]
        pssm_list = [ctcf.pssm, ctcf.pssm_rc]

        background = tools.flat_bg(4)
        scanner.set_motifs(pssm_list, background, thresholds)

        region = GenomicRegion("chr1", 0, 5022)

        # Sequence of the region, taken from the test's genome file
        sequence = str(self.genome_file.fetch(region.chrom, region.initial, region.final))

        hits = match_multiple(scanner, [ctcf], sequence, region)

        self.assertSequenceEqual(
            hits.sequences,
            [
                GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
                GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
            ])
Пример #7
0
 def __init__(self, chrom, pos, ref, alt, qual, filter=None, id=None, info=None, format=None, genotype=None, samples=None):
     """Create a variant record on top of a GenomicRegion.

     ``chrom`` is stored as ``str`` and ``pos`` as ``int``; the underlying
     region is initialised with ``pos`` and ``pos + 1`` as its bounds.
     The remaining arguments are stored unchanged under attributes of the
     same name, and their ``str()`` forms are joined with ``"_$_"`` into
     ``self.data`` in the order id, ref, alt, qual, filter, info, format,
     genotype, samples.
     """
     # Normalise first: ``pos + 1`` raises TypeError for a string position,
     # although the int() conversion below shows such input is expected.
     chrom = str(chrom)
     pos = int(pos)
     GenomicRegion.__init__(self, chrom, pos, pos + 1)

     self.chrom = chrom
     self.pos = pos
     self.id = id
     self.ref = ref
     self.alt = alt
     self.qual = qual
     self.filter = filter
     self.info = info
     self.format = format
     self.genotype = genotype
     self.samples = samples

     # NOTE(review): this stores the bound ``__str__`` method itself, not
     # the string it returns -- confirm that this is intended.
     self.name = self.__str__
     fields = [self.id, self.ref, self.alt, self.qual, self.filter,
               self.info, self.format, self.genotype, self.samples]
     self.data = "_$_".join(map(str, fields))
Пример #8
0
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts,
               inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content,
               tracker, debug, norm_regions, scaling_factors_ip, save_wig):
    """Initialize the MultiCoverageSet.

    The regions to search for DPs come from the tab-separated ``regions``
    file (``chrom start end``) when it is given, otherwise from the
    tab-separated ``chrom_sizes`` file (``chrom size``), in which case every
    chromosome is used from 0 to its size. Blank lines in either file are
    skipped. ``chrom_sizes_dict`` maps each chromosome to the end coordinate
    of the last line read for it.

    ``norm_regions``, when truthy, is read as a BED file into a separate
    region set that is handed to the MultiCoverageSet; otherwise None is
    passed. Extension sizes are resolved through ``_compute_extension_sizes``
    and reported via ``tracker.write``.

    Returns the constructed MultiCoverageSet.
    """
    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}
    # if regions option is set, take the values, otherwise the whole set of
    # chromosomes as region to search for DPs
    if regions is not None:
        print("Call DPs on specified regions.", file=sys.stderr)
        with open(regions) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line would otherwise raise
                    # IndexError while the columns are indexed.
                    continue
                fields = line.split('\t')
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                chrom_sizes_dict[c] = e
    else:
        print("Call DPs on whole genome.", file=sys.stderr)
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, end = fields[0], int(fields[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                chrom_sizes_dict[chrom] = end

    if norm_regions:
        norm_regionset = GenomicRegionSet('norm_regions')
        norm_regionset.read_bed(norm_regions)
    else:
        norm_regionset = None

    regionset.sequences.sort()
    exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, verbose)
    tracker.write(text=str(exts).strip('[]'), header="Extension size (rep1, rep2, input1, input2)")

    multi_cov_set = MultiCoverageSet(
        name=name, regions=regionset, dims=dims, genome_path=genome_path,
        binsize=binsize, stepsize=stepsize, rmdup=True,
        path_bamfiles=bamfiles, path_inputs=inputs, exts=exts,
        exts_inputs=exts_inputs, factors_inputs=factors_inputs,
        chrom_sizes=chrom_sizes, verbose=verbose, no_gc_content=no_gc_content,
        chrom_sizes_dict=chrom_sizes_dict, debug=debug,
        norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip,
        save_wig=save_wig)

    return multi_cov_set
Пример #9
0
def intersect(gnrsA, gnrsB, overlap_type):
    """Intersect two region sets by delegating to one of three C routines.

    gnrsA, gnrsB -- region sets; ``len()`` and ``.sequences`` (items with
                    ``chrom``, ``initial`` and ``final``) are read from both,
                    ``.name`` from gnrsA only.
    overlap_type -- 0 calls ``intersect_overlap_c``, 1 calls
                    ``intersect_original_c``, 2 calls
                    ``intersect_completely_included_c``. For any other value
                    no routine is called and the returned set stays empty.

    Returns a new ``GenomicRegionSet`` named after gnrsA. Each result takes
    its chromosome from gnrsA (through the index reported by the C routine)
    and its bounds from the C output buffers.
    """
    # Convert to ctypes
    lenA = len(gnrsA)
    lenB = len(gnrsB)
    # Number of slots allocated for each output buffer below
    lenR = min(lenA, lenB)

    # NOTE(review): on Python 3 a c_char_p array only accepts bytes (or None);
    # this assumes gr.chrom is a byte string -- confirm.
    chromsA_python = [gr.chrom for gr in gnrsA.sequences]
    chromsA_c = (c_char_p * lenA)(*chromsA_python)

    chromsB_python = [gr.chrom for gr in gnrsB.sequences]
    chromsB_c = (c_char_p * lenB)(*chromsB_python)

    initialsA_python = [gr.initial for gr in gnrsA.sequences]
    initialsA_c = (c_int * lenA)(*initialsA_python)

    initialsB_python = [gr.initial for gr in gnrsB.sequences]
    initialsB_c = (c_int * lenB)(*initialsB_python)

    finalsA_python = [gr.final for gr in gnrsA.sequences]
    finalsA_c = (c_int * lenA)(*finalsA_python)

    finalsB_python = [gr.final for gr in gnrsB.sequences]
    finalsB_c = (c_int * lenB)(*finalsB_python)

    # Output buffers. They are handed over as pointer-to-pointer below;
    # presumably the C side may redirect them -- TODO confirm.
    indices_c = POINTER(c_int)((c_int * lenR)())
    initialsR_c = POINTER(c_int)((c_int * lenR)())
    finalsR_c = POINTER(c_int)((c_int * lenR)())
    # Receives the number of results; a fresh c_int() holds 0
    sizeR_c = c_int()

    # Call C-function
    if overlap_type == 0:
        intersect_overlap_c(chromsA_c, initialsA_c, finalsA_c, lenA, chromsB_c,
                            initialsB_c, finalsB_c, lenB, pointer(indices_c),
                            pointer(initialsR_c), pointer(finalsR_c),
                            byref(sizeR_c))
    elif overlap_type == 1:
        intersect_original_c(chromsA_c, initialsA_c, finalsA_c, lenA,
                             chromsB_c, initialsB_c, finalsB_c, lenB,
                             pointer(indices_c), pointer(initialsR_c),
                             pointer(finalsR_c), byref(sizeR_c))
    elif overlap_type == 2:
        intersect_completely_included_c(chromsA_c, initialsA_c, finalsA_c,
                                        lenA, chromsB_c, initialsB_c,
                                        finalsB_c, lenB, pointer(indices_c),
                                        pointer(initialsR_c),
                                        pointer(finalsR_c), byref(sizeR_c))

    # Rebuild Python regions from the first sizeR_c.value output slots
    result = GenomicRegionSet(gnrsA.name)
    for i in range(sizeR_c.value):
        result.add(
            GenomicRegion(chromsA_python[indices_c[i]], initialsR_c[i],
                          finalsR_c[i]))

    return result
Пример #10
0
    def __init__(self,
                 chrom,
                 initial,
                 final,
                 name=None,
                 score=None,
                 errors_bp=None,
                 motif=None,
                 strand=None,
                 orientation=None,
                 guanine_rate=None,
                 seq=None):
        """Create a binding site on top of a GenomicRegion.

        *Keyword arguments:*

            - chrom -- Chromosome passed on to GenomicRegion
            - initial -- Binding start position
            - final -- Binding end position
            - name -- The name of this binding site (Default: None)
            - score -- Score of the binding pattern (Default: None)
            - errors_bp -- Error base pair in this binding (Default: None)
            - motif -- The motif for this binding (Default: None)
            - strand -- Accepted but not stored by this constructor (Default: None)
            - orientation -- Parallel or antiparallel (Default: None)
            - guanine_rate -- Value kept as ``self.guanine_rate`` when no
              ``seq`` is given (Default: None)
            - seq -- Sequence object of this region; ``seq.seq`` must be the
              letter string and ``len(seq)`` its length (Default: None)
        """
        GenomicRegion.__init__(self, chrom=chrom, initial=initial, final=final)

        self.name = name  # RNA name
        self.score = score  # Score for pattern matching
        self.errors_bp = errors_bp
        self.motif = motif
        self.orientation = orientation
        self.seq = seq  # An object (Sequence) not just a string
        # Always define the attribute: it used to exist only when ``seq`` was
        # given, and the ``guanine_rate`` argument was silently discarded.
        self.guanine_rate = guanine_rate
        if seq:
            # Fraction of "G" letters, kept as a string with two decimals.
            self.guanine_rate = "{0:.2f}".format(
                float(seq.seq.count("G")) / len(seq))
Пример #11
0
def merge_delete(ext_size, merge, peak_list, pvalue_list):
    """Filter peaks by size, optionally merge nearby ones, and return them sorted.

    ext_size    -- peaks are kept only when ``end - start > ext_size``; half
                   of it (floor) is the distance used while merging.
    merge       -- when truthy, the '+' and '-' sets are each extended by
                   half of ext_size on both sides, merged, shrunk back by the
                   same amount and passed to ``merge_data``.
    peak_list   -- items indexable as
                   (chrom, start, end, c1, c2, strand, ratio).
    pvalue_list -- indexed in parallel with peak_list.

    Returns a sorted ``GenomicRegionSet`` with the '+' set, the '-' set and
    the unmergeable set, added in that order before sorting.
    """
    regions_plus = GenomicRegionSet('regions')  # pot. mergeable
    regions_minus = GenomicRegionSet('regions')  # pot. mergeable
    regions_unmergable = GenomicRegionSet('regions')
    # NOTE(review): this value is never reassigned, so every kept peak
    # currently ends up in regions_unmergable. Whether and where it should
    # follow the strand of the previous peak cannot be told from here.
    last_orientation = ""

    for i, t in enumerate(peak_list):
        chrom, start, end, c1, c2, strand, ratio = (t[0], t[1], t[2], t[3],
                                                    t[4], t[5], t[6])
        r = GenomicRegion(chrom=chrom, initial=start, final=end, name='',
                          orientation=strand,
                          data=str((c1, c2, pvalue_list[i], ratio)))
        if end - start > ext_size:
            if strand == '+':
                if last_orientation == '+':
                    # was ``region_plus`` -- an undefined name (NameError)
                    regions_plus.add(r)
                else:
                    regions_unmergable.add(r)
            elif strand == '-':
                if last_orientation == '-':
                    # was ``region_mins`` -- an undefined name (NameError)
                    regions_minus.add(r)
                else:
                    regions_unmergable.add(r)

    if merge:
        # Floor division keeps the coordinates integral under true division,
        # and negating the same value makes the shrink step exactly undo the
        # extension even for an odd ext_size.
        half = ext_size // 2
        for mergeable in (regions_plus, regions_minus):
            mergeable.extend(half, half)
            mergeable.merge()
            mergeable.extend(-half, -half)
            merge_data(mergeable)

    results = GenomicRegionSet('regions')
    for region_set in (regions_plus, regions_minus, regions_unmergable):
        for el in region_set:
            results.add(el)
    results.sort()

    return results
Пример #12
0
def rna_associated_gene(rna_regions, name, organism):
    """Return the genes associated with the span of an RNA, joined by ":".

    rna_regions -- sequence of items indexable as (chrom, start, end, strand);
                   chromosome and strand are taken from the first item, the
                   span runs from the smallest start to the largest end.
    name        -- name given to the query region; associated gene names that
                   contain it as a substring are left out.
    organism    -- forwarded to ``gene_association`` (promoter length 1000).

    Returns "." when rna_regions is empty or no gene name remains.
    """
    if not rna_regions:
        return "."

    first = rna_regions[0]
    chrom = first[0]
    span_start = min(e[1] for e in rna_regions)
    span_end = max(e[2] for e in rna_regions)
    strand = first[3]

    query = GenomicRegionSet("RNA associated genes")
    query.add(GenomicRegion(chrom=chrom, initial=span_start, final=span_end,
                            name=name, orientation=strand))
    associated = query.gene_association(organism=organism, promoterLength=1000,
                                        show_dis=True)

    # The association of the first region carries ":"-separated gene names
    closest_genes = {gene for gene in associated[0].name.split(":")
                     if name not in gene}

    if closest_genes:
        return ":".join(closest_genes)
    return "."
Пример #13
0
    def test_cmp(self):
        """Check the ordering operators of GenomicRegion against chr 1, 10-20."""
        ref = GenomicRegion(chrom=1, initial=10, final=20)

        # same chromosome, later start
        later_start = GenomicRegion(chrom=1, initial=12, final=22)
        self.assertTrue(ref < later_start)

        # same chromosome, earlier start
        earlier_start = GenomicRegion(chrom=1, initial=8, final=18)
        self.assertTrue(ref > earlier_start)

        # same start, earlier end
        shorter = GenomicRegion(chrom=1, initial=10, final=12)
        self.assertTrue(ref > shorter)

        # later start, earlier end
        inner = GenomicRegion(chrom=1, initial=12, final=14)
        self.assertTrue(ref < inner)

        # different chromosome ('X') with smaller coordinates
        on_x = GenomicRegion(chrom='X', initial=4, final=8)
        self.assertTrue(ref < on_x)

        # same start, earlier end, non-strict comparison
        same_start = GenomicRegion(chrom=1, initial=10, final=18)
        self.assertTrue(ref >= same_start)
Пример #14
0
 def test_len(self):
     """A region from 10 to 20 must report a length of 10."""
     region = GenomicRegion(chrom=1, initial=10, final=20)
     measured = len(region)
     self.assertEqual(measured, 10)
Пример #15
0
    def test_overlap(self):
        """Check GenomicRegion.overlap for chr 1, 10-15 and for empty regions."""
        base = GenomicRegion(chrom=1, initial=10, final=15)

        # (initial, final, expected base.overlap result), in checked order
        cases = [
            # usual cases
            (20, 25, False),
            (0, 5, False),
            (7, 12, True),
            (12, 18, True),
            (12, 14, True),
            # candidate within base
            (11, 13, True),
            # border cases
            (5, 10, False),   # touch, but do not overlap
            (5, 11, True),    # here, they overlap
            (15, 20, False),  # touch, do not overlap
            (14, 20, True),   # overlap in 1 bp (14th)
        ]
        for initial, final, expected in cases:
            other = GenomicRegion(chrom=1, initial=initial, final=final)
            check = self.assertTrue if expected else self.assertFalse
            check(base.overlap(other))

        # a zero-length region at 10 never overlaps
        for initial, final in [(10, 10), (11, 11), (5, 10)]:
            empty = GenomicRegion(chrom=1, initial=10, final=10)
            other = GenomicRegion(chrom=1, initial=initial, final=final)
            self.assertFalse(empty.overlap(other))
Пример #16
0
    def test_extend(self):
        """Check GenomicRegion.extend on fresh copies of chr 1, 10-20."""
        # (left, right, expected initial, expected final or None if unchecked)
        cases = [
            (5, 15, 5, 35),      # normal extend
            (-5, -1, 15, 19),    # use negative values to extend
            (15, 0, 0, None),    # extend to under zero
            (-50, -50, 0, 60),   # initial and final coordinate change
        ]
        for left, right, want_initial, want_final in cases:
            region = GenomicRegion(chrom=1, initial=10, final=20)
            region.extend(left, right)
            self.assertEqual(region.initial, want_initial)
            if want_final is not None:
                self.assertEqual(region.final, want_final)
Пример #17
0
def initialize(name, genome_path, regions, stepsize, binsize, bam_file_1, bam_file_2, ext_1, ext_2,
               input_1, input_factor_1, ext_input_1, input_2, input_factor_2, ext_input_2,
               chrom_sizes, verbose, norm_strategy, no_gc_content, deadzones,
               factor_input_1, factor_input_2, debug, tracker):
    """Build the DualCoverageSet for two BAM files and resolve extension sizes.

    The regions to search for DPs come from the tab-separated ``regions``
    file (``chrom start end``) when it is given, otherwise from the
    tab-separated ``chrom_sizes`` file (``chrom size``), in which case every
    chromosome is used from 0 to its size. Blank lines in either file are
    skipped.

    Every extension size passed as None is estimated with
    ``get_extension_size``. If both inputs are the same and the first input's
    size was estimated here, that estimate is reused for the second input.
    With ``verbose`` set, the values returned by each estimation are written
    to ``<name>-read-ext-*`` files.

    Returns ``(coverage_set, [ext_1, ext_2])``.
    """
    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}
    # if regions option is set, take the values, otherwise the whole set of
    # chromosomes as region to search for DPs
    if regions is not None:
        with open(regions) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line would otherwise raise
                    # IndexError while the columns are indexed.
                    continue
                fields = line.split('\t')
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                chrom_sizes_dict[c] = e
    else:
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, end = fields[0], int(fields[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                chrom_sizes_dict[chrom] = end

    regionset.sequences.sort()

    # Search window handed to get_extension_size
    start = 0
    end = 600
    ext_stepsize = 5

    # Defined up front so that later code can test them directly instead of
    # probing locals() for names that may never have been bound.
    values_1 = values_2 = values_input_1 = values_input_2 = None
    input_1_estimated = False

    # compute extension size
    if [ext_1, ext_2, ext_input_1, ext_input_2].count(None) > 0:
        print("Computing read extension sizes...", file=sys.stderr)
    if ext_1 is None:
        ext_1, values_1 = get_extension_size(bam_file_1,
                                             start=start,
                                             end=end,
                                             stepsize=ext_stepsize)
    print("Read extension for first file: %s" % ext_1, file=sys.stderr)

    if ext_2 is None:
        ext_2, values_2 = get_extension_size(bam_file_2,
                                             start=start,
                                             end=end,
                                             stepsize=ext_stepsize)
    print("Read extension for second file: %s" % ext_2, file=sys.stderr)

    if input_1 is not None and ext_input_1 is None:
        ext_input_1, values_input_1 = get_extension_size(input_1,
                                                         start=start,
                                                         end=end,
                                                         stepsize=ext_stepsize)
        input_1_estimated = True
    print("Read extension for first input file: %s" % ext_input_1,
          file=sys.stderr)

    if (input_1 is not None and input_2 is not None and input_1 == input_2
            and input_1_estimated):
        # Same control file: reuse the estimate instead of recomputing it
        ext_input_2, values_input_2 = ext_input_1, values_input_1
    elif input_2 is not None and ext_input_2 is None:
        ext_input_2, values_input_2 = get_extension_size(input_2,
                                                         start=start,
                                                         end=end,
                                                         stepsize=ext_stepsize)
    print("Read extension for second input file: %s" % ext_input_2,
          file=sys.stderr)

    tracker.write(text=str(ext_1) + "," + str(ext_2),
                  header="Extension size IP1, IP2")
    if input_1 is not None and input_2 is not None:
        tracker.write(text=str(ext_input_1) + "," + str(ext_input_2),
                      header="Extension size Control1, Control2")

    if verbose:
        dumps = (
            ('-read-ext-1', values_1),
            ('-read-ext-2', values_2),
            ('-read-ext-input-1', values_input_1),
            ('-read-ext-input-2', values_input_2),
        )
        for suffix, values in dumps:
            if values is None:
                continue
            with open(name + suffix, 'w') as f:
                # Each pair is written with its two members swapped
                for v, i in values:
                    print(i, v, sep='\t', file=f)

    cov_cdp_mpp = DualCoverageSet(
        name=name, region=regionset, genome_path=genome_path,
        binsize=binsize, stepsize=stepsize, rmdup=True,
        file_1=bam_file_1, ext_1=ext_1,
        file_2=bam_file_2, ext_2=ext_2,
        input_1=input_1, ext_input_1=ext_input_1, input_factor_1=input_factor_1,
        input_2=input_2, ext_input_2=ext_input_2, input_factor_2=input_factor_2,
        chrom_sizes=chrom_sizes, verbose=verbose, norm_strategy=norm_strategy,
        no_gc_content=no_gc_content, deadzones=deadzones,
        factor_input_1=factor_input_1, factor_input_2=factor_input_2,
        chrom_sizes_dict=chrom_sizes_dict, debug=debug, tracker=tracker)

    return cov_cdp_mpp, [ext_1, ext_2]
Пример #18
0
 def test_extend(self):
     """Check GenomicRegion.extend on fresh copies of chr 1, 10-20."""
     # normal extend
     plain = GenomicRegion(chrom=1, initial=10, final=20)
     plain.extend(5, 15)
     self.assertEqual(plain.initial, 5)
     self.assertEqual(plain.final, 35)

     # use negative values to extend
     shrunk = GenomicRegion(chrom=1, initial=10, final=20)
     shrunk.extend(-5, -1)
     self.assertEqual(shrunk.initial, 15)
     self.assertEqual(shrunk.final, 19)

     # extend to under zero
     clipped = GenomicRegion(chrom=1, initial=10, final=20)
     clipped.extend(15, 0)
     self.assertEqual(clipped.initial, 0)

     # extend so that initial and final coordinate change
     flipped = GenomicRegion(chrom=1, initial=10, final=20)
     flipped.extend(-50, -50)
     self.assertEqual(flipped.initial, 0)
     self.assertEqual(flipped.final, 60)
Пример #19
0
 def test_overlap(self):
     """Check GenomicRegion.overlap for chr 1, 10-15 and for empty regions."""
     base = GenomicRegion(chrom=1, initial=10, final=15)

     # usual cases
     self.assertFalse(base.overlap(GenomicRegion(chrom=1, initial=20, final=25)))
     self.assertFalse(base.overlap(GenomicRegion(chrom=1, initial=0, final=5)))
     self.assertTrue(base.overlap(GenomicRegion(chrom=1, initial=7, final=12)))
     self.assertTrue(base.overlap(GenomicRegion(chrom=1, initial=12, final=18)))
     self.assertTrue(base.overlap(GenomicRegion(chrom=1, initial=12, final=14)))

     # candidate within base
     self.assertTrue(base.overlap(GenomicRegion(chrom=1, initial=11, final=13)))

     # border cases
     # GenomicRegions touch, but do not overlap
     self.assertFalse(base.overlap(GenomicRegion(chrom=1, initial=5, final=10)))

     # here, they overlap
     self.assertTrue(base.overlap(GenomicRegion(chrom=1, initial=5, final=11)))

     # they touch, do not overlap
     self.assertFalse(base.overlap(GenomicRegion(chrom=1, initial=15, final=20)))

     # they overlap in 1 bp (14th)
     self.assertTrue(base.overlap(GenomicRegion(chrom=1, initial=14, final=20)))

     # both have zero length, same position
     empty = GenomicRegion(chrom=1, initial=10, final=10)
     self.assertFalse(empty.overlap(GenomicRegion(chrom=1, initial=10, final=10)))

     # both have zero length, different positions
     empty = GenomicRegion(chrom=1, initial=10, final=10)
     self.assertFalse(empty.overlap(GenomicRegion(chrom=1, initial=11, final=11)))

     # zero-length region at the end of a non-empty one
     empty = GenomicRegion(chrom=1, initial=10, final=10)
     self.assertFalse(empty.overlap(GenomicRegion(chrom=1, initial=5, final=10)))
Пример #20
0
def dbd_regions(exons, sig_region, rna_name, output, out_file=False, temp=None, fasta=True):
    """Generate the BED file of significant DBD regions and FASTA file of the sequences.

    The positions of each binding site are interpreted as offsets along the
    concatenated exons. Each site is projected onto the exon that contains it;
    a site that runs past the end of an exon is written as two regions whose
    names end in "_split1" and "_split2".

    *Keyword arguments:*

        - exons -- Sequence of exon records indexed as [0] chromosome, [1] and [2]
          coordinates, [3] strand. Chromosome and strand are taken from the first
          record. The records are only read, never modified. If empty, no BED
          file is written.
        - sig_region -- Binding sites; each item provides ``initial``, ``final`` and
          ``chrom``. Nothing is done when it is empty.
        - rna_name -- Name used in the output file names and in the FASTA headers.
        - output -- Output directory when ``out_file`` is false, otherwise the path
          of the BED file itself (the FASTA file is then ``output + ".fa"``).
        - out_file -- Selects how ``output`` is interpreted (Default: False)
        - temp -- Directory holding "rna_temp.fa"; only used when ``out_file`` is
          true (Default: None)
        - fasta -- Whether to write the FASTA file (Default: True)
    """
    if len(sig_region) == 0:
        return

    if exons:
        dbd = GenomicRegionSet("DBD")
        # NOTE(review): dbdmap is filled but never read in this function;
        # it looks like a debugging aid.
        dbdmap = {}
        if len(exons) == 1:
            print("## Warning: No information of exons in the given RNA sequence, the DBD position may be problematic. ")

        minus_strand = exons[0][3] == "-"

        for rbs in sig_region:
            cf = 0  # summed length of the exons already walked past

            if minus_strand:
                for exon in exons:
                    exon_len = abs(exon[2] - exon[1])
                    tail = cf + exon_len

                    if cf <= rbs.initial <= tail:
                        # offsets are counted back from exon[2] on this strand
                        dbdstart = exon[2] - rbs.initial + cf

                        if rbs.final <= tail:
                            # the whole site fits inside this exon
                            dbdend = exon[2] - rbs.final + cf
                            if dbdstart > dbdend:
                                dbdstart, dbdend = dbdend, dbdstart
                            dbd.add(GenomicRegion(chrom=exons[0][0],
                                                  initial=dbdstart, final=dbdend,
                                                  orientation=exons[0][3],
                                                  name=str(rbs.initial) + "-" + str(rbs.final)))
                            dbdmap[str(rbs)] = dbd[-1].toString() + " strand:-"
                            break

                        # The site runs past this exon: emit the first half.
                        # The bounds are ordered with min/max instead of swapping
                        # into exon[1], which used to overwrite the caller's exon
                        # record and corrupt the mapping of all later sites.
                        subtract = exon_len + cf - rbs.initial
                        dbd.add(GenomicRegion(chrom=exons[0][0],
                                              initial=min(dbdstart, exon[1]),
                                              final=max(dbdstart, exon[1]),
                                              orientation=exons[0][3],
                                              name=str(rbs.initial) + "-" + str(rbs.initial + subtract) + "_split1"))

                    elif rbs.initial < cf and rbs.final <= tail:
                        # Second half of a split site. `subtract` was set when the
                        # first half was emitted on an earlier exon.
                        # NOTE(review): a site with a negative `initial` could reach
                        # this branch first -- confirm inputs are non-negative.
                        dbdstart = exon[2]
                        dbdend = exon[2] - rbs.final + rbs.initial + subtract
                        if dbdstart > dbdend:
                            dbdstart, dbdend = dbdend, dbdstart
                        dbd.add(GenomicRegion(chrom=exons[0][0],
                                              initial=dbdstart, final=dbdend,
                                              orientation=exons[0][3],
                                              name=str(cf) + "-" + str(rbs.final) + "_split2"))
                        dbdmap[str(rbs)] = dbd[-2].toString() + " & " + dbd[-1].toString() + " strand:-"
                        break

                    cf += exon_len
            else:
                for exon in exons:
                    exon_len = exon[2] - exon[1]
                    tail = cf + exon_len

                    if cf <= rbs.initial <= tail:
                        dbdstart = exon[1] + rbs.initial - cf

                        if rbs.final <= tail:
                            # the whole site fits inside this exon
                            dbdend = exon[1] + rbs.final - cf
                            dbd.add(GenomicRegion(chrom=exons[0][0],
                                                  initial=dbdstart, final=dbdend,
                                                  orientation=exons[0][3],
                                                  name=str(rbs.initial) + "-" + str(rbs.final)))
                            dbdmap[str(rbs)] = dbd[-1].toString() + " strand:+"
                            break

                        # the site runs past this exon: emit the first half
                        subtract = exon_len + cf - rbs.initial
                        dbd.add(GenomicRegion(chrom=exons[0][0],
                                              initial=dbdstart, final=exon[2],
                                              orientation=exons[0][3],
                                              name=str(rbs.initial) + "-" + str(rbs.initial + subtract) + "_split1"))

                    elif rbs.initial < cf and rbs.final <= tail:
                        # Second half of a split site; see the note on `subtract`
                        # in the minus-strand branch.
                        dbdstart = exon[1]
                        dbdend = exon[1] + rbs.final - rbs.initial - subtract
                        dbd.add(GenomicRegion(chrom=exons[0][0],
                                              initial=dbdstart, final=dbdend,
                                              orientation=exons[0][3],
                                              name=str(cf) + "-" + str(rbs.final) + "_split2"))
                        dbdmap[str(rbs)] = dbd[-2].toString() + " & " + dbd[-1].toString() + " strand:+"
                        break

                    cf += exon_len

        if not out_file:
            dbd.write_bed(filename=os.path.join(output, "DBD_" + rna_name + ".bed"))
        else:
            dbd.write_bed(filename=output)

    # FASTA
    if fasta:
        if not out_file:
            seq = pysam.Fastafile(os.path.join(output, "rna_temp.fa"))
            fasta_f = os.path.join(output, "DBD_" + rna_name + ".fa")
        else:
            seq = pysam.Fastafile(os.path.join(temp, "rna_temp.fa"))
            fasta_f = output + ".fa"

        try:
            with open(fasta_f, 'w') as fasta_out:
                for rbs in sig_region:
                    print(">" + rna_name + ":" + str(rbs.initial) + "-" + str(rbs.final), file=fasta_out)
                    s = seq.fetch(rbs.chrom, max(0, rbs.initial), rbs.final)
                    # wrap the sequence at 80 characters per line
                    for i in range(0, len(s), 80):
                        print(s[i:i + 80], file=fasta_out)
        finally:
            # release the indexed FASTA handle even if writing fails
            seq.close()
Пример #21
0
    ints = [gr.initial for gr in gnrsB.sequences]
    initialsB = (c_int * len(ints))(*ints)

    ints = [gr.final for gr in gnrsA.sequences]
    finalsA = (c_int * len(ints))(*ints)

    ints = [gr.final for gr in gnrsB.sequences]
    finalsB = (c_int * len(ints))(*ints)

    # Call C-function
    return jaccardC(chromsA, initialsA, finalsA, len(gnrsA), chromsB,
                    initialsB, finalsB, len(gnrsB))


# Usage example executed at module level: build two small region sets on
# chr1, print their contents, then print the value jaccardIndex returns for them.
set1 = GenomicRegionSet("A")
set1.add(GenomicRegion("chr1", 0, 10))
set1.add(GenomicRegion("chr1", 15, 20))
set1.add(GenomicRegion("chr1", 30, 45))
print(set1.sequences)
set2 = GenomicRegionSet("B")
set2.add(GenomicRegion("chr1", 0, 5))
set2.add(GenomicRegion("chr1", 10, 25))
set2.add(GenomicRegion("chr1", 35, 45))
print(set2.sequences)

# Compare the two sets through the ctypes-backed jaccardIndex wrapper.
jaccard2 = jaccardIndex(set1, set2)
print("jaccard2", jaccard2)


def intersect(gnrsA, gnrsB, overlap_type):
    # Convert to ctypes
Пример #22
0
def match_single(motif,
                 sequence,
                 genomic_region,
                 unique_threshold=None,
                 normalize_bitscore=True,
                 sort=False):
    """
    Search a sequence for occurrences of one motif and return them as genomic regions.
    The genomic_region is needed to evaluate the correct binding position.

    Keyword arguments:
    motif -- Motif object; its pssm_list, threshold, max, len, name and is_palindrome
             attributes are read here.
    sequence -- A DNA sequence (string).
    genomic_region -- A GenomicRegion; its chrom and initial are used to place each match.
    unique_threshold -- If this argument is provided, the motif search will be made using a threshold of 0 and
                        then accepting only the motif matches with bitscore/motif_length >= unique_threshold.
    normalize_bitscore -- If true, the stored score is rescaled to an integer where the
                          acceptance threshold maps to 0 and the motif maximum to 1000
                          (1000 is stored when the maximum does not exceed the threshold).
                          Otherwise the bitscore itself is stored, divided by the motif
                          length when unique_threshold is given.
    sort -- If true, the resulting set is sorted before it is returned.

    Return:
    A GenomicRegionSet named "mpbs" with one region per accepted match; the score is
    stored as a string in the region's data field.
    """

    # Establishing threshold
    if unique_threshold:
        current_threshold = 0.0
        eval_threshold = unique_threshold
        motif_max = motif.max / motif.len
    else:
        current_threshold = motif.threshold
        eval_threshold = motif.threshold
        motif_max = motif.max

    # Performing motif matching
    try:
        # old MOODS version
        results = MOODS.search(sequence, [motif.pssm_list],
                               current_threshold,
                               absolute_threshold=True,
                               both_strands=True)
    except (AttributeError, TypeError):
        # The old entry point is missing or has another signature: use the newer API.
        # TODO: we can expand this to use bg from sequence, for example,
        # or from organism.
        bg = MOODS.tools.flat_bg(4)
        results = MOODS.scan.scan_dna(sequence, [motif.pssm_list], bg,
                                      [current_threshold], 7)

    grs = GenomicRegionSet("mpbs")

    for search_result in results:
        for r in search_result:
            try:
                position = r.pos
                score = r.score
            except AttributeError:
                # results without pos/score attributes are (position, score) pairs
                (position, score) = r

            # In unique-threshold mode every comparison is made on the per-base
            # score, so that it is on the same scale as eval_threshold and motif_max.
            if unique_threshold:
                hit_score = score / motif.len
            else:
                hit_score = score

            # Verifying unique threshold acceptance
            if unique_threshold and hit_score < unique_threshold:
                continue

            # If match forward strand
            if position >= 0:
                p1 = genomic_region.initial + position
                strand = "+"
            # If match reverse strand (reported with a negative position)
            elif not motif.is_palindrome:
                p1 = genomic_region.initial - position
                strand = "-"
            else:
                # reverse-strand matches of a palindromic motif are skipped
                continue

            # Evaluating p2
            p2 = p1 + motif.len

            # Evaluating score (integer between 0 and 1000 -- needed for bigbed transformation)
            if normalize_bitscore:
                if motif_max > eval_threshold:
                    norm_score = int(((hit_score - eval_threshold) * 1000.0) /
                                     (motif_max - eval_threshold))
                else:
                    norm_score = 1000
            else:
                # Keep the original bitscore
                norm_score = hit_score

            grs.add(
                GenomicRegion(genomic_region.chrom,
                              int(p1),
                              int(p2),
                              name=motif.name,
                              orientation=strand,
                              data=str(norm_score)))

    if sort:
        grs.sort()

    return grs
Пример #23
0
def call_peaks(bam, csizes, pval, min_reads, binsize, cfile=None):
    '''
    Call peaks on bam file using a binomial model.

    Arguments:
    bam -- BAM file the coverage is computed from.
    csizes -- Chromosome sizes, passed to get_chrom_sizes_as_genomicregionset.
    pval -- NOTE(review): not referenced anywhere in this function; the bin
            selection is delegated to filter_bins -- confirm whether the
            p-value cut-off should be forwarded.
    min_reads -- Forwarded to filter_bins together with each bin's coverage.
    binsize -- Bin size of the second coverage set (its step size is binsize // 2).
    cfile -- Optional control BAM file; when given, the coverage is passed
             through norm_igg with the control coverage before peak calling.

    Returns a tuple (peaks, cov, cov2): the clustered GenomicRegionSet with
    peaks, the CoverageSet used for calling, and the binned CoverageSet
    computed for output.
    '''

    # make chromsizes region set
    rs = get_chrom_sizes_as_genomicregionset(csizes)

    print("calculating extension sizes...")
    # calculate ext size
    ext, _ = get_extension_size(bam, start=0, end=300, stepsize=5)

    print("calculating coverage...")
    # calc coverage
    cov = CoverageSet('coverageset', rs)
    cov.coverage_from_bam(bam_file=bam, extension_size=ext, paired_reads=True)

    # calculate cov2 for output bw
    cov2 = CoverageSet('coverageset2', rs)
    cov2.coverage_from_bam(bam_file=bam,
                           extension_size=ext,
                           paired_reads=True,
                           binsize=binsize,
                           stepsize=binsize // 2)

    if cfile is not None:
        print(f"Using control file: {cfile}")
        control = CoverageSet('contorl', rs)
        control.coverage_from_bam(bam_file=cfile, extension_size=ext)
        with np.errstate(divide='ignore', invalid='ignore'):
            norm_igg(cov, control)

        # recalc overall coverage: join the per-region arrays in one call
        # instead of re-copying the growing result once per region
        cov.overall_cov = np.concatenate(
            [cov.coverage[i] for i in range(len(cov.genomicRegions))])

    # total coverage
    total_cov = np.sum(cov.overall_cov)
    # probability of event, a read in a bin, (avg reads/bin )/libsize
    p = np.mean(cov.overall_cov[cov.overall_cov > 0]) / total_cov

    # what is the max coverage
    maxcov = np.max(cov.overall_cov)

    # create dict with probability for each count value
    mc = np.arange(0, maxcov + 1, dtype="object")
    d = {count: binom_test((count, total_cov - count), p=p) for count in mc}

    # create GenomicRegionSet to hold peaks
    res = GenomicRegionSet('identified_peaks')

    print("calculating peaks...")
    # iterate through bins in genome, store peaks
    for i, c in enumerate(cov.overall_cov):
        if filter_bins(c, d, min_reads):
            chrom, start, end = cov.index2coordinates(i, rs)
            res.add(GenomicRegion(chrom, start, end + 1, data=d[c]))

    # merge ol peaks
    res.merge()

    # merge peaks within ext dist
    rc = res.cluster(ext)

    return rc, cov, cov2