예제 #1
0
 def __init__(self):
     """Locate the GC-correction test fixtures and publish the module globals.

     Stores the fixture paths (2bit genome, BAM file, mappability bigWig) and
     the two chromosome names on the instance, then rebinds the module-level
     ``debug`` flag and the ``global_vars`` dictionary.  The BAM and 2bit
     files are opened here only to fill in ``total_reads`` and
     ``genome_size``.
     """
     import os
     self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
     self.tbitFile = self.root + "sequence.2bit"
     self.bamFile = self.root + "test.bam"
     self.mappability = self.root + "mappability.bw"
     self.chrNameBam = '2L'
     self.chrNameBit = 'chr2L'
     bam = bamHandler.openBam(self.bamFile)
     tbit = py2bit.open(self.tbitFile)
     global debug
     debug = 0
     global global_vars
     # Every key appears exactly once: a repeated key in a dict literal is
     # silently overwritten by the later entry.
     global_vars = {'2bit': self.tbitFile,
                    'bam': self.bamFile,
                    'filter_out': None,
                    'mappability': self.mappability,
                    'extra_sampling_file': None,
                    'max_reads': 5,
                    'min_reads': 0,
                    'reads_per_bp': 0.3,
                    'total_reads': bam.mapped,
                    'genome_size': sum(tbit.chroms().values())
                    }
예제 #2
0
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """Pair read counts with GC content for sampled windows of one region.

    Windows of ``regionSize`` bases are anchored at the positions returned by
    ``getPositionsToSample`` for ``(start, end)`` with step ``stepSize``.

    Args:
        chromNameBam: chromosome name as used in the BAM file.
        start, end: bounds of the genome region passed to
            ``getPositionsToSample``.
        stepSize: sampling step passed to ``getPositionsToSample``.
        regionSize: length of each evaluated window.
        chrNameBamToBit: mapping from BAM chromosome names to 2bit names.
        verbose: when true, print the window and the error whenever
            ``getGC_content`` fails for it.

    Returns:
        A list of ``(numberReads, gc)`` tuples, one per window for which
        ``getGC_content`` succeeded.
    """
    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    # The chromosome length does not change while sampling: look it up once.
    chrom_length = tbit.chroms(chromNameBit)

    for i in positions_to_sample:
        # stop if region extends over the chromosome end
        if chrom_length < i + regionSize:
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + regionSize))
        except Exception as detail:
            # Best effort: a window whose GC content cannot be computed is
            # skipped instead of aborting the whole region.
            if verbose:
                print("{}:{}-{}".format(chromNameBit, i, i + regionSize))
                print(detail)
            continue
        numberReads = bam.count(chromNameBam, i, i + regionSize)
        sub_reads_per_gc.append((numberReads, gc))

    return sub_reads_per_gc
예제 #3
0
 def __init__(self):
     """Locate the GC-correction test fixtures and publish the module globals.

     Stores the fixture paths (2bit genome, BAM file, mappability bigWig) and
     the two chromosome names on the instance, then rebinds the module-level
     ``debug`` flag and the ``global_vars`` dictionary.  The BAM and 2bit
     files are opened here only to fill in ``total_reads`` and
     ``genome_size``.
     """
     import os
     self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
     self.tbitFile = self.root + "sequence.2bit"
     self.bamFile = self.root + "test.bam"
     self.mappability = self.root + "mappability.bw"
     self.chrNameBam = '2L'
     self.chrNameBit = 'chr2L'
     bam = bamHandler.openBam(self.bamFile)
     tbit = py2bit.open(self.tbitFile)
     global debug
     debug = 0
     global global_vars
     # Every key appears exactly once: a repeated key in a dict literal is
     # silently overwritten by the later entry.
     global_vars = {'2bit': self.tbitFile,
                    'bam': self.bamFile,
                    'filter_out': None,
                    'mappability': self.mappability,
                    'extra_sampling_file': None,
                    'max_reads': 5,
                    'min_reads': 0,
                    'reads_per_bp': 0.3,
                    'total_reads': bam.mapped,
                    'genome_size': sum(tbit.chroms().values())
                    }
예제 #4
0
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """Pair read counts with GC content for sampled windows of one region.

    Windows of ``regionSize`` bases are anchored at the positions returned by
    ``getPositionsToSample`` for ``(start, end)`` with step ``stepSize``.

    Args:
        chromNameBam: chromosome name as used in the BAM file.
        start, end: bounds of the genome region passed to
            ``getPositionsToSample``.
        stepSize: sampling step passed to ``getPositionsToSample``.
        regionSize: length of each evaluated window.
        chrNameBamToBit: mapping from BAM chromosome names to 2bit names.
        verbose: when true, print the window and the error whenever
            ``getGC_content`` fails for it.

    Returns:
        A list of ``(numberReads, gc)`` tuples, one per window for which
        ``getGC_content`` succeeded.
    """
    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    # The chromosome length does not change while sampling: look it up once.
    chrom_length = tbit.chroms(chromNameBit)

    for i in positions_to_sample:
        # stop if region extends over the chromosome end
        if chrom_length < i + regionSize:
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + regionSize))
        except Exception as detail:
            # Best effort: a window whose GC content cannot be computed is
            # skipped instead of aborting the whole region.
            if verbose:
                print("{}:{}-{}".format(chromNameBit, i, i + regionSize))
                print(detail)
            continue
        numberReads = bam.count(chromNameBam, i, i + regionSize)
        sub_reads_per_gc.append((numberReads, gc))

    return sub_reads_per_gc
예제 #5
0
 def testHardMaskedBlocks(self):
     """Check hard-masked block lookups for whole chromosomes and sub-ranges."""
     handle = py2bit.open(self.fname, True)
     # (query arguments, expected blocks), evaluated in this order
     expectations = [
         (("chr1",), [(0, 50), (100, 150)]),
         (("chr1", 25, 75), [(0, 50)]),
         (("chr1", 75, 100), []),
         (("chr1", 75, 101), [(100, 150)]),
         (("chr2",), [(50, 100)]),
     ]
     for query, blocks in expectations:
         assert handle.hardMaskedBlocks(*query) == blocks
     handle.close()
예제 #6
0
    def _method_py2bit(self, *args, **kwargs):
        """Write every sequence of the 2bit file ``self.infile`` to ``self.outfile`` as FASTA.

        Chromosomes are emitted in sorted name order, one ``>name`` header
        line followed by the whole sequence on a single line.  ``args`` and
        ``kwargs`` are accepted but not used.
        """
        import py2bit

        data = py2bit.open(self.infile)
        try:
            with open(self.outfile, "w") as fout:
                for chrom in sorted(data.chroms()):
                    seq = data.sequence(chrom)
                    fout.write(">{}\n{}\n".format(chrom, seq))
        finally:
            # Release the 2bit handle even if reading or writing fails.
            data.close()
예제 #7
0
파일: test.py 프로젝트: dpryan79/py2bit
 def testInfo(self):
     """Check that ``info()`` reports exactly the expected summary fields."""
     handle = py2bit.open(self.fname, True)
     expected = {
         'file size': 161,
         'nChroms': 2,
         'sequence length': 250,
         'hard-masked length': 150,
         'soft-masked length': 8,
     }
     reported = handle.info()
     # same number of fields, and every reported field has the expected value
     assert len(expected) == len(reported)
     for field, value in reported.items():
         assert expected[field] == value
     handle.close()
예제 #8
0
파일: test.py 프로젝트: dpryan79/py2bit
 def testHardMaskedBlocks(self):
     """Check hard-masked block lookups for whole chromosomes and sub-ranges."""
     handle = py2bit.open(self.fname, True)
     # (query arguments, expected blocks), evaluated in this order
     expectations = [
         (("chr1",), [(0, 50), (100, 150)]),
         (("chr1", 25, 75), [(0, 50)]),
         (("chr1", 75, 100), []),
         (("chr1", 75, 101), [(100, 150)]),
         (("chr2",), [(50, 100)]),
     ]
     for query, blocks in expectations:
         assert handle.hardMaskedBlocks(*query) == blocks
     handle.close()
예제 #9
0
 def testChroms(self):
     """Check chromosome sizes, both as a full mapping and by single name."""
     handle = py2bit.open(self.fname, True)
     observed = handle.chroms()
     expected_sizes = {'chr1': 150, 'chr2': 100}
     for name, size in observed.items():
         assert expected_sizes[name] == size
     # a single known name gives its length, an unknown name gives None
     chr1_length = handle.chroms("chr1")
     assert chr1_length == 150
     unknown = handle.chroms("c")
     assert unknown is None
     handle.close()
예제 #10
0
파일: test.py 프로젝트: dpryan79/py2bit
 def testChroms(self):
     """Check chromosome sizes, both as a full mapping and by single name."""
     handle = py2bit.open(self.fname, True)
     observed = handle.chroms()
     expected_sizes = {'chr1': 150, 'chr2': 100}
     for name, size in observed.items():
         assert expected_sizes[name] == size
     # a single known name gives its length, an unknown name gives None
     chr1_length = handle.chroms("chr1")
     assert chr1_length == 150
     unknown = handle.chroms("c")
     assert unknown is None
     handle.close()
예제 #11
0
 def _get_seq(self, chrom, start, stop):
     if self.in_mem:
         seq = self.twobit[chrom][start:stop]
     else:
         if self.thread_safe:
             twobit = py2bit.open(self.twobit)
             seq = np.array(list(twobit.sequence(chrom, start, stop)))
             twobit.close()
         else:
             seq = np.array(list(self.twobit.sequence(chrom, start, stop)))
     return seq
예제 #12
0
 def testInfo(self):
     """Check that ``info()`` reports exactly the expected summary fields."""
     handle = py2bit.open(self.fname, True)
     expected = {'file size': 161, 'nChroms': 2, 'sequence length': 250,
                 'hard-masked length': 150, 'soft-masked length': 8}
     reported = handle.info()
     # same number of fields, and every reported field has the expected value
     assert len(expected) == len(reported)
     for field, value in reported.items():
         assert expected[field] == value
     handle.close()
예제 #13
0
 def testSequence(self):
     """Check whole-chromosome, over-long and windowed sequence retrieval on chr1."""
     handle = py2bit.open(self.fname, True)
     whole_chr1 = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATCGATCGTAGCTAGCTAGCTAGCTGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
     # no coordinates: the complete chromosome comes back
     assert handle.sequence("chr1") == whole_chr1
     # an end far beyond the chromosome still yields the complete chromosome
     assert handle.sequence("chr1", 0, 1000) == whole_chr1
     window = handle.sequence("chr1", 24, 74)
     assert window == "NNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATC"
     handle.close()
예제 #14
0
 def __init__(self,
              twobit_file,
              alpha='dna',
              one_hot=True,
              channel_last=True,
              in_mem=False,
              thread_safe=False):
     """Open ``twobit_file`` and decide how sequences will be served later.

     The chromosome names and sizes are always recorded.  After that
     ``self.twobit`` holds one of three things:

     * ``in_mem``: the result of ``self._encode_seqs`` (the handle is
       closed and ``self.thread_safe`` is forced to ``True``);
     * ``thread_safe`` without ``in_mem``: the file path itself (the
       handle is closed);
     * otherwise: the open 2bit handle.
     """
     super().__init__(alpha, one_hot, channel_last, in_mem, thread_safe)
     handle = py2bit.open(twobit_file)
     self.twobit = handle
     self._chroms = list(handle.chroms().keys())
     self._chroms_size = handle.chroms()
     if in_mem:
         encoded = self._encode_seqs(handle)
         handle.close()
         self.twobit = encoded
         self.thread_safe = True
     elif thread_safe:
         handle.close()
         self.twobit = twobit_file
예제 #15
0
def test_indels():
    """Check indel position enumeration and k-mer contexts on the test 2bit file.

    Three scenarios are covered: an insertion, the start of a deletion and
    the end of the same deletion.  Only the first value returned by
    ``utils.get_indel_contexts`` is compared.
    """
    tb = py2bit.open('tests/data/test.2bit')
    try:
        # test insertion
        insertion_region = "chr1:1012200-1012400"
        possible_pos = utils.get_possible_indel_pos(insertion_region, 106,
                                                    "C", "CCT", tb, 1)
        assert possible_pos == [(106, 'CT'), (107, 'TC'), (108, 'CT')]
        kmers = []
        for rpos, _ in possible_pos:
            context, _ = utils.get_indel_contexts(insertion_region, rpos, 2, tb)
            kmers.append(context)
        assert kmers == ['ACCT', 'CCTT', 'CTTA']

        # test deletion start
        deletion_region = "chr1:40884100-40884200"
        ref, alt = "TCG", "T"
        possible_pos = utils.get_possible_indel_pos(deletion_region, 43,
                                                    ref, alt, tb, 1)
        assert possible_pos == [(43, 'CG'), (44, 'GC'), (45, 'CG'), (46, 'GC'),
                                (47, 'CG')]
        kmers = []
        for rpos, _ in possible_pos:
            context, _ = utils.get_indel_contexts(deletion_region, rpos, 2, tb)
            kmers.append(context)
        assert kmers == ['CTCG', 'TCGC', 'CGCG', 'GCGC', 'CGCG']

        # test deletion end: shift every position by the number of deleted bases
        deleted = len(ref) - len(alt)
        kmers = []
        for rpos, _ in possible_pos:
            context, _ = utils.get_indel_contexts(deletion_region,
                                                  rpos + deleted, 2, tb)
            kmers.append(context)
        assert kmers == ['CGCG', 'GCGC', 'CGCG', 'GCGT', 'CGTT']
    finally:
        # Release the 2bit handle whether or not an assertion failed.
        tb.close()
예제 #16
0
파일: bourbon.py 프로젝트: ray2g/Misc
def main():
    """Print repeat-free, gene-free, low-N regions as a tab-separated table.

    Reads the command line, then walks the repeat masker file line by line.
    Whenever two consecutive repeats on the same chromosome are at least
    ``--minimumProof`` bases apart, the gap is passed to ``splitByGenes``;
    each returned block that is still at least ``--minimumProof`` long and
    for which ``highN`` is false is printed as ``chrom<TAB>start<TAB>end``.

    Lines starting with ``#`` are skipped.  Columns 5, 6 and 7 of the repeat
    masker file are read as chromosome, start and end.
    """
    parser = argparse.ArgumentParser(add_help=True, description="Bourbon finds contiguous regions without repeats (low peat content) of a minimum size and without genes within some distance. Output is written to the terminal. Note that this program currently ignores the ends of chromosomes.")
    parser.add_argument("rmsk", help="Repeat masker file")
    parser.add_argument("gtf", help="GTF file")
    parser.add_argument("tbit", help="2bit file")
    parser.add_argument("--minimumProof", type=int, default=15000, help="Minimum size of a repeat-free region (default %(default)s)")
    parser.add_argument("--wobble", type=int, default=5000, help="Ensure no genes are within this distance of a region of interest (default %(default)s)")
    parser.add_argument("--legalBAC", type=float, default=0.01, help="Maximum N content (default %(default)s)")
    args = parser.parse_args()

    # Produce a header
    print("Chromosome\tStart\tEnd")

    genes = GTF(args.gtf)

    # The context manager and the try/finally release both input handles
    # even when a line cannot be parsed.
    with open(args.rmsk) as rmsk:
        tb = py2bit.open(args.tbit)
        try:
            lastChrom = None
            lastEnd = 0
            for line in rmsk:
                if line.startswith("#"):
                    continue
                cols = line.strip().split()
                chrom = cols[5]
                start = int(cols[6]) - 1
                end = int(cols[7])
                # Only the gap between two consecutive repeats on the same
                # chromosome is a candidate region.
                if chrom == lastChrom and start - lastEnd >= args.minimumProof:
                    blocks = splitByGenes(chrom, lastEnd, start, genes, args.wobble)
                    for block in blocks:
                        if block[1] - block[0] < args.minimumProof:
                            continue
                        if not highN(chrom, block[0], block[1], tb, args.legalBAC):
                            print("{}\t{}\t{}".format(chrom, block[0], block[1]))
                lastChrom = chrom
                lastEnd = end
        finally:
            tb.close()
예제 #17
0
 def testBases(self):
     """Check base composition as fractions and as raw counts on chr1."""
     handle = py2bit.open(self.fname, True)

     # whole chromosome, reported as fractions
     whole = handle.bases("chr1")
     assert whole == {'A': 0.08, 'C': 0.08,
                      'T': 0.08666666666666667, 'G': 0.08666666666666667}

     # window 24-74, reported as fractions
     window_fractions = handle.bases("chr1", 24, 74)
     assert window_fractions == {'A': 0.12, 'C': 0.12, 'T': 0.12, 'G': 0.12}

     # same window with the fourth argument False: integer counts
     window_counts = handle.bases("chr1", 24, 74, False)
     assert window_counts == {'A': 6, 'C': 6, 'T': 6, 'G': 6}

     handle.close()
def openInputFiles(ctrlbwFile, expbwFile, tbFile, mappableFile):
    """Open the control/experiment bigWig files and the 2bit genome as module globals.

    Args:
        ctrlbwFile: sequence of control bigWig file names, opened relative
            to the hard-coded control directory.
        expbwFile: sequence of experiment bigWig file names, opened relative
            to the hard-coded experiment directory.
        tbFile: 2bit file name, opened relative to the hard-coded genome
            directory.
        mappableFile: stored unchanged in the global ``MAPPABLE``.

    Side effects:
        Sets the globals ``MAPPABLE``, ``fileDir``, ``CTRLBW_NUM``,
        ``EXPBW_NUM``, ``CTRLBW``, ``EXPBW``, ``CTRLBW_SUM`` and ``TB``, and
        changes the process working directory three times; it is left at the
        genome directory.  ``EXPBW_SUM`` is declared global but not assigned
        here.
    """
    global CTRLBW
    global EXPBW
    global TB

    global CTRLBW_SUM
    global EXPBW_SUM

    global CTRLBW_NUM
    global EXPBW_NUM

    global fileDir
    global MAPPABLE

    MAPPABLE = mappableFile

    dirname = ctrlbwFile[0] + "_" + expbwFile[0]
    fileDir = "/data/reddylab/YoungSook/kmer/kmer_script/modeling/modeling_onebp_DNA_RNA_ver4/" + dirname

    CTRLBW_NUM = len(ctrlbwFile)
    EXPBW_NUM = len(expbwFile)

    # The file names are relative, so each group is opened from its own
    # directory.
    os.chdir("/data/reddylab/YoungSook/kmer/kmer_input/etoh_deep/EtOH_rep")
    CTRLBW = [pyBigWig.open(ctrlbwFile[i]) for i in range(CTRLBW_NUM)]

    os.chdir("/data/reddylab/YoungSook/kmer/kmer_input/etoh_deep/EtOH_rep/onebp_model/rna")
    EXPBW = [pyBigWig.open(expbwFile[i]) for i in range(EXPBW_NUM)]

    # CALCULATE CTRLBW_SUM: only the first control file contributes.
    CTRLBW_SUM = float(CTRLBW[0].header().get('sumData'))

    os.chdir("/data/reddylab/YoungSook/ref_genome/hg38")
    TB = py2bit.open(tbFile)
예제 #19
0
 def testOpenClose(self):
     """Check that the fixture file opens to a usable handle and closes again."""
     handle = py2bit.open(self.fname, True)
     opened = handle is not None
     assert opened
     handle.close()
예제 #20
0
 def testSoftMaskedBlocks(self):
     """Check soft-masked block lookups with mask storage enabled."""
     handle = py2bit.open(self.fname, storeMasked=True)
     # the whole chromosome has a single soft-masked block
     whole_chrom = handle.softMaskedBlocks("chr1")
     assert whole_chrom == [(62, 70)]
     # the first 50 bases contain none
     leading_window = handle.softMaskedBlocks("chr1", 0, 50)
     assert leading_window == []
     handle.close()
예제 #21
0
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1,  4,  10, 10, 16, 18]
    which correspond to a GC of
    [1,  1,  1,  1,  2,  1]

    The evaluated position are
    [0,  2,  4,  6,  8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2,  1,  1,  2,  2,  1,  2,  3,  2,  1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1,  2,  3,  2,  1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]

    Parameters: 'chromNameBam' is the chromosome name used in the BAM file
    and is translated to the 2bit name through 'chrNameBamToBit';
    'fragmentLength' is a dict whose 'median' entry gives the fragment size;
    'verbose' enables progress messages.

    Returns the tuple (subN_gc, subF_gc): two integer arrays of length
    fragmentLength['median'] + 1 indexed by GC count. subN_gc counts the
    evaluated positions, subF_gc sums the forward reads starting at them.
    Positions with at least global_vars['max_reads'] reads are left out of
    subF_gc.

    Raises NameError when 'start' is bigger than 'end'.
    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')
    subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    peak = 0
    startTime = time.time()

    if verbose:
        print("[{:.3f}] computing positions to "
              "sample".format(time.time() - startTime))

    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    read_counts = []
    # Optimize IO.
    # if the sample regions are far apart from each
    # other is faster to go to each location and fetch
    # the reads found there.
    # Otherwise, if the regions to sample are close to
    # each other, is faster to load all the reads in
    # a large region into memory and consider only
    # those falling into the positions to sample.
    # The following code gets the reads
    # that are at sampling positions that lie close together
    if np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print("[{:.3f}] caching reads".format(time.time() - startTime))

        counts = np.bincount([r.pos - start_pos
                              for r in bam.fetch(chromNameBam, start_pos,
                                                 end_pos + 1)
                              if not r.is_reverse and r.pos >= start_pos],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - min(positions_to_sample)]
        if verbose:
            print("[{:.3f}] finish caching reads.".format(
                time.time() - startTime))

    countTime = time.time()

    # Pre-bind the loop variable: the verbose summary after the loop reads
    # it, and it would be unbound when there is nothing to sample.
    index = 0
    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if the end of the chromosome is reached
        if i + fragmentLength['median'] > tbit.chroms(chromNameBit):
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + fragmentLength['median']), fraction=False)
        except Exception as detail:
            # Best effort: a position whose GC content cannot be computed
            # is skipped.
            if verbose:
                print(detail)
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = len([x.pos for x in bam.fetch(chromNameBam, i, i + 1)
                             if x.is_reverse is False and x.pos == i])
        else:
            num_reads = read_counts[index]

        if num_reads >= global_vars['max_reads']:
            # positions with this many reads are counted as peaks and do
            # not contribute to the read tabulation
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose:
            if index % 50000 == 0:
                endTime = time.time()
                # Never divide by zero: the first report happens at index 0
                # and the clock may not have advanced yet.
                elapsed = max(endTime - countTime, 1e-9)
                print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                      (multiprocessing.current_process().name,
                       index, index / elapsed,
                       chromNameBit, start, end, stepSize))

    if verbose:
        endTime = time.time()
        elapsed = max(endTime - countTime, 1e-9)
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / elapsed,
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" % (multiprocessing.current_process().name,
                                                    (endTime - startTime), chromNameBit, start, end, stepSize))

    return subN_gc, subF_gc
예제 #22
0
def main(args=None):
    """Compute the GC-bias frequency table and, optionally, the bias plot.

    Parses ``args`` (``None`` is passed straight to the argument parser),
    fills the module-level ``global_vars`` dictionary read by the worker
    functions, derives the read-count thresholds from Poisson distributions,
    writes the tabulated frequencies to ``args.GCbiasFrequenciesFile`` and,
    when ``args.biasPlot`` is set, produces the plot.

    Raises:
        SystemExit: with status 1 when no fragment length is given and none
            can be estimated from the BAM file.
    """
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        # only the file name is kept; the handle opened by the parser is
        # closed here
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}

    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            # Raise directly: the ``exit`` builtin is installed by the
            # ``site`` module and is not guaranteed to exist.
            raise SystemExit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    # use poisson distribution to identify peaks that should be discarted.
    # I multiply by 4, because the real distribution of reads
    # vary depending on the gc content
    # and the global number of reads per bp may a be too low.
    # empirically, a value of at least 4 times as big as the
    # reads_per_bp was found.
    # Similarly for the min value, I divide by 4.
    global_vars['max_reads'] = \
        poisson(4 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of not use, unless the depth of sequencing is really high
    # as this value is close to 0
    global_vars['min_reads'] = \
        poisson(0.25 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).ppf(confidence_p_value)

    for key, value in global_vars.items():
        print("{}: {}".format(key, value))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        # the plot samples ten times more sparsely than the frequency table
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
# print Headings
# The column names are written to ``fout1``, which is not defined in the
# visible part of this file.
# NOTE(review): ``end='\t'`` replaces the trailing newline with a tab while
# the names themselves are joined by print's default single space - confirm
# this is intended and not a misspelt ``sep='\t'``.
# NOTE(review): "Gpc_ratio" differs in case from the "GpC_ratio" spelling in
# the commented list below - confirm which spelling downstream readers expect.
print("Chromosome",
      "Feature",
      "Start",
      "End",
      "Strand",
      "CpG_ratio",
      "Gpc_ratio",
      end='\t',
      file=fout1)
# "ApA_ratio","ApT_ratio","ApG_ratio","ApC_ratio","TpA_ratio","TpT_ratio","TpG_ratio","TpC_ratio","GpA_ratio","GpT_ratio","GpG_ratio","GpC_ratio","CpA_ratio","CpT_ratio","CpG_ratio","CpC_ratio"

# Command line: first argument is the 2bit file, second the GFF file.
bit_file = sys.argv[1]
gff_file = sys.argv[2]
# The 2bit file is opened with the second positional argument set to True;
# the handle stays open for the rest of the script.
bd = py2bit.open(bit_file, True)


def extract_exons():
    """Walk the GFF file named by the module-level ``gff_file`` line by line.

    Comment lines (starting with ``#``) are skipped; every other line is
    split on tabs and its chromosome, feature type and start coordinate are
    read.  Nothing is returned or written yet.

    NOTE(review): the function looks unfinished - the parsed fields are not
    used further.
    """
    # Must exist before the first comparison inside the loop; reading it
    # unassigned raises UnboundLocalError.
    chr_name = ''

    with open(gff_file) as fh:
        for line in fh:
            if line.startswith('#'):
                continue
            content = line.strip().split('\t')
            if chr_name == '' or chr_name != content[0]:
                # a new chromosome starts here
                chr_name = content[0]
                # NOTE(review): the original called ``len()`` without an
                # argument, which always raises TypeError.  Presumably the
                # chromosome length from the 2bit file was meant - confirm.
                chr_seq_length = bd.chroms(chr_name)
            chrom = content[0]
            feature = content[2]
            start = int(content[3])
예제 #24
0
 def testSequence(self):
     """Check one- and two-base windows taken from the start of chr1."""
     handle = py2bit.open(self.fname, True)
     # (end coordinate, expected bases) for windows starting at position 1
     for stop, expected in ((3, "NN"), (2, "N")):
         assert handle.sequence("chr1", 1, stop) == expected
     handle.close()
예제 #25
0
def writeCorrectedSam_worker(chrNameBam,
                             chrNameBit,
                             start,
                             end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")

    Parameters: 'chrNameBam' and 'chrNameBit' name the same chromosome in
    the BAM and the 2bit file; only mapped reads with a position greater
    than 'start' are processed; 'step' is not used; with
    'tag_but_not_change_number' every read is written exactly once and only
    tagged; 'verbose' enables progress messages.

    Every read gets a 'YG' tag (GC percentage, or -1 when no GC value is
    available). Reads with a GC value also get 'YC' (the correction factor,
    rounded to two decimals) and 'YN' (the number of copies).

    Returns the name of the temporary BAM file that was written.
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])

    bam = pysam.Samfile(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0
    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [
        r for r in bam.fetch(chrNameBam, start, end)
        if r.pos > start and r.flag & 4 == 0
    ]

    for r_index, read in enumerate(reads):
        # check if a mate has already been processed
        # to apply the same correction
        mate = matePairs.pop(read.qname, None)
        if mate is not None:
            copies = mate['copies']
            gc = mate['gc']
        else:
            # no mate has been recorded for this read name (first mate of
            # the pair, or the mate was removed by some filtering): compute
            # the correction from the read itself
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.

        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # Inspect the type field of the first tag. Indexing the tag
                # list itself with 2 would look at the third tag tuple (and
                # fail when the read carries fewer than three tags).
                if isinstance(readTag[0][2], int):
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength, decimals=2))
            readTag.append(('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        # remember the correction of a forward read of a proper pair so
        # that its mate receives the same one
        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies, 'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{},  processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(
                          multiprocessing.current_process().name, i,
                          i / (endTime - startTime), chrNameBit, start, end))
        i += 1

    outfile.close()
    # the inputs are no longer needed once every read has been written
    bam.close()
    tbit.close()
    if verbose:
        endTime = time.time()
        print("{},  processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name, i,
                                  i / (endTime - startTime), chrNameBit, start,
                                  end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
예제 #26
0
파일: test.py 프로젝트: dpryan79/py2bit
 def testSoftMaskedBlocks(self):
     """Check soft-masked block lookups with mask storage enabled."""
     handle = py2bit.open(self.fname, storeMasked=True)
     # the whole chromosome has a single soft-masked block
     whole_chrom = handle.softMaskedBlocks("chr1")
     assert whole_chrom == [(62, 70)]
     # the first 50 bases contain none
     leading_window = handle.softMaskedBlocks("chr1", 0, 50)
     assert leading_window == []
     handle.close()
예제 #27
0
파일: test.py 프로젝트: dpryan79/py2bit
 def testSequence(self):
     """Check one- and two-base windows taken from the start of chr1."""
     handle = py2bit.open(self.fname, True)
     # (end coordinate, expected bases) for windows starting at position 1
     for stop, expected in ((3, "NN"), (2, "N")):
         assert handle.sequence("chr1", 1, stop) == expected
     handle.close()
예제 #28
0
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    Args:
        chromNameBam: chromosome name as used in the BAM file.
        start, end: region boundaries handed to ``getPositionsToSample``;
            ``start`` must not exceed ``end``.
        stepSize: sampling step handed to ``getPositionsToSample``.
        fragmentLength: dict whose ``'median'`` entry is the length of
            every sampled window.
        chrNameBamToBit: dict mapping BAM chromosome names to the names
            used in the 2bit file.
        verbose: print progress and timing information.

    Returns:
        ``(subN_gc, subF_gc)``: two integer arrays of length
        ``fragmentLength['median'] + 1`` indexed by GC count. ``subN_gc``
        counts the sampled windows; ``subF_gc`` accumulates the forward
        reads that start at the sampled positions (positions holding
        ``global_vars['max_reads']`` reads or more are left out).

    Raises:
        NameError: if ``start`` is bigger than ``end``.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1,  4,  10, 10, 16, 18]
    which correspond to a GC of
    [1,  1,  1,  1,  2,  1]

    The evaluated position are
    [0,  2,  4,  6,  8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2,  1,  1,  2,  2,  1,  2,  3,  2,  1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1,  2,  3,  2,  1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]

    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]
    fragLen = fragmentLength['median']

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragLen + 1, dtype='int')
    subF_gc = np.zeros(fragLen + 1, dtype='int')

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    peak = 0
    startTime = time.time()

    if verbose:
        print("[{:.3f}] computing positions to "
              "sample".format(time.time() - startTime))

    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    read_counts = []
    # Optimize IO.
    # if the sample regions are far apart from each
    # other is faster to go to each location and fetch
    # the reads found there.
    # Otherwise, if the regions to sample are close to
    # each other, is faster to load all the reads in
    # a large region into memory and consider only
    # those falling into the positions to sample.
    # The following code gets the reads
    # that are at sampling positions that lie close together.
    # With fewer than two positions there is no spacing to average, so the
    # cache is skipped (the mean of an empty diff would be NaN anyway).
    if len(positions_to_sample) > 1 and \
            np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print("[{:.3f}] caching reads".format(time.time() - startTime))

        counts = np.bincount([r.pos - start_pos
                              for r in bam.fetch(chromNameBam, start_pos,
                                                 end_pos + 1)
                              if not r.is_reverse and r.pos >= start_pos],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - start_pos]
        if verbose:
            print("[{:.3f}] finish caching reads.".format(
                time.time() - startTime))

    countTime = time.time()

    # 'index' is reported in the verbose summary below; it must exist even
    # when there is nothing to sample and the loop body never runs.
    index = 0
    for index, i in enumerate(positions_to_sample):
        # stop if the end of the chromosome is reached
        if i + fragLen > tbit.chroms(chromNameBit):
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + fragLen),
                               fraction=False)
        except Exception as detail:
            if verbose:
                print(detail)
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = sum(1 for x in bam.fetch(chromNameBam, i, i + 1)
                            if x.is_reverse is False and x.pos == i)
        else:
            num_reads = read_counts[index]

        # positions piling up too many reads are treated as peaks and are
        # not added to the read tally
        if num_reads >= global_vars['max_reads']:
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose:
            if index % 50000 == 0:
                endTime = time.time()
                print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                      (multiprocessing.current_process().name,
                       index, index / (endTime - countTime),
                       chromNameBit, start, end, stepSize))

    # release the per-call file handles
    bam.close()
    tbit.close()

    if verbose:
        endTime = time.time()
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / (endTime - countTime),
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" % (multiprocessing.current_process().name,
                                                    (endTime - startTime), chromNameBit, start, end, stepSize))

    return(subN_gc, subF_gc)
예제 #29
0
파일: test.py 프로젝트: dpryan79/py2bit
 def testSequence(self):
     """Check whole-chromosome, over-long and windowed sequence retrieval."""
     full_chr1 = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATCGATCGTAGCTAGCTAGCTAGCTGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
     twobit = py2bit.open(self.fname, True)

     assert twobit.sequence("chr1") == full_chr1
     # A 0-1000 request is expected to give the same string as the
     # whole-chromosome call.
     assert twobit.sequence("chr1", 0, 1000) == full_chr1

     window = twobit.sequence("chr1", 24, 74)
     assert window == "NNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATC"

     twobit.close()
예제 #30
0
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    Args:
        chrNameBam: chromosome name as used in the BAM file.
        chrNameBit: name of the same chromosome in the 2bit file; also the
            name written to the bedgraph.
        start, end: boundaries of the region to process.
        step: width of each bedgraph bin.

    Returns:
        The name of the temporary bedgraph file, or ``None`` when no read
        contributed to the corrected coverage.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are asigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]

    bam.close()

    # unmapped reads were already dropped while building 'reads', so the
    # enumerate index is also the position of the read inside the list
    for r_index, read in enumerate(reads):
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
        except Exception as detail:
            # this exception happens when the end of a
            # chromosome is reached
            print(detail)
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        previous = reads[r_index - 1]
        if r_index > 0 and read.pos == previous.pos and \
                read.is_reverse == previous.is_reverse \
                and read.pnext == previous.pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    tbit.close()

    try:
        # 'debug' is an optional module-level flag; it may be undefined
        if debug:
            endTime = time.time()
            # the template and its continuation must be one string literal,
            # otherwise the placeholders are printed unformatted
            print("{}, processing {} ({:.1f} per sec) "
                  "reads @ {}:{}-{}".format(
                      multiprocessing.current_process().name,
                      i, i / (endTime - startTime),
                      chrNameBit, start, end))
    except NameError:
        pass

    if i == 0:
        return None

    with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
        # save in bedgraph format
        for binStart in range(0, len(cvg_corr), step):
            # slicing clamps to the array length, so the last bin is simply
            # shorter; no genomic coordinate is needed as a bound here
            value = np.mean(cvg_corr[binStart:binStart + step])
            if value > 0:
                writeStart = start + binStart
                writeEnd = min(start + binStart + step, end)
                _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                    writeEnd, value))
        tempFileName = _file.name

    return tempFileName
예제 #31
0
def main(args=None):
    """
    Run the main program.

    This function is executed when you type `kmer_counter` or `python -m kmer_counter`.

    Arguments:
        args: Arguments passed from the command line.

    Returns:
        An exit code.

    Raises:
        Exception: for the ``background`` command when not exactly one of
            ``--radius`` / ``--before_after`` is given, or when the
            ``--bed`` / ``--all_autosomes`` options are missing, combined,
            or the bed file name has an unsupported suffix.
    """
    parser = argparse.ArgumentParser(prog='kmer_counter',
                                     description='''
        Count k-mers at SNVs, indel breakpoints or in genomic regions
    ''')
    subparsers = parser.add_subparsers(
        dest='command',
        help='command. Must choose what kind of file you want to count.')
    parser.add_argument('-V', '--version', action='store_true')

    snv_parser = subparsers.add_parser('snv',
                                       description='''Count k-mers at SNVs.
        The k-mer will be centered around the mutation base and if the reference base is G or T then the reverse_complement of the reference k-mer will be counted instead.
        ''')
    snv_parser.add_argument('ref_genome',
                            help='Reference genome in 2bit format',
                            type=str)
    snv_parser.add_argument(
        'mutations',
        type=argparse.FileType('r'),
        help=
        'A vcf-like file with SNVs. First four columns should be: Chrom, pos, ref, alt. '
        'Other columns are ignored. Non-SNV variants are ignored.')
    snv_parser.add_argument(
        '-r',
        '--radius',
        type=int,
        metavar='R',
        default=1,
        help='Count the k-mer from R bases before a position to R bases '
        'after a position. For each position in the inputfile.')

    indel_parser = subparsers.add_parser('indel',
                                         description='Count k-mers at indels.')
    indel_parser.add_argument('ref_genome',
                              help='Reference genome in 2bit format',
                              type=str)
    indel_parser.add_argument(
        'mutations',
        type=argparse.FileType('r'),
        help=
        'A sorted vcf-like file with indels. First four columns should be: Chrom, pos, ref, alt. '
        'Other columns are ignored. Non-indel variants are ignored. Indels should be left-aligned.'
    )
    indel_parser.add_argument(
        'type',
        choices=['ins', 'del_start', 'del_end', 'all', 'del'],
        help='What type of indel breakpoint do you want to count?')
    indel_parser.add_argument(
        '-r',
        '--radius',
        default=1,
        type=int,
        help=
        'How many base pairs before indel_start_point or after indel_end_point should be included '
        'as context annotation.')
    indel_parser.add_argument(
        '--sample',
        action="store_true",
        help=
        'Randomly choose one of the possible positions instead of counting the expected (non integer) count for each possible position of an ambigously aligned indel.'
    )
    #indel_parser.add_argument('-v', '--verbose', action='store_true')

    bg_parser = subparsers.add_parser(
        'background', description='Count kmers in (regions of) a genome')
    bg_parser.add_argument(
        'ref_genome',
        type=str,
        help='Reference genome in 2bit format',
    )
    bg_parser.add_argument(
        '--bed',
        type=str,
        help=
        'bed-file describing regions that should be counted. May be gzipped.')
    #bg_parser.add_argument('--wig', type=str,
    #    help='wig-file describing regions that should be counted. May be gzipped. '
    #        'The context at a position will be weigthed by the value from the '
    #        'wig-file at that position. The output counts will thus be floats '
    #        'and not integers')
    bg_parser.add_argument('--all_autosomes',
                           action="store_true",
                           help='All parts of the autosomes will be counted')
    bg_parser.add_argument(
        '-r',
        '--radius',
        type=int,
        metavar='R',
        help='Count the k-mer from R bases before a position to R bases '
        'after a position. For each position in the inputfile.')
    bg_parser.add_argument(
        '--before_after',
        type=int,
        nargs=2,
        metavar=('X', 'Y'),
        help=
        'count the k-mer from X bases before a position to Y bases after a position. '
        'For each position in the inputfile.')
    bg_parser.add_argument(
        '--reverse_complement_method',
        type=str,
        choices=['none', 'middle', 'lexicographic', 'both'],
        help=
        '"none" means that alle k-mers are counted unchanged. "middle" means that the reverse complement of a k-mer is counted if the middle position is not a "A" or "C". "lexicographic" means that the reverse_complement is counted if it has a smaller lexicographic order. Default is "middle" if --radius option is used and "lexicographic" if --before_after is used.'
    )

    args = parser.parse_args(args)

    if args.version:
        from kmer_counter import __version__
        print("version:", __version__)
        print()
        return 0

    if args.command not in ['snv', 'indel', 'background']:
        print('Error: must specify command.')
        print()
        parser.print_help()
        return 0

    if 'ref_genome' not in args:
        print('Error: ref_genome (as 2bit file) must be specified')
        print()
        parser.print_help()
        return 0

    if args.command == 'background':
        # Exactly one of --radius / --before_after must be given. Each
        # `is None` test needs its own parentheses: unparenthesised,
        # `a is None == b is None` is a chained comparison that is only true
        # when both are None, so passing both options went undetected.
        # Checked before opening the genome so bad options fail fast.
        if (args.radius is None) == (args.before_after is None):
            raise Exception(
                'Either the --radius or the --before_after option should be used (not both).'
            )

    tb = py2bit.open(args.ref_genome)

    if args.command == 'indel':
        kmer_count = count_indels(args.mutations, tb, args.type, args.radius,
                                  args.sample)
    elif args.command == 'snv':
        dreader = PosReader(args.mutations, tb)
        kmer_count = count_non_indels(tb, dreader, args.radius, args.radius,
                                      'middle')
    elif args.command == 'background':
        if args.radius is not None:
            assert args.radius > 0
            before = args.radius
            after = args.radius
            if args.reverse_complement_method is None:
                args.reverse_complement_method = "middle"
        else:
            before, after = args.before_after
            assert before >= 0
            assert after >= 0
            if args.reverse_complement_method is None:
                args.reverse_complement_method = "none"

        if args.bed is not None:
            if args.all_autosomes:
                raise Exception(
                    'Either --bed or--all_autosomes option should be used. Not both.'
                )
            if args.bed.endswith('.bed.gz'):
                dreader = BedReader(gzip.open(args.bed, 'rt'))
            elif args.bed.endswith('.bed'):
                dreader = BedReader(open(args.bed))
            elif args.bed == '-':
                # a lone dash means: read the regions from standard input
                dreader = BedReader(sys.stdin)
            else:
                raise Exception('bed file should end with ".bed" or ".bed.gz"')
        elif args.all_autosomes:
            dreader = AllAutoReader(tb)
        else:
            raise Exception(
                'Either --bed or--all_autosomes option should be used')
        kmer_count = count_non_indels(tb, dreader, before, after,
                                      args.reverse_complement_method)

    for x in kmer_count:
        print(x, kmer_count[x])

    return 0
예제 #32
0
def main(args=None):
    """Tabulate the GC-bias frequencies of a BAM file and save them.

    Parses the command line, fills the module-level ``global_vars``
    dictionary read by the worker functions, samples the genome through
    ``tabulateGCcontent``, writes the result to
    ``args.GCbiasFrequenciesFile`` and, when ``args.biasPlot`` is set, also
    draws the bias plot.

    Args:
        args: argument list handed to the parser (``None`` lets the parser
            pick its default source).
    """
    global global_vars

    args = parse_arguments().parse_args(args)

    # only the file name is needed; the handle is closed right away
    extra_sampling_file = None
    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()

    global_vars = {
        '2bit': args.genome,
        'bam': args.bamfile,
        'filter_out': args.blackListFileName,
        'extra_sampling_file': extra_sampling_file,
    }

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = {'median': args.fragmentLength}
    else:
        # no length given: estimate it from the BAM file
        estimated, __ = get_read_and_fragment_length(
            args.bamfile, None,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if not estimated:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)

        fragment_len_dict = {'median': int(estimated['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = bam.mapped
    reads_per_bp = float(global_vars['total_reads']) / args.effectiveGenomeSize
    global_vars['reads_per_bp'] = reads_per_bp

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of (name, length) tuples
    chromSizes = list(zip(bam.references, bam.lengths))

    median_len = fragment_len_dict['median']

    # A poisson distribution is used to identify peaks that should be
    # discarded. The rate is multiplied by 4 because the real distribution
    # of reads varies with the gc content and the global number of reads
    # per bp may be too low; empirically, a value of at least 4 times the
    # reads_per_bp was found. Similarly, the min value divides by 4.
    upper_tail = poisson(4 * reads_per_bp * median_len)
    global_vars['max_reads'] = upper_tail.isf(confidence_p_value)
    # this may be of no use unless the depth of sequencing is really high,
    # as this value is close to 0
    lower_tail = poisson(0.25 * reads_per_bp * median_len)
    global_vars['min_reads'] = lower_tail.ppf(confidence_p_value)

    for key, value in global_vars.items():
        print("{}: {}".format(key, value))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if not args.biasPlot:
        return

    reads_per_gc = countReadsPerGC(args.regionSize,
                                   chrNameBitToBam, stepSize * 10,
                                   chromSizes,
                                   numberOfProcessors=args.numberOfProcessors,
                                   verbose=args.verbose,
                                   region=args.region)
    plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
예제 #33
0
def main(args=None):
    """Apply a GC-bias correction and write a corrected BAM, bedGraph or bigWig.

    The frequencies stored in ``args.GCbiasFrequenciesFile`` are loaded into
    the module-level ``F_gc``, ``N_gc`` and ``R_gc`` arrays (columns 0, 1
    and 2), ``global_vars`` is filled for the worker functions, the genome
    is split into chunks, and the per-chunk results are merged into
    ``args.correctedFile``. The output kind is chosen from the suffix of
    that file name (``bam``, ``bg`` or ``bw``).

    Args:
        args: argument list handed to ``process_args``.
    """
    global F_gc, N_gc, R_gc
    global global_vars

    args = process_args(args)

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [
        binom.isf(1e-7, F_gc[x], 1.0 /
                  N_gc[x]) if F_gc[x] > 0 and N_gc[x] > 0 else 1
        for x in range(len(F_gc))
    ]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()),
                                       bam.references)
    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    print(chrNameBitToBam, chrNameBamToBit)
    for chrom, size in chromSizes:
        # report a chromosome missing from the 2bit file once, not once
        # per chunk, and with the chromosome name actually filled in
        if chrom not in chrNameBamToBit:
            print("no sequence information for "
                  "chromosome {} in 2bit file".format(chrom))
            print("Reads in this chromosome will be skipped")
            continue
        for i in range(regionStart, size, chunkSize):
            length = min(size, i + chunkSize)
            mp_args.append(
                (chrom, chrNameBamToBit[chrom], i, length, bedGraphStep))

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))

            res = pool.map_async(writeCorrectedSam_wrapper,
                                 mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):

        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        if len(mp_args) > 1 and args.numberOfProcessors > 1:

            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        # concatenate intermediary bedgraph files.
        # Both ends are opened in binary mode: copying bytes into a
        # text-mode file raises TypeError on Python 3.
        with open(_temp_bg_file_name, 'wb') as _temp_bg_file:
            for tempFileName in res:
                # workers return None for chunks without any usable read
                if tempFileName:
                    with open(tempFileName, 'rb') as chunk:
                        shutil.copyfileobj(chunk, _temp_bg_file)
                    os.remove(tempFileName)
        args.correctedFile.close()

        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)

        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            # os.remove needs the path, not the (already closed) file object
            os.remove(_temp_bg_file_name)

    # all results were collected above; release the worker processes
    pool.close()
    pool.join()
예제 #34
0
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**

    Every processed read receives a ``YG`` tag (GC percentage of its
    fragment, ``-1`` when no GC value is available) and, when a GC value is
    available, a ``YC`` tag (the correction ``1 / R_gc[gc]``) and a ``YN``
    tag (number of copies to write). With ``tag_but_not_change_number`` the
    reads are only tagged and each is written exactly once.

    Returns the name of the temporary BAM file that was written.

    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])

    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0

    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    # the list above already excludes unmapped reads and reads starting at
    # or before 'start', so the enumerate index is the position in 'reads'
    for r_index, read in enumerate(reads):
        copies = None
        gc = None

        # check if a mate has already been procesed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del(matePairs[read.qname])
        except KeyError:
            # this exception happens when a mate is
            # not present. This could
            # happen because of removal of the mate
            # by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength,
                                  chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1
        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.

        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                # inspect the type field of the first tag; indexing the tag
                # list itself (readTag[2]) looked at a whole tuple and
                # failed on reads carrying fewer than three tags
                if isinstance(readTag[0][2], int):
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength,
                                    decimals=2))
            readTag.append(
                ('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        # remember the decision taken for a forward, properly paired read
        # so that its mate gets the same number of copies
        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies,
                                   'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{},  processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                          i, i / (endTime - startTime),
                                          chrNameBit, start, end))
        i += 1

    outfile.close()
    # release the input handles opened for this chunk
    bam.close()
    tbit.close()
    if verbose:
        endTime = time.time()
        print("{},  processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
예제 #35
0
# 1. takes bed file and 2bit file as input
# 2. based on bed file locations extract the underlying sequence and write into a fasta format
# 3. fasta output with name from bed file (column 4) if not provided use location as name

import subprocess
import py2bit  # pip install git+https://github.com/dpryan79/py2bit
import os

tb_input = "/data/repository/organisms/dm6_ensembl/genome_fasta/genome.2bit"
tb = py2bit.open(tb_input)

print "2bit file: %s" % tb_input

print "name of bed file:"
bed_input = raw_input("> ")
bed_input = os.path.abspath(bed_input)
bed = open(bed_input, 'r')

print "name of the output file:"
output_name = raw_input("> ")
output_name = output_name + ".fa"

#check whether there is a column 4 in bed file for names
line = bed.next().strip()
colnum = len(line.split("\t"))

if colnum >= 4:
    namecol = True
else:
    namecol = False
bed.close()
예제 #36
0
파일: test.py 프로젝트: dpryan79/py2bit
 def testBases(self):
     """Check the values returned by ``bases()`` for chr1.

     Three calls are compared against fixed expectations: the whole
     chromosome, the 24-74 window, and the same window with the fourth
     argument set to False (integer values are expected in that case).
     """
     expected_whole = {'A': 0.08, 'C': 0.08, 'T': 0.08666666666666667, 'G': 0.08666666666666667}
     expected_window = {'A': 0.12, 'C': 0.12, 'T': 0.12, 'G': 0.12}
     expected_window_ints = {'A': 6, 'C': 6, 'T': 6, 'G': 6}

     handle = py2bit.open(self.fname, True)
     assert handle.bases("chr1") == expected_whole
     assert handle.bases("chr1", 24, 74) == expected_window
     assert handle.bases("chr1", 24, 74, False) == expected_window_ints
     handle.close()
예제 #37
0
def main(args=None):
    """Apply the GC-bias correction and write the corrected output file.

    The arguments are parsed by ``process_args``. The three columns of
    ``args.GCbiasFrequenciesFile`` are loaded into the module globals
    ``F_gc``, ``N_gc`` and ``R_gc`` and the module global ``global_vars``
    is rebuilt from the parsed arguments. The genome is then split into
    chunks that are processed by ``writeCorrectedSam_wrapper`` (output name
    ending in ``bam``) or ``writeCorrected_wrapper`` (output name ending in
    ``bg`` or ``bw``). The per-chunk temporary files are finally merged into
    ``args.correctedFile``.

    :param args: passed unchanged to ``process_args``.
    """
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = list(zip(bam.references, bam.lengths))

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
    chrNameBamToBit = {v: k for k, v in chrNameBitToBam.items()}
    print(chrNameBitToBam, chrNameBamToBit)

    # One task per chunk: (BAM chrom name, 2bit chrom name, start, end, step)
    mp_args = []
    for chrom, size in chromSizes:
        # The lookup does not depend on the chunk, so it is done (and the
        # warning is printed) once per chromosome instead of once per chunk.
        if chrom not in chrNameBamToBit:
            # The two literals are joined inside the call so that the
            # chromosome name is actually part of the printed message.
            print("no sequence information for "
                  "chromosome {} in 2bit file".format(chrom))
            print("Reads in this chromosome will be skipped")
            continue
        chromNameBit = chrNameBamToBit[chrom]
        for i in range(regionStart, size, chunkSize):
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chromNameBit, i, length, bedGraphStep))

    pool = multiprocessing.Pool(args.numberOfProcessors)
    # Only fan out to the pool when there is more than one task and more
    # than one processor; otherwise the tasks run in this process.
    useParallel = len(mp_args) > 1 and args.numberOfProcessors > 1

    if args.correctedFile.name.endswith('bam'):
        if useParallel:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors,
                                            len(mp_args))))

            res = pool.map_async(
                writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            # Copy without going through a shell so that file names that
            # contain spaces or shell metacharacters are handled correctly.
            shutil.copyfile(res[0], args.correctedFile.name)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith(('bg', 'bw')):

        if useParallel:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            with open(oname, 'wb') as f:
                for tempFileName in res:
                    # falsy entries are chunks for which no file was written
                    if tempFileName:
                        with open(tempFileName, 'rb') as chunk:
                            shutil.copyfileobj(chunk, f)
                        os.remove(tempFileName)
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            # NOTE(review): `res` may still contain falsy entries here;
            # confirm that bedGraphToBigWig tolerates them.
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
import random
import py2bit
import sys
import os

# Header row of the tab-separated report that is written to stdout.
print('Chrom', 'Feature', 'start', 'end', 'CpG_ratio', 'GpC_ratio', sep='\t')
# Path of the 2bit file, taken from the first command-line argument.
bit_file = sys.argv[1]
# Open the file named on the command line. The variable must be passed here:
# the quoted literal 'bit_file' would ignore the argument and look for a
# file that is literally called "bit_file".
tb = py2bit.open(bit_file, True)
tb_dict = tb.chroms()
chrom = list(tb_dict.keys())  # chromosome names
len_chrom = list(tb_dict.values())  # chromosome lengths
count_entry = 0

while count_entry < 10000:
    j = random.choice(chrom)
    random_start = random.randrange(1, tb_dict[j])
    random_end = random_start + 1000
    if random_end > int(tb_dict[j]):
        continue
    else:
        perseq = tb.sequence(j, random_start, random_end)
        C = int(perseq.upper().count('C'))
        G = int(perseq.upper().count('G'))
        CG = int(perseq.upper().count('CG'))
        GC = int(perseq.upper().count('GC'))
        if GC < 1 or CG < 1 or C < 1 or G < 1:
            continue
        CpG_ratio = round((CG / 1000) / ((C / 1000) * (G / 1000)), 4)
        GpC_ratio = round((GC / 1000) / ((C / 1000) * (G / 1000)), 4)
        print(j,
              'random seq',
예제 #39
0
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of
    a region from the genome

    :param chrNameBam: chromosome name used to fetch reads from the BAM file
    :param chrNameBit: chromosome name used for the 2bit lookup and written
                       in the first column of the output
    :param start: start of the region
    :param end: end of the region
    :param step: number of positions averaged into one bedgraph line
    :return: name of the temporary bedgraph file, or None when no read
             contributed to the coverage

    The file names are read from the module global ``global_vars`` and the
    correction factors from the module global ``R_gc``.

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    # one corrected coverage value per position of the region
    cvg_corr = np.zeros(end - start)

    # number of reads that were added to the coverage
    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are assigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end) if r.flag & 4 == 0]

    bam.close()
    for r_index, read in enumerate(reads):
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
        except Exception as detail:
            # this exception happens when the end of a
            # chromosome is reached
            print(detail)
            continue
        # NOTE(review): besides None this also skips gc == 0 -- confirm
        # that this is intended.
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        previous = reads[r_index - 1] if r_index > 0 else None
        if previous is not None and read.pos == previous.pos and \
                read.is_reverse == previous.is_reverse \
                and read.pnext == previous.pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(
                read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += 1.0 / R_gc[gc]
        i += 1

    # the sequence file is only needed while the reads are processed
    tbit.close()

    if debug:
        endTime = time.time()
        # Both literals belong to one format string; they are joined inside
        # the print call so that the formatted message is what gets printed.
        print("{}, processing {} ({:.1f} per sec) "
              "reads @ {}:{}-{}".format(multiprocessing.current_process().name, i,
                                        i / (endTime - startTime), chrNameBit, start,
                                        end))

    if i == 0:
        return None

    # save in bedgraph format
    with open(utilities.getTempFileName(suffix='.bg'), 'w') as _file:
        tempFileName = _file.name
        for offset in range(0, len(cvg_corr), step):
            # `offset` indexes cvg_corr (relative to `start`); the slice is
            # clipped at the end of the array for the last, shorter bin.
            value = np.mean(cvg_corr[offset:offset + step])
            if value > 0:
                writeStart = start + offset
                writeEnd = min(start + offset + step, end)
                _file.write("%s\t%d\t%d\t%.1f\n" %
                            (chrNameBit, writeStart, writeEnd, value))

    return tempFileName
예제 #40
0
파일: test.py 프로젝트: dpryan79/py2bit
 def testOpenClose(self):
     """Open the file at ``self.fname``, check that an object is returned
     and close it again."""
     handle = py2bit.open(self.fname, True)
     assert handle is not None
     handle.close()
def _logPropagationProgress(propagated, batches, processed):
    """Log the three running counters, each prefixed with the current time."""
    logging.info("{0} Total variants propagated {1}".format(
        time.asctime(), propagated))
    logging.info("{0} Total batches propagated {1}".format(
        time.asctime(), batches))
    logging.info("{0} Total variants processed {1}".format(
        time.asctime(), processed))


def propagateVariantsAndGetScores(snp_info_file, overlap_info_file, model_path,
                                  is_classifier, batch_size, genome_path, left,
                                  right):
    """Propagate the reference and alternate sequence of each variant
    through the model, in batches.

    The two input files are read in lockstep: the first line of
    `snp_info_file` is skipped as a header, and every following line is
    paired with the next line of `overlap_info_file`. Both are split on
    tabs. For variants flagged as overlapping, the reference and alternate
    sequences are built with `getSequence`, one-hot encoded and collected;
    whenever `batch_size` encoded pairs are available they are handed to
    `onBatchEnd`. A trailing incomplete batch is handed over at the end. If
    the trailing variants are all non-overlapping, one "nan nan nan" line
    per variant is printed instead.

    :param snp_info_file: path of the variant table; fields 4 and 5
                          (0-based) are used as ref and alt allele
    :param overlap_info_file: path of the overlap table; fields 0, 1 and 4
                              (0-based) are used as chromosome, start and
                              overlap flag
    :param model_path: path given to `load_model`
    :param is_classifier: forwarded to `onBatchEnd`
    :param batch_size: number of overlapping variants per batch
    :param genome_path: path of the 2bit genome opened with py2bit
    :param left: forwarded to `getSequence`
    :param right: forwarded to `getSequence`
    """
    genome_object = py2bit.open(genome_path)
    # try/finally: the genome file is closed even if reading, encoding or
    # the model raises.
    try:
        model = load_model(model_path, compile=False)
        curr_batch_ref_sequences = []
        curr_batch_alt_sequences = []
        curr_batch_is_overlapping = []
        variants_processed_overall = 0
        variants_propagated_overall = 0
        batches_propagated_overall = 0

        with open(snp_info_file, 'r') as sf, open(overlap_info_file, 'r') as of:
            # skip the header line of the variant table
            sf.readline()
            for sf_line in sf:
                of_data = of.readline().strip().split("\t")
                sf_data = sf_line.strip().split("\t")
                chrom = of_data[0]
                snp_ref_start = int(of_data[1])
                ref = sf_data[4]
                alt = sf_data[5]
                is_overlapping = int(of_data[4])

                if is_overlapping:
                    ref_sequence = getSequence(ref, chrom, snp_ref_start, left,
                                               right, genome_object)
                    alt_sequence = getSequence(alt, chrom, snp_ref_start, left,
                                               right, genome_object)
                    curr_batch_ref_sequences.append(
                        oneHotEncodeSequence(ref_sequence))
                    curr_batch_alt_sequences.append(
                        oneHotEncodeSequence(alt_sequence))
                    curr_batch_is_overlapping.append(True)
                else:
                    curr_batch_is_overlapping.append(False)

                # Count the variant before the batch is reported so that the
                # logged "processed" total includes the current variant.
                variants_processed_overall += 1

                if len(curr_batch_ref_sequences) == batch_size:
                    onBatchEnd(
                        model,
                        curr_batch_ref_sequences,
                        curr_batch_alt_sequences,
                        is_classifier,
                        curr_batch_is_overlapping,
                    )
                    variants_propagated_overall += batch_size
                    batches_propagated_overall += 1

                    curr_batch_ref_sequences = []
                    curr_batch_alt_sequences = []
                    curr_batch_is_overlapping = []
                    _logPropagationProgress(variants_propagated_overall,
                                            batches_propagated_overall,
                                            variants_processed_overall)

        # processing for incomplete batches at the end
        if curr_batch_is_overlapping:
            if curr_batch_ref_sequences:
                assert (len(curr_batch_ref_sequences) < batch_size)
                onBatchEnd(model, curr_batch_ref_sequences,
                           curr_batch_alt_sequences, is_classifier,
                           curr_batch_is_overlapping)
                variants_propagated_overall += len(curr_batch_ref_sequences)
                batches_propagated_overall += 1

            # remaining NA values need to be printed if last batch is full
            # of non-overlapping regions
            else:
                for val in curr_batch_is_overlapping:
                    assert (not val)
                    print("\t".join(["nan", "nan", "nan"]))
            _logPropagationProgress(variants_propagated_overall,
                                    batches_propagated_overall,
                                    variants_processed_overall)
    finally:
        genome_object.close()