def __init__(self):
    import os
    self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/"
    self.tbitFile = self.root + "sequence.2bit"
    self.bamFile = self.root + "test.bam"
    self.mappability = self.root + "mappability.bw"
    self.chrNameBam = '2L'
    self.chrNameBit = 'chr2L'
    bam = bamHandler.openBam(self.bamFile)
    tbit = py2bit.open(self.tbitFile)
    global debug
    debug = 0
    global global_vars
    global_vars = {'2bit': self.tbitFile,
                   'bam': self.bamFile,
                   'filter_out': None,
                   'mappability': self.mappability,
                   'extra_sampling_file': None,
                   'max_reads': 5,
                   'min_reads': 0,
                   'reads_per_bp': 0.3,
                   'total_reads': bam.mapped,
                   'genome_size': sum(tbit.chroms().values())}
def countReadsPerGC_worker(chromNameBam, start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """given a genome region defined by (start, end), the GC content is
    quantified for contiguous regions of size regionSize
    """
    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    c = 1
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit, start, end, stepSize)

    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if the region extends over the chromosome end
        if tbit.chroms(chromNameBit) < i + regionSize:
            break
        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(i + regionSize))
        except Exception as detail:
            if verbose:
                print("{}:{}-{}".format(chromNameBit, i, i + regionSize))
                print(detail)
            continue
        numberReads = bam.count(chromNameBam, i, i + regionSize)
        sub_reads_per_gc.append((numberReads, gc))
        c += 1

    return sub_reads_per_gc
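# A minimal usage sketch of countReadsPerGC_worker, assuming global_vars has
# been populated as in the Tester-style __init__ above and that the BAM/2bit
# pair uses the '2L' -> 'chr2L' naming shown there. Coordinates and the
# step/region sizes are illustrative only.
chrNameBamToBit = {'2L': 'chr2L'}
# sample 200 bp windows every 50 bp over the first 1 kb of 2L
reads_per_gc = countReadsPerGC_worker('2L', 0, 1000, 50, 200, chrNameBamToBit)
for n_reads, gc in reads_per_gc:
    print(n_reads, gc)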
def testHardMaskedBlocks(self):
    tb = py2bit.open(self.fname, True)
    assert tb.hardMaskedBlocks("chr1") == [(0, 50), (100, 150)]
    assert tb.hardMaskedBlocks("chr1", 25, 75) == [(0, 50)]
    assert tb.hardMaskedBlocks("chr1", 75, 100) == []
    assert tb.hardMaskedBlocks("chr1", 75, 101) == [(100, 150)]
    assert tb.hardMaskedBlocks("chr2") == [(50, 100)]
    tb.close()
def _method_py2bit(self, *args, **kwargs):
    import py2bit
    data = py2bit.open(self.infile)
    with open(self.outfile, "w") as fout:
        for chrom in sorted(data.chroms()):
            seq = data.sequence(chrom)
            fout.write(">{}\n{}\n".format(chrom, seq))
def testInfo(self):
    tb = py2bit.open(self.fname, True)
    correct = {'file size': 161,
               'nChroms': 2,
               'sequence length': 250,
               'hard-masked length': 150,
               'soft-masked length': 8}
    check = tb.info()
    assert len(correct) == len(check)
    for k, v in check.items():
        assert correct[k] == v
    tb.close()
def testChroms(self):
    tb = py2bit.open(self.fname, True)
    chroms = tb.chroms()
    correct = {'chr1': 150, 'chr2': 100}
    for k, v in chroms.items():
        assert correct[k] == v
    assert tb.chroms("chr1") == 150
    assert tb.chroms("c") is None
    tb.close()
def _get_seq(self, chrom, start, stop):
    if self.in_mem:
        seq = self.twobit[chrom][start:stop]
    else:
        if self.thread_safe:
            # open a fresh handle per call so concurrent workers never
            # share a single py2bit file handle
            twobit = py2bit.open(self.twobit)
            seq = np.array(list(twobit.sequence(chrom, start, stop)))
            twobit.close()
        else:
            seq = np.array(list(self.twobit.sequence(chrom, start, stop)))
    return seq
def testSequence(self):
    tb = py2bit.open(self.fname, True)
    assert tb.sequence("chr1") == "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATCGATCGTAGCTAGCTAGCTAGCTGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
    assert tb.sequence("chr1", 0, 1000) == "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATCGATCGTAGCTAGCTAGCTAGCTGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
    assert tb.sequence("chr1", 24, 74) == "NNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATC"
    tb.close()
def __init__(self, twobit_file, alpha='dna', one_hot=True,
             channel_last=True, in_mem=False, thread_safe=False):
    super().__init__(alpha, one_hot, channel_last, in_mem, thread_safe)
    self.twobit = py2bit.open(twobit_file)
    self._chroms = list(self.twobit.chroms().keys())
    self._chroms_size = self.twobit.chroms()
    if in_mem:
        # load the whole genome as one-hot arrays and drop the file handle
        twobit_onehot_dict = self._encode_seqs(self.twobit)
        self.twobit.close()
        self.twobit = twobit_onehot_dict
        self.thread_safe = True
    else:
        if thread_safe:
            # keep only the path; each access opens its own handle
            self.twobit.close()
            self.twobit = twobit_file
def test_indels():
    tb = py2bit.open('tests/data/test.2bit')

    # test insertion
    possible_pos = utils.get_possible_indel_pos("chr1:1012200-1012400", 106, "C", "CCT", tb, 1)
    assert possible_pos == [(106, 'CT'), (107, 'TC'), (108, 'CT')]
    kmers = []
    for rpos, ralt in possible_pos:
        c1, c2 = utils.get_indel_contexts("chr1:1012200-1012400", rpos, 2, tb)
        kmers.append(c1)
    assert kmers == ['ACCT', 'CCTT', 'CTTA']

    # test deletion start
    possible_pos = utils.get_possible_indel_pos("chr1:40884100-40884200", 43, "TCG", "T", tb, 1)
    assert possible_pos == [(43, 'CG'), (44, 'GC'), (45, 'CG'), (46, 'GC'), (47, 'CG')]
    kmers = []
    for rpos, ralt in possible_pos:
        c1, c2 = utils.get_indel_contexts("chr1:40884100-40884200", rpos, 2, tb)
        kmers.append(c1)
    assert kmers == ['CTCG', 'TCGC', 'CGCG', 'GCGC', 'CGCG']

    # test deletion end
    kmers = []
    for rpos, ralt in possible_pos:
        c1, c2 = utils.get_indel_contexts("chr1:40884100-40884200",
                                          rpos + (len("TCG") - len("T")), 2, tb)
        kmers.append(c1)
    assert kmers == ['CGCG', 'GCGC', 'CGCG', 'GCGT', 'CGTT']
def main():
    parser = argparse.ArgumentParser(
        add_help=True,
        description="Bourbon finds contiguous regions without repeats (low repeat content) "
                    "of a minimum size and without genes within some distance. Output is "
                    "written to the terminal. Note that this program currently ignores the "
                    "ends of chromosomes.")
    parser.add_argument("rmsk", help="Repeat masker file")
    parser.add_argument("gtf", help="GTF file")
    parser.add_argument("tbit", help="2bit file")
    parser.add_argument("--minimumProof", type=int, default=15000,
                        help="Minimum size of a repeat-free region (default %(default)s)")
    parser.add_argument("--wobble", type=int, default=5000,
                        help="Ensure no genes are within this distance of a region of interest (default %(default)s)")
    parser.add_argument("--legalBAC", type=float, default=0.01,
                        help="Maximum N content (default %(default)s)")
    args = parser.parse_args()

    # Produce a header
    print("Chromosome\tStart\tEnd")

    genes = GTF(args.gtf)
    rmsk = open(args.rmsk)
    tb = py2bit.open(args.tbit)

    lastChrom = None
    lastEnd = 0
    for line in rmsk:
        if line.startswith("#"):
            continue
        cols = line.strip().split()
        chrom = cols[5]
        start = int(cols[6]) - 1
        end = int(cols[7])
        if chrom == lastChrom:
            if start - lastEnd >= args.minimumProof:
                ROIstart = lastEnd
                ROIend = start
                blocks = splitByGenes(chrom, ROIstart, ROIend, genes, args.wobble)
                for block in blocks:
                    if block[1] - block[0] < args.minimumProof:
                        continue
                    if not highN(chrom, block[0], block[1], tb, args.legalBAC):
                        print("{}\t{}\t{}".format(chrom, block[0], block[1]))
        lastChrom = chrom
        lastEnd = end
    rmsk.close()
    tb.close()
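# highN() is referenced above but not defined in this snippet. A minimal
# sketch of what it might look like, assuming py2bit's bases() fractions
# (A/C/G/T fractions sum to less than 1 when Ns are present):
def highN(chrom, start, end, tb, legalBAC):
    """Return True if the N fraction of chrom:start-end exceeds legalBAC."""
    fracs = tb.bases(chrom, start, end)  # dict of A/C/G/T fractions
    n_frac = 1.0 - sum(fracs.values())
    return n_frac > legalBAC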
def testBases(self):
    tb = py2bit.open(self.fname, True)
    assert tb.bases("chr1") == {'A': 0.08, 'C': 0.08,
                                'T': 0.08666666666666667,
                                'G': 0.08666666666666667}
    assert tb.bases("chr1", 24, 74) == {'A': 0.12, 'C': 0.12, 'T': 0.12, 'G': 0.12}
    assert tb.bases("chr1", 24, 74, False) == {'A': 6, 'C': 6, 'T': 6, 'G': 6}
    tb.close()
def openInputFiles(ctrlbwFile, expbwFile, tbFile, mappableFile):
    global CTRLBW
    global EXPBW
    global TB
    global CTRLBW_SUM
    global EXPBW_SUM
    global CTRLBW_NUM
    global EXPBW_NUM
    global fileDir
    global MAPPABLE

    MAPPABLE = mappableFile
    dirname = ctrlbwFile[0] + "_" + expbwFile[0]
    fileDir = "/data/reddylab/YoungSook/kmer/kmer_script/modeling/modeling_onebp_DNA_RNA_ver4/" + dirname

    CTRLBW_NUM = len(ctrlbwFile)
    EXPBW_NUM = len(expbwFile)

    CTRLBW = [0] * CTRLBW_NUM
    os.chdir("/data/reddylab/YoungSook/kmer/kmer_input/etoh_deep/EtOH_rep")
    for i in range(CTRLBW_NUM):
        CTRLBW[i] = pyBigWig.open(ctrlbwFile[i])

    EXPBW = [0] * EXPBW_NUM
    os.chdir("/data/reddylab/YoungSook/kmer/kmer_input/etoh_deep/EtOH_rep/onebp_model/rna")
    for i in range(EXPBW_NUM):
        EXPBW[i] = pyBigWig.open(expbwFile[i])

    ## CALCULATE CTRLBW_SUM
    CTRLBW_SUM = float(CTRLBW[0].header().get('sumData'))
    #EXPBW_SUM = float(EXPBW.header().get('sumData'))

    os.chdir("/data/reddylab/YoungSook/ref_genome/hg38")
    TB = py2bit.open(tbFile)
def testOpenClose(self):
    tb = py2bit.open(self.fname, True)
    assert tb is not None
    tb.close()
def testSoftMaskedBlocks(self):
    tb = py2bit.open(self.fname, storeMasked=True)
    assert tb.softMaskedBlocks("chr1") == [(62, 70)]
    assert tb.softMaskedBlocks("chr1", 0, 50) == []
    tb.close()
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are: [1, 4, 10, 10, 16, 18]
    which correspond to a GC of [1, 1, 1, 1, 2, 1]

    The evaluated positions are [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
    the corresponding GC is [2, 1, 1, 2, 2, 1, 2, 3, 2, 1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}

    Test for the filter out option

    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are [ 0  2  8 10 12 14 16 18]

    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option

    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit = {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is [2, 1, 1, 0, 1, 2, 2, 1, 2, 3, 2, 1]

    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]
    """
    if start > end:
        raise NameError("start %d bigger than end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')
    subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int')

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    peak = 0
    startTime = time.time()

    if verbose:
        print("[{:.3f}] computing positions to "
              "sample".format(time.time() - startTime))

    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    read_counts = []
    # Optimize IO.
    # if the sample regions are far apart from each
    # other, it is faster to go to each location and fetch
    # the reads found there.
    # Otherwise, if the regions to sample are close to
    # each other, it is faster to load all the reads in
    # a large region into memory and consider only
    # those falling into the positions to sample.
    # The following code gets the reads
    # that are at sampling positions that lie close together
    if np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print("[{:.3f}] caching reads".format(time.time() - startTime))

        counts = np.bincount([r.pos - start_pos
                              for r in bam.fetch(chromNameBam, start_pos,
                                                 end_pos + 1)
                              if not r.is_reverse and r.pos >= start_pos],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - min(positions_to_sample)]
        if verbose:
            print("[{:.3f}] finish caching reads.".format(
                time.time() - startTime))

    countTime = time.time()

    c = 1
    for index in range(len(positions_to_sample)):
        i = positions_to_sample[index]
        # stop if the end of the chromosome is reached
        if i + fragmentLength['median'] > tbit.chroms(chromNameBit):
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i),
                               int(i + fragmentLength['median']),
                               fraction=False)
        except Exception as detail:
            if verbose:
                print(detail)
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = len([x.pos for x in bam.fetch(chromNameBam, i, i + 1)
                             if x.is_reverse is False and x.pos == i])
        else:
            num_reads = read_counts[index]

        if num_reads >= global_vars['max_reads']:
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose:
            if index % 50000 == 0:
                endTime = time.time()
                print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                      (multiprocessing.current_process().name,
                       index, index / (endTime - countTime),
                       chromNameBit, start, end, stepSize))
        c += 1

    if verbose:
        endTime = time.time()
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / (endTime - countTime),
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               (endTime - startTime), chromNameBit, start, end, stepSize))

    return subN_gc, subF_gc
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = {'median': args.fragmentLength}
    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)
        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    # use the Poisson distribution to identify peaks that should be discarded.
    # I multiply by 4, because the real distribution of reads
    # varies depending on the gc content
    # and the global number of reads per bp may be too low.
    # empirically, a value of at least 4 times as big as the
    # reads_per_bp was found.
    # Similarly for the min value, I divide by 4.
    global_vars['max_reads'] = \
        poisson(4 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of no use, unless the depth of sequencing is really high,
    # as this value is close to 0
    global_vars['min_reads'] = \
        poisson(0.25 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize,
                   image_format=args.plotFileFormat)
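# A minimal standalone sketch of the Poisson cutoffs computed above, with
# illustrative (assumed) values, to make the reasoning concrete: positions
# whose read pile-up exceeds max_reads are treated as peaks and skipped.
from scipy.stats import poisson

reads_per_bp = 0.3               # assumed coverage per base pair
fragment_len = 200               # assumed median fragment length
confidence_p_value = 1.0 / 5e4   # 1 / sampleSize for a 50,000-position sample

max_reads = poisson(4 * reads_per_bp * fragment_len).isf(confidence_p_value)
min_reads = poisson(0.25 * reads_per_bp * fragment_len).ppf(confidence_p_value)
print(max_reads, min_reads)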
# print headings
print("Chromosome", "Feature", "Start", "End", "Strand", "CpG_ratio", "GpC_ratio",
      sep='\t', file=fout1)
# "ApA_ratio","ApT_ratio","ApG_ratio","ApC_ratio","TpA_ratio","TpT_ratio","TpG_ratio","TpC_ratio","GpA_ratio","GpT_ratio","GpG_ratio","GpC_ratio","CpA_ratio","CpT_ratio","CpG_ratio","CpC_ratio"

bit_file = sys.argv[1]
gff_file = sys.argv[2]
bd = py2bit.open(bit_file, True)


def extract_exons():
    chr_name = ''
    with open(gff_file) as fh:
        for line in fh:
            if line.startswith('#'):
                continue
            content = line.strip().split('\t')
            if chr_name == '' or chr_name != content[0]:
                chr_name = content[0]
                chr_seq_length = bd.chroms(chr_name)  # chromosome length from the 2bit index
            chrom = content[0]
            feature = content[2]
            start = int(content[3])
def testSequence(self):
    tb = py2bit.open(self.fname, True)
    assert tb.sequence("chr1", 1, 3) == "NN"
    assert tb.sequence("chr1", 1, 2) == "N"
    tb.close()
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0
    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    r_index = -1
    for read in reads:
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del matePairs[read.qname]
        except KeyError:
            # this exception happens when a mate is
            # not present. This could
            # happen because of removal of the mate
            # by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1

        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.
        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                if type(readTag[0][2]) is int:
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                    replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength, decimals=2))
            readTag.append(('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies, 'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{}, processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                          i, i / (endTime - startTime),
                                          chrNameBit, start, end))
        i += 1

    outfile.close()
    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of a region from
    the genome

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are assigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]
    bam.close()

    r_index = -1
    for read in reads:
        if read.is_unmapped:
            continue
        r_index += 1
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
        except Exception as detail:
            print(detail)
            # this exception happens when the end of a chromosome is reached
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength,
                                                             extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    try:
        if debug:
            endTime = time.time()
            print("{}, processing {} ({:.1f} per sec) reads "
                  "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                      i, i / (endTime - startTime),
                                      chrNameBit, start, end))
    except NameError:
        pass

    if i == 0:
        return None

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    # save in bedgraph format
    for bin in range(0, len(cvg_corr), step):
        value = np.mean(cvg_corr[bin:min(bin + step, end)])
        if value > 0:
            writeStart = start + bin
            writeEnd = min(start + bin + step, end)
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                writeEnd, value))

    tempFileName = _file.name
    _file.close()
    return tempFileName
def main(args=None):
    """
    Run the main program.

    This function is executed when you type `kmer_counter` or `python -m kmer_counter`.

    Arguments:
        args: Arguments passed from the command line.

    Returns:
        An exit code.
    """
    parser = argparse.ArgumentParser(
        prog='kmer_counter',
        description='Count k-mers at SNVs, indel breakpoints or in genomic regions')
    subparsers = parser.add_subparsers(
        dest='command',
        help='command. Must choose what kind of file you want to count.')
    parser.add_argument('-V', '--version', action='store_true')

    snv_parser = subparsers.add_parser(
        'snv',
        description='Count k-mers at SNVs. The k-mer will be centered around the mutation '
                    'base and if the reference base is G or T then the reverse complement '
                    'of the reference k-mer will be counted instead.')
    snv_parser.add_argument('ref_genome', help='Reference genome in 2bit format', type=str)
    snv_parser.add_argument(
        'mutations', type=argparse.FileType('r'),
        help='A vcf-like file with SNVs. First four columns should be: Chrom, pos, ref, alt. '
             'Other columns are ignored. Non-SNV variants are ignored.')
    snv_parser.add_argument(
        '-r', '--radius', type=int, metavar='R', default=1,
        help='Count the k-mer from R bases before a position to R bases '
             'after a position. For each position in the inputfile.')

    indel_parser = subparsers.add_parser('indel', description='Count k-mers at indels.')
    indel_parser.add_argument('ref_genome', help='Reference genome in 2bit format', type=str)
    indel_parser.add_argument(
        'mutations', type=argparse.FileType('r'),
        help='A sorted vcf-like file with indels. First four columns should be: Chrom, pos, ref, alt. '
             'Other columns are ignored. Non-indel variants are ignored. Indels should be left-aligned.')
    indel_parser.add_argument(
        'type', choices=['ins', 'del_start', 'del_end', 'all', 'del'],
        help='What type of indel breakpoint do you want to count?')
    indel_parser.add_argument(
        '-r', '--radius', default=1, type=int,
        help='How many base pairs before indel_start_point or after indel_end_point should '
             'be included as context annotation.')
    indel_parser.add_argument(
        '--sample', action="store_true",
        help='Randomly choose one of the possible positions instead of counting the expected '
             '(non integer) count for each possible position of an ambiguously aligned indel.')
    #indel_parser.add_argument('-v', '--verbose', action='store_true')

    bg_parser = subparsers.add_parser(
        'background', description='Count kmers in (regions of) a genome')
    bg_parser.add_argument('ref_genome', type=str,
                           help='Reference genome in 2bit format')
    bg_parser.add_argument(
        '--bed', type=str,
        help='bed-file describing regions that should be counted. May be gzipped.')
    #bg_parser.add_argument('--wig', type=str,
    #                       help='wig-file describing regions that should be counted. May be gzipped. '
    #                            'The context at a position will be weighted by the value from the '
    #                            'wig-file at that position. The output counts will thus be floats '
    #                            'and not integers')
    bg_parser.add_argument('--all_autosomes', action="store_true",
                           help='All parts of the autosomes will be counted')
    bg_parser.add_argument(
        '-r', '--radius', type=int, metavar='R',
        help='Count the k-mer from R bases before a position to R bases '
             'after a position. For each position in the inputfile.')
    bg_parser.add_argument(
        '--before_after', type=int, nargs=2, metavar=('X', 'Y'),
        help='count the k-mer from X bases before a position to Y bases after a position. '
             'For each position in the inputfile.')
    bg_parser.add_argument(
        '--reverse_complement_method', type=str,
        choices=['none', 'middle', 'lexicographic', 'both'],
        help='"none" means that all k-mers are counted unchanged. "middle" means that the '
             'reverse complement of a k-mer is counted if the middle position is not a "A" '
             'or "C". "lexicographic" means that the reverse complement is counted if it has '
             'a smaller lexicographic order. Default is "middle" if --radius option is used '
             'and "lexicographic" if --before_after is used.')

    args = parser.parse_args(args)

    if args.version:
        from kmer_counter import __version__
        print("version:", __version__)
        print()
        return 0

    if args.command not in ['snv', 'indel', 'background']:
        print('Error: must specify command.')
        print()
        parser.print_help()
        return 0

    if 'ref_genome' not in args:
        print('Error: ref_genome (as 2bit file) must be specified')
        print()
        parser.print_help()
        return 0

    tb = py2bit.open(args.ref_genome)

    if args.command == 'indel':
        kmer_count = count_indels(args.mutations, tb, args.type, args.radius, args.sample)
    elif args.command == 'snv':
        dreader = PosReader(args.mutations, tb)
        kmer_count = count_non_indels(tb, dreader, args.radius, args.radius, 'middle')
    elif args.command == 'background':
        if (args.radius is None) == (args.before_after is None):
            raise Exception('Either the --radius or the --before_after option should be used (not both).')
        if args.radius is not None:
            assert args.radius > 0
            before = args.radius
            after = args.radius
            if args.reverse_complement_method is None:
                args.reverse_complement_method = "middle"
        else:
            before, after = args.before_after
            assert before >= 0
            assert after >= 0
            if args.reverse_complement_method is None:
                args.reverse_complement_method = "none"

        if args.bed is not None:
            if args.all_autosomes:
                raise Exception('Either --bed or --all_autosomes option should be used. Not both.')
            if args.bed.endswith('.bed.gz'):
                dreader = BedReader(gzip.open(args.bed, 'rt'))
            elif args.bed.endswith('.bed'):
                dreader = BedReader(open(args.bed))
            elif args.bed == '-':
                dreader = BedReader(sys.stdin)
            else:
                raise Exception('bed file should end with ".bed" or ".bed.gz"')
        elif args.all_autosomes:
            dreader = AllAutoReader(tb)
        else:
            raise Exception('Either --bed or --all_autosomes option should be used')

        kmer_count = count_non_indels(tb, dreader, before, after,
                                      args.reverse_complement_method)

    for x in kmer_count:
        print(x, kmer_count[x])
    return 0
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length, bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors, len(mp_args))))
            res = pool.map_async(writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        _temp_bg_file_name = utilities.getTempFileName(suffix='_all.bg')
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        # concatenate all intermediate bedgraph tempfiles into one file
        _temp_bg_file = open(_temp_bg_file_name, 'wb')
        for tempFileName in res:
            if tempFileName:
                shutil.copyfileobj(open(tempFileName, 'rb'), _temp_bg_file)
                os.remove(tempFileName)
        _temp_bg_file.close()

        args.correctedFile.close()
        if args.correctedFile.name.endswith('bg'):
            shutil.move(_temp_bg_file_name, args.correctedFile.name)
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, _temp_bg_file_name,
                                           args.correctedFile.name)
            os.remove(_temp_bg_file_name)
def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
                             step=None,
                             tag_but_not_change_number=False,
                             verbose=True):
    r"""
    Writes a BAM file, deleting and adding some reads in order to compensate
    for the GC bias. **This is a stochastic method.**
    >>> np.random.seed(1)
    >>> test = Tester()
    >>> args = test.testWriteCorrectedSam()
    >>> tempFile = writeCorrectedSam_worker(*args, \
    ... tag_but_not_change_number=True, verbose=False)
    >>> try:
    ...     import StringIO
    ... except ImportError:
    ...     from io import StringIO
    >>> ostdout = sys.stdout
    >>> import tempfile
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch(args[0], 200, 250)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    >>> tempFile = \
    ... writeCorrectedSam_worker(*test.testWriteCorrectedSam_paired(),\
    ... tag_but_not_change_number=True, verbose=False)
    >>> sys.stdout = tempfile.TemporaryFile()
    >>> idx = pysam.index(tempFile)
    >>> sys.stdout = ostdout
    >>> bam = pysam.Samfile(tempFile)
    >>> [dict(r.tags)['YN'] for r in bam.fetch('chr2L', 0, 50)]
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    >>> res = os.remove(tempFile)
    >>> res = os.remove(tempFile+".bai")
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    if verbose:
        print("Sam for %s %s %s " % (chrNameBit, start, end))
    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = openBam(global_vars['bam'])
    tempFileName = utilities.getTempFileName(suffix='.bam')

    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
    startTime = time.time()
    matePairs = {}
    read_repetitions = 0
    removed_duplicated_reads = 0
    # cache data
    # r.flag & 4 == 0 is to filter unmapped reads that
    # have a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.pos > start and r.flag & 4 == 0]

    r_index = -1
    for read in reads:
        if read.pos <= start or read.is_unmapped:
            continue
        r_index += 1
        copies = None
        gc = None

        # check if a mate has already been processed
        # to apply the same correction
        try:
            copies = matePairs[read.qname]['copies']
            gc = matePairs[read.qname]['gc']
            del matePairs[read.qname]
        except KeyError:
            # this exception happens when a mate is
            # not present. This could
            # happen because of removal of the mate
            # by some filtering
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
            if gc:
                copies = numCopiesOfRead(float(1) / R_gc[gc])
            else:
                copies = 1

        # is this read in the same orientation and position as the previous?
        if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \
                and read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                copies = 0  # in other words do not take into account this read
                removed_duplicated_reads += 1
        else:
            read_repetitions = 0

        readName = read.qname
        # Each tag is a tuple of (tag name, value, type)
        # Note that get_tags() returns ord(type) rather than type and this must
        # be fixed!
        # It turns out that the "with_value_type" option only started working in
        # pysam-0.8.4, so we can't reliably add tags on earlier versions without
        # potentially creating BAM files that break HTSJDK/IGV/etc.
        readTag = read.get_tags(with_value_type=True)
        replace_tags = False
        if len(readTag) > 0:
            if len(readTag[0]) == 3:
                if type(readTag[0][2]) is int:
                    readTag = [(x[0], x[1], chr(x[2])) for x in readTag]
                    replace_tags = True
        else:
            replace_tags = True

        if gc:
            GC = int(100 * np.round(float(gc) / fragmentLength, decimals=2))
            readTag.append(('YC', float(round(float(1) / R_gc[gc], 2)), "f"))
            readTag.append(('YN', copies, "i"))
        else:
            GC = -1

        readTag.append(('YG', GC, "i"))
        if replace_tags:
            read.set_tags(readTag)

        if read.is_paired and read.is_proper_pair \
                and not read.mate_is_unmapped \
                and not read.is_reverse:
            matePairs[readName] = {'copies': copies, 'gc': gc}

        if tag_but_not_change_number:
            outfile.write(read)
            continue

        for numCop in range(1, copies + 1):
            # the read has to be renamed such that newly
            # formed pairs will match
            if numCop > 1:
                read.qname = readName + "_%d" % (numCop)
            outfile.write(read)

        if verbose:
            if i % 500000 == 0 and i > 0:
                endTime = time.time()
                print("{}, processing {} ({:.1f} per sec) reads "
                      "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                          i, i / (endTime - startTime),
                                          chrNameBit, start, end))
        i += 1

    outfile.close()
    if verbose:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))
        percentage = float(removed_duplicated_reads) * 100 / len(reads) \
            if len(reads) > 0 else 0
        print("duplicated reads removed %d of %d (%.2f) " %
              (removed_duplicated_reads, len(reads), percentage))

    return tempFileName
# 1. takes bed file and 2bit file as input
# 2. based on bed file locations, extract the underlying sequence and write it in fasta format
# 3. fasta output with name from bed file (column 4); if not provided, use location as name
import subprocess
import py2bit  # pip install git+https://github.com/dpryan79/py2bit
import os

tb_input = "/data/repository/organisms/dm6_ensembl/genome_fasta/genome.2bit"
tb = py2bit.open(tb_input)
print("2bit file: %s" % tb_input)

print("name of bed file:")
bed_input = input("> ")
bed_input = os.path.abspath(bed_input)
bed = open(bed_input, 'r')

print("name of the output file:")
output_name = input("> ")
output_name = output_name + ".fa"

# check whether there is a column 4 in the bed file for names
line = next(bed).strip()
colnum = len(line.split("\t"))
if colnum >= 4:
    namecol = True
else:
    namecol = False
bed.close()
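# The snippet above ends before the extraction step described in its header
# comments. A minimal sketch of that step, assuming standard BED columns and
# py2bit's sequence() API; 'namecol' and 'output_name' come from the code above.
with open(bed_input) as bed, open(output_name, 'w') as fa:
    for line in bed:
        cols = line.strip().split("\t")
        chrom, start, end = cols[0], int(cols[1]), int(cols[2])
        # use column 4 as the record name if present, else the location
        name = cols[3] if namecol else "{}:{}-{}".format(chrom, start, end)
        fa.write(">{}\n{}\n".format(name, tb.sequence(chrom, start, end)))
tb.close()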
def main(args=None):
    args = process_args(args)
    global F_gc, N_gc, R_gc

    data = np.loadtxt(args.GCbiasFrequenciesFile.name)

    F_gc = data[:, 0]
    N_gc = data[:, 1]
    R_gc = data[:, 2]

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile

    # compute the probability to find more than one read (a redundant read)
    # at a certain position based on the gc of the read fragment
    # the binomial function is used for that
    max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x])
                  if F_gc[x] > 0 and N_gc[x] > 0 else 1
                  for x in range(len(F_gc))]

    global_vars['max_dup_gc'] = max_dup_gc

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True,
                                           nThreads=args.numberOfProcessors)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    # apply correction
    print("applying correction")
    # divide the genome in fragments containing about 4e5 reads.
    # This amount of reads takes about 20 seconds
    # to process per core (48 cores, 256 Gb memory)
    chunkSize = int(4e5 / global_vars['reads_per_bp'])

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    regionStart = 0
    if args.region:
        chromSizes, regionStart, regionEnd, chunkSize = \
            mapReduce.getUserRegion(chromSizes, args.region,
                                    max_chunk_size=chunkSize)

    print("genome partition size for multiprocessing: {}".format(chunkSize))
    print("using region {}".format(args.region))
    mp_args = []
    bedGraphStep = args.binSize
    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)
    chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()])
    print(chrNameBitToBam, chrNameBamToBit)
    c = 1
    for chrom, size in chromSizes:
        start = 0 if regionStart == 0 else regionStart
        for i in range(start, size, chunkSize):
            try:
                chrNameBamToBit[chrom]
            except KeyError:
                print("no sequence information for "
                      "chromosome {} in 2bit file".format(chrom))
                print("Reads in this chromosome will be skipped")
                continue
            length = min(size, i + chunkSize)
            mp_args.append((chrom, chrNameBamToBit[chrom], i, length, bedGraphStep))
            c += 1

    pool = multiprocessing.Pool(args.numberOfProcessors)

    if args.correctedFile.name.endswith('bam'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            print(("using {} processors for {} "
                   "number of tasks".format(args.numberOfProcessors, len(mp_args))))
            res = pool.map_async(writeCorrectedSam_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrectedSam_wrapper, mp_args))

        if len(res) == 1:
            command = "cp {} {}".format(res[0], args.correctedFile.name)
            run_shell_command(command)
        else:
            print("concatenating (sorted) intermediate BAMs")
            header = pysam.Samfile(res[0])
            of = pysam.Samfile(args.correctedFile.name, "wb", template=header)
            header.close()
            for f in res:
                f = pysam.Samfile(f)
                for e in f.fetch(until_eof=True):
                    of.write(e)
                f.close()
            of.close()

        print("indexing BAM")
        pysam.index(args.correctedFile.name)

        for tempFileName in res:
            os.remove(tempFileName)

    if args.correctedFile.name.endswith('bg') or \
            args.correctedFile.name.endswith('bw'):
        if len(mp_args) > 1 and args.numberOfProcessors > 1:
            res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999)
        else:
            res = list(map(writeCorrected_wrapper, mp_args))

        oname = args.correctedFile.name
        args.correctedFile.close()
        if oname.endswith('bg'):
            # concatenate the intermediate bedgraph files into the output
            f = open(oname, 'wb')
            for tempFileName in res:
                if tempFileName:
                    shutil.copyfileobj(open(tempFileName, 'rb'), f)
                    os.remove(tempFileName)
            f.close()
        else:
            chromSizes = [(k, v) for k, v in tbit.chroms().items()]
            writeBedGraph.bedGraphToBigWig(chromSizes, res, oname)
import random
import py2bit
import sys
import os

print('Chrom', 'Feature', 'start', 'end', 'CpG_ratio', 'GpC_ratio', sep='\t')

bit_file = sys.argv[1]
tb = py2bit.open(bit_file, True)
tb_dict = tb.chroms()
chrom = list(tb_dict.keys())        # chrom names
len_chrom = list(tb_dict.values())  # chrom lengths

count_entry = 0
while count_entry < 10000:
    j = random.choice(chrom)
    random_start = random.randrange(1, tb_dict[j])
    random_end = random_start + 1000
    if random_end > int(tb_dict[j]):
        continue
    else:
        perseq = tb.sequence(j, random_start, random_end)
        C = int(perseq.upper().count('C'))
        G = int(perseq.upper().count('G'))
        CG = int(perseq.upper().count('CG'))
        GC = int(perseq.upper().count('GC'))
        if GC < 1 or CG < 1 or C < 1 or G < 1:
            continue
        CpG_ratio = round((CG / 1000) / ((C / 1000) * (G / 1000)), 4)
        GpC_ratio = round((GC / 1000) / ((C / 1000) * (G / 1000)), 4)
        print(j, 'random seq', random_start, random_end, CpG_ratio, GpC_ratio,
              sep='\t')
        count_entry += 1
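# The ratios above are the standard observed/expected dinucleotide measure:
# obs/exp CpG = (CG count / L) / ((C/L) * (G/L)) = CG * L / (C * G).
# A small helper generalizing the hard-coded 1 kb window (hypothetical name,
# not part of the script above):
def obs_exp_ratio(seq, dinuc):
    """Observed/expected ratio of a dinucleotide in seq, or None if undefined."""
    L = len(seq)
    s = seq.upper()
    c1, c2 = s.count(dinuc[0]), s.count(dinuc[1])
    if c1 == 0 or c2 == 0:
        return None
    return round((s.count(dinuc) * L) / (c1 * c2), 4)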
def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
    r"""writes a bedgraph file containing the GC correction of a region from
    the genome

    >>> test = Tester()
    >>> tempFile = writeCorrected_worker(*test.testWriteCorrectedChunk())
    >>> open(tempFile, 'r').readlines()
    ['chr2L\t200\t225\t31.6\n', 'chr2L\t225\t250\t33.8\n', 'chr2L\t250\t275\t37.9\n', 'chr2L\t275\t300\t40.9\n']
    >>> os.remove(tempFile)
    """
    global R_gc
    fragmentLength = len(R_gc) - 1

    cvg_corr = np.zeros(end - start)

    i = 0

    tbit = py2bit.open(global_vars['2bit'])
    bam = pysam.Samfile(global_vars['bam'])
    read_repetitions = 0
    removed_duplicated_reads = 0
    startTime = time.time()

    # caching seems to be faster
    # r.flag & 4 == 0 is to skip unmapped
    # reads that nevertheless are assigned
    # to a genomic position
    reads = [r for r in bam.fetch(chrNameBam, start, end)
             if r.flag & 4 == 0]
    bam.close()

    r_index = -1
    for read in reads:
        r_index += 1
        try:
            # calculate GC content of read fragment
            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
        except Exception as detail:
            print(detail)
            # this exception happens when the end of a chromosome is reached
            continue
        if not gc:
            continue

        # is this read in the same orientation and position as the previous?
        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
                read.is_reverse == reads[r_index - 1].is_reverse \
                and read.pnext == reads[r_index - 1].pnext:
            read_repetitions += 1
            if read_repetitions >= global_vars['max_dup_gc'][gc]:
                removed_duplicated_reads += 1
                continue
        else:
            read_repetitions = 0

        try:
            fragmentStart, fragmentEnd = getFragmentFromRead(
                read, fragmentLength, extendPairedEnds=True)
            vectorStart = max(fragmentStart - start, 0)
            vectorEnd = min(fragmentEnd - start, end - start)
        except TypeError:
            # the get_fragment_from_read functions returns None in some cases.
            # Those cases are to be skipped, hence the continue line.
            continue

        cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
        i += 1

    if debug:
        endTime = time.time()
        print("{}, processing {} ({:.1f} per sec) reads "
              "@ {}:{}-{}".format(multiprocessing.current_process().name,
                                  i, i / (endTime - startTime),
                                  chrNameBit, start, end))

    if i == 0:
        return None

    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
    # save in bedgraph format
    for bin in range(0, len(cvg_corr), step):
        value = np.mean(cvg_corr[bin:min(bin + step, end)])
        if value > 0:
            writeStart = start + bin
            writeEnd = min(start + bin + step, end)
            _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
                                                writeEnd, value))

    tempFileName = _file.name
    _file.close()
    return tempFileName
def propagateVariantsAndGetScores(snp_info_file, overlap_info_file, model_path,
                                  is_classifier, batch_size, genome_path,
                                  left, right):
    genome_object = py2bit.open(genome_path)
    model = load_model(model_path, compile=False)

    curr_batch_ref_sequences = []
    curr_batch_alt_sequences = []
    curr_batch_is_overlapping = []

    variants_processed_overall = 0
    variants_propagated_overall = 0
    batches_propagated_overall = 0

    with open(snp_info_file, 'r') as sf, open(overlap_info_file, 'r') as of:
        sf_header = sf.readline()
        for sf_line in sf:
            of_line = of.readline()
            of_data = of_line.strip().split("\t")
            sf_data = sf_line.strip().split("\t")

            chrom = of_data[0]
            snp_ref_start = int(of_data[1])
            snp_ref_end = int(of_data[2])
            ref = sf_data[4]
            alt = sf_data[5]
            is_overlapping = int(of_data[4])

            if is_overlapping:
                ref_sequence = getSequence(ref, chrom, snp_ref_start, left, right, genome_object)
                alt_sequence = getSequence(alt, chrom, snp_ref_start, left, right, genome_object)
                ref_sequence_encoded = oneHotEncodeSequence(ref_sequence)
                alt_sequence_encoded = oneHotEncodeSequence(alt_sequence)
                curr_batch_ref_sequences.append(ref_sequence_encoded)
                curr_batch_alt_sequences.append(alt_sequence_encoded)
                curr_batch_is_overlapping.append(True)
            else:
                curr_batch_is_overlapping.append(False)

            if len(curr_batch_ref_sequences) == batch_size:
                onBatchEnd(model,
                           curr_batch_ref_sequences,
                           curr_batch_alt_sequences,
                           is_classifier,
                           curr_batch_is_overlapping)
                variants_propagated_overall += batch_size
                batches_propagated_overall += 1
                curr_batch_ref_sequences = []
                curr_batch_alt_sequences = []
                curr_batch_is_overlapping = []
                logging.info("{0} Total variants propagated {1}".format(
                    time.asctime(), variants_propagated_overall))
                logging.info("{0} Total batches propagated {1}".format(
                    time.asctime(), batches_propagated_overall))
                logging.info("{0} Total variants processed {1}".format(
                    time.asctime(), variants_processed_overall))

            variants_processed_overall += 1

    # processing for incomplete batches at the end
    if curr_batch_is_overlapping:
        if curr_batch_ref_sequences:
            assert len(curr_batch_ref_sequences) < batch_size
            onBatchEnd(model, curr_batch_ref_sequences, curr_batch_alt_sequences,
                       is_classifier, curr_batch_is_overlapping)
            variants_propagated_overall += len(curr_batch_ref_sequences)
            batches_propagated_overall += 1
        # remaining NA values need to be printed if the last batch is
        # full of non-overlapping regions
        else:
            for val in curr_batch_is_overlapping:
                assert not val
                print("\t".join(["nan", "nan", "nan"]))

    logging.info("{0} Total variants propagated {1}".format(
        time.asctime(), variants_propagated_overall))
    logging.info("{0} Total batches propagated {1}".format(
        time.asctime(), batches_propagated_overall))
    logging.info("{0} Total variants processed {1}".format(
        time.asctime(), variants_processed_overall))

    genome_object.close()
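# oneHotEncodeSequence() is used above but not defined in this snippet.
# A minimal sketch of one plausible implementation (hypothetical helper,
# assuming an A/C/G/T alphabet with all-zero rows for other characters):
import numpy as np

def oneHotEncodeSequence(sequence):
    """Return a (len(sequence), 4) float32 array, one row per base."""
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    encoded = np.zeros((len(sequence), 4), dtype=np.float32)
    for idx, base in enumerate(sequence.upper()):
        if base in mapping:
            encoded[idx, mapping[base]] = 1.0
    return encoded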