def _get_tid_info(tup):
    """For each transcript on this chromosome/strand, identifies every sub-sequence of the appropriate length
    (fpsize), converts it to an integer, identifies the number of reads mapping to that position, and outputs
    all of that information to a pandas HDF store.

    Parameters
    ----------
    tup : tuple
        ``(chrom, strand)`` pair; used as the key into the module-level ``bedlinedict``.

    Returns
    -------
    pandas.DataFrame
        One row per transcript ID (index named ``'tid'``) with columns ``chrom``, ``strand``, ``n_psite``,
        ``n_reads``, ``peak_reads`` and ``dropped``. ``n_reads`` and ``peak_reads`` stay at -1 for transcripts
        with ``n_psite <= 0``. ``dropped`` is ``'lowreads'`` when the read total is below ``opts.minreads``,
        ``'peakfrac'`` when the largest single-position count is at least ``opts.peakfrac`` of the total, and
        ``''`` otherwise.

    Notes
    -----
    Transcripts that pass both filters are written to ``seq_info_hdf % (chrom, strand)`` under the key
    ``'tid_seq_info'``; nothing is written when no transcript passes. The BAM handles opened here are closed
    even if processing raises.
    """
    (chrom, strand) = tup
    inbams = []
    try:
        # Open inside the try block so that a failure part-way through still closes the handles already opened.
        for infile in opts.bamfiles:
            inbams.append(pysam.Samfile(infile, 'rb'))
        gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
        # map to roughly the center of each read so that identical sequences that cross different splice sites
        # (on different transcripts) still end up mapping to the same place
        gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

        transcripts = bedlinedict[(chrom, strand)]
        tid_seq_info = []
        tid_summary = pd.DataFrame(
            {'chrom': chrom, 'strand': strand, 'n_psite': -1, 'n_reads': -1, 'peak_reads': -1, 'dropped': ''},
            index=pd.Index(transcripts.keys(), name='tid'))

        for (tid, line) in transcripts.items():
            currtrans = SegmentChain.from_bed(line)
            curr_pos_list = currtrans.get_position_list()  # not in stranded order!
            if strand == '-':
                curr_pos_list = curr_pos_list[::-1]
            n_psite = len(curr_pos_list) + 1 - fpsize
            tid_summary.at[tid, 'n_psite'] = n_psite
            if n_psite <= 0:
                # transcript is shorter than one window; leave the -1 placeholders in place
                continue

            curr_counts = np.array(currtrans.get_counts(gnd))[psite:n_psite + psite]
            sumcounts = curr_counts.sum()
            maxcounts = curr_counts.max()
            tid_summary.at[tid, 'n_reads'] = sumcounts
            tid_summary.at[tid, 'peak_reads'] = maxcounts

            if sumcounts < opts.minreads:
                tid_summary.at[tid, 'dropped'] = 'lowreads'
            elif maxcounts >= sumcounts * opts.peakfrac:
                tid_summary.at[tid, 'dropped'] = 'peakfrac'
            else:
                # Each window is parsed as a base-4 number, so str_dict (defined elsewhere) presumably maps
                # nucleotides to the digits 0-3 -- TODO confirm. Windows still containing 'N' are encoded as -1.
                curr_seq = currtrans.get_sequence(genome).upper().translate(str_dict)
                windows = (curr_seq[i:i + fpsize] for i in range(n_psite))
                seq_codes = np.array([int(window, 4) if 'N' not in window else -1 for window in windows],
                                     dtype=np.int64)
                tid_seq_info.append(pd.DataFrame({'tid': tid,
                                                  'genpos': curr_pos_list[psite:n_psite + psite],
                                                  'seq': seq_codes,
                                                  'reads': curr_counts}))

        if tid_seq_info:  # don't bother saving anything if there's nothing to save
            pd.concat(tid_seq_info, ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand), 'tid_seq_info',
                                                              format='t', data_columns=True,
                                                              complevel=1, complib='blosc')
        if opts.verbose > 1:
            with log_lock:
                logprint('%s (%s strand) complete' % (chrom, strand))
    finally:
        for inbam in inbams:
            inbam.close()

    return tid_summary
def getCountVectorData(bedfile, bamList, bamIDs):
    """Collect per-BAM count vectors for every region of interest in a BED file.

    Parameters
    ----------
    bedfile : str
        Path of the BED file; its records are read through ``BED_Reader``.
    bamList : list
        BAM inputs; each entry is passed on its own to ``BAMGenomeArray``.
    bamIDs : list of str
        Field names of the resulting record arrays, one per entry of `bamList` and in the same order.

    Returns
    -------
    list of tuple
        One ``(segment, count_array)`` pair per BED record, where ``count_array`` is a NumPy record array
        whose fields (named by `bamIDs`) hold that region's count vector for each BAM.
    """
    # read BED file into a list of SegmentChain objects; the context manager releases the file handle as
    # soon as parsing is finished
    with open(bedfile) as bed_handle:
        bed_segmentChains = list(BED_Reader(bed_handle))

    # one BAMGenomeArray per BAM, built once and reused for every region instead of being re-created
    # inside the loop
    bam_arrays = [BAMGenomeArray(bamfile) for bamfile in bamList]

    # list of tuples (SegmentChain, record array of count vectors)
    countVectorData = []

    # iterate through ROIs
    for index, segment in enumerate(bed_segmentChains):
        # print progress in the terminal every 100 ROIs processed
        if index % 100 == 0:
            print("Processed %s regions of interest" % index)

        # NOTE(review): looks like leftover debug output; kept so that stdout is unchanged
        print(bamList)

        # list of count vectors per BAM
        count_vec = [segment.get_counts(bam_array) for bam_array in bam_arrays]

        # record array whose columns are the count vectors
        # MUST call by BAM name, NOT ind
        # (np.rec is the public home of fromarrays; np.core is a private namespace)
        count_array = np.rec.fromarrays(count_vec, names=bamIDs)

        countVectorData.append((segment, count_array))

    print("Completed processing data.")
    return countVectorData
def __init__(self, bamfiles, mapping):
    """Create HashedReadBAMGenomeArray

    Initialization is delegated entirely to :py:meth:`BAMGenomeArray.__init__`.

    Parameters
    ----------
    bamfiles : list
        A list of open :py:class:`pysam.AlignmentFile` s. Note: the corresponding `BAM`_ files
        must be sorted and indexed by `samtools`_.

    mapping : func
        Mapping function that determines how each read alignment is mapped to a count at a
        genomic position. Returns a list of pysam.AlignedReads and a dict of count vectors
        corresponding to the number of mapped read counts at each position, keyed according
        to a supplied function. Must have a list of valid keys stored as mapping.read_keys.
        Typically generated using ReadKeyMapFactory().
    """
    BAMGenomeArray.__init__(self, bamfiles, mapping=mapping)
def __init__(self, bamfiles, mapping):
    """Create HashedReadBAMGenomeArray

    All setup is handed to the parent :py:class:`BAMGenomeArray` initializer.

    Parameters
    ----------
    bamfiles : list
        A list of open :py:class:`pysam.AlignmentFile` s. Note: the corresponding `BAM`_ files
        must be sorted and indexed by `samtools`_.

    mapping : func
        Mapping function that determines how each read alignment is mapped to a count at a
        genomic position. Returns a list of pysam.AlignedReads and a dict of count vectors
        corresponding to the number of mapped read counts at each position, keyed according
        to a supplied function. Must have a list of valid keys stored as mapping.read_keys.
        Typically generated using ReadKeyMapFactory().
    """
    BAMGenomeArray.__init__(self, bamfiles, mapping=mapping)
def test_variable_stratified_mapping_plus(self):
    """Build the fixtures for the stratified variable five-prime mapping test.

    Creates one forward-strand and one reverse-strand transcript, loads the reference
    vectors stored for each, and configures a BAMGenomeArray over the stratmap test BAM
    with a StratifiedVariableFivePrimeMapFactory covering read lengths 26 through 30.
    """
    # NOTE(review): the visible body ends after set_mapping(); `chains` and `expected` are
    # built but not compared against anything here -- confirm whether assertions are missing.
    offsets = {26: 6, 27: 22, 28: 13, 29: 4, 30: 5}

    fw_chain = SegmentChain(GenomicSegment('chrII', 392959, 393180, '+'),
                            GenomicSegment('chrII', 393510, 394742, '+'),
                            GenomicSegment('chrII', 394860, 394901, '+'),
                            ID='YBR078W_mRNA')
    rc_chain = SegmentChain(GenomicSegment('chrVIII', 189061, 189749, '-'),
                            GenomicSegment('chrVIII', 189850, 190017, '-'),
                            ID='YHR041C_mRNA')
    chains = {"fw": fw_chain, "rc": rc_chain}

    fw_vec_path = resource_filename("plastid", "test/data/stratmap/strat_fw_vec.txt")
    fw_vec = numpy.loadtxt(fw_vec_path, delimiter="\t")
    rc_vec_path = resource_filename("plastid", "test/data/stratmap/strat_rc_vec.txt")
    rc_vec = numpy.loadtxt(rc_vec_path, delimiter="\t")
    expected = {"fw": fw_vec, "rc": rc_vec}

    bam_path = resource_filename("plastid", "test/data/stratmap/strat.bam")
    ga = BAMGenomeArray([bam_path])
    stratified_map = StratifiedVariableFivePrimeMapFactory(offsets, 26, 30)
    ga.set_mapping(stratified_map)
def _get_tid_info(tup):
    """For each transcript on this chromosome/strand, identifies every sub-sequence of the appropriate length
    (fpsize), converts it to an integer, identifies the number of reads mapping to that position, and outputs
    all of that information to a pandas HDF store.

    Parameters
    ----------
    tup : tuple
        ``(chrom, strand)`` pair; used as the key into the module-level ``bedlinedict``.

    Returns
    -------
    pandas.DataFrame
        One row per transcript ID (index named ``'tid'``) with columns ``chrom``, ``strand``, ``n_psite``,
        ``n_reads``, ``peak_reads`` and ``dropped``. ``n_reads`` and ``peak_reads`` stay at -1 for transcripts
        with ``n_psite <= 0``. ``dropped`` is ``'lowreads'`` when the read total is below ``opts.minreads``,
        ``'peakfrac'`` when the largest single-position count is at least ``opts.peakfrac`` of the total, and
        ``''`` otherwise.

    Notes
    -----
    Transcripts that pass both filters are written to ``seq_info_hdf % (chrom, strand)`` under the key
    ``'tid_seq_info'``; nothing is written when no transcript passes. The BAM handles opened here are closed
    even if processing raises.
    """
    (chrom, strand) = tup
    inbams = []
    try:
        # Open inside the try block so that a failure part-way through still closes the handles already opened.
        for infile in opts.bamfiles:
            inbams.append(pysam.Samfile(infile, 'rb'))
        gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
        # map to roughly the center of each read so that identical sequences that cross different splice sites
        # (on different transcripts) still end up mapping to the same place
        gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

        transcripts = bedlinedict[(chrom, strand)]
        tid_seq_info = []
        tid_summary = pd.DataFrame(
            {
                'chrom': chrom,
                'strand': strand,
                'n_psite': -1,
                'n_reads': -1,
                'peak_reads': -1,
                'dropped': ''
            },
            index=pd.Index(transcripts.keys(), name='tid'))

        for (tid, line) in transcripts.items():
            currtrans = SegmentChain.from_bed(line)
            curr_pos_list = currtrans.get_position_list()  # not in stranded order!
            if strand == '-':
                curr_pos_list = curr_pos_list[::-1]
            n_psite = len(curr_pos_list) + 1 - fpsize
            tid_summary.at[tid, 'n_psite'] = n_psite
            if n_psite <= 0:
                # transcript is shorter than one window; leave the -1 placeholders in place
                continue

            curr_counts = np.array(currtrans.get_counts(gnd))[psite:n_psite + psite]
            sumcounts = curr_counts.sum()
            maxcounts = curr_counts.max()
            tid_summary.at[tid, 'n_reads'] = sumcounts
            tid_summary.at[tid, 'peak_reads'] = maxcounts

            if sumcounts < opts.minreads:
                tid_summary.at[tid, 'dropped'] = 'lowreads'
            elif maxcounts >= sumcounts * opts.peakfrac:
                tid_summary.at[tid, 'dropped'] = 'peakfrac'
            else:
                # Each window is parsed as a base-4 number, so str_dict (defined elsewhere) presumably maps
                # nucleotides to the digits 0-3 -- TODO confirm. Windows still containing 'N' are encoded as -1.
                curr_seq = currtrans.get_sequence(genome).upper().translate(str_dict)
                windows = (curr_seq[i:i + fpsize] for i in range(n_psite))
                seq_codes = np.array([int(window, 4) if 'N' not in window else -1 for window in windows],
                                     dtype=np.int64)
                tid_seq_info.append(
                    pd.DataFrame({
                        'tid': tid,
                        'genpos': curr_pos_list[psite:n_psite + psite],
                        'seq': seq_codes,
                        'reads': curr_counts
                    }))

        if tid_seq_info:  # don't bother saving anything if there's nothing to save
            pd.concat(tid_seq_info, ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand),
                                                              'tid_seq_info',
                                                              format='t',
                                                              data_columns=True,
                                                              complevel=1,
                                                              complib='blosc')
        if opts.verbose > 1:
            with log_lock:
                logprint('%s (%s strand) complete' % (chrom, strand))
    finally:
        for inbam in inbams:
            inbam.close()

    return tid_summary