Example #1
import numpy as np
import pandas as pd
import pysam
from plastid import BAMGenomeArray, FivePrimeMapFactory, SizeFilterFactory, SegmentChain


def _get_tid_info(tup):
    """For each transcript on the given chromosome/strand, identify every sub-sequence of the
    appropriate length (fpsize), convert it to an integer, count the reads mapping to each
    position, and write all of that information to a pandas HDF store.

    Relies on module-level globals (opts, psite, fpsize, bedlinedict, genome, str_dict,
    seq_info_hdf, log_lock, logprint) set up by the enclosing script."""
    (chrom, strand) = tup
    inbams = [pysam.AlignmentFile(infile, 'rb') for infile in opts.bamfiles]
    gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
    # map to roughly the center of each read so that identical sequences that cross different splice sites
    # (on different transcripts) still end up mapping to the same place
    gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

    tid_seq_info = []
    tid_summary = pd.DataFrame(
        {'chrom': chrom, 'strand': strand, 'n_psite': -1, 'n_reads': -1, 'peak_reads': -1, 'dropped': ''},
        index=pd.Index(bedlinedict[(chrom, strand)].keys(), name='tid'))
    for (tid, line) in bedlinedict[(chrom, strand)].items():
        currtrans = SegmentChain.from_bed(line)
        curr_pos_list = currtrans.get_position_list()  # not in stranded order!
        if strand == '-':
            curr_pos_list = curr_pos_list[::-1]
        n_psite = len(curr_pos_list) + 1 - fpsize
        tid_summary.at[tid, 'n_psite'] = n_psite
        if n_psite > 0:
            curr_counts = np.array(currtrans.get_counts(gnd))[psite:n_psite + psite]
            sumcounts = curr_counts.sum()
            maxcounts = curr_counts.max()
            tid_summary.at[tid, 'n_reads'] = sumcounts
            tid_summary.at[tid, 'peak_reads'] = maxcounts
            if sumcounts >= opts.minreads:
                if maxcounts < sumcounts * opts.peakfrac:
                    curr_seq = currtrans.get_sequence(genome).upper().translate(str_dict)
                    tid_seq_info.append(pd.DataFrame({'tid': tid,
                                                      'genpos': curr_pos_list[psite:n_psite + psite],
                                                      'seq': np.array([(int(curr_seq[i:i + fpsize], 4) if 'N' not in curr_seq[i:i + fpsize] else -1)
                                                                       for i in range(n_psite)], dtype=np.int64),
                                                      'reads': curr_counts}))
                else:
                    tid_summary.at[tid, 'dropped'] = 'peakfrac'
            else:
                tid_summary.at[tid, 'dropped'] = 'lowreads'
    if tid_seq_info:  # don't bother saving anything if there's nothing to save
        pd.concat(tid_seq_info, ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand), 'tid_seq_info', format='t',
                                                          data_columns=True, complevel=1, complib='blosc')
    #    sp.call(['ptrepack', orig_store_name, seq_info_hdf%(chrom,strand)])  # repack for efficiency
    #    os.remove(orig_store_name)
    if opts.verbose > 1:
        with log_lock:
            logprint('%s (%s strand) complete' % (chrom, strand))

    for inbam in inbams:
        inbam.close()

    return tid_summary
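
Because _get_tid_info takes a single (chrom, strand) tuple and pulls everything else from
module-level globals, it slots naturally into a multiprocessing pool. A minimal driver sketch,
assuming the globals above are populated; the opts.numproc worker count is a hypothetical option:

import multiprocessing as mp

if __name__ == '__main__':
    # one task per (chrom, strand) pair found in the BED dictionary
    keys = list(bedlinedict.keys())
    with mp.Pool(processes=opts.numproc) as pool:  # opts.numproc is assumed
        summaries = pool.map(_get_tid_info, keys)
    # stitch the per-chromosome/strand summaries into one table, indexed by tid
    full_summary = pd.concat(summaries)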
Example #2
import numpy as np
from plastid import BED_Reader, BAMGenomeArray


def getCountVectorData(bedfile, bamList, bamIDs):

    # read BED file into a list of SegmentChain objects
    bed_segmentChains = list(BED_Reader(open(bedfile)))

    # open each BAM once, up front, so files are not re-opened for every ROI
    BAM_al = [BAMGenomeArray(bamfile) for bamfile in bamList]

    # list of tuples (SegmentChain, record array of count vectors per BAM)
    countVectorData = []

    # iterate through ROIs
    for index, segment in enumerate(bed_segmentChains):

        # report progress every 100 ROIs processed
        if index % 100 == 0:
            print("Processed %s regions of interest" % index)

        # one count vector per BAM for this segment
        count_vec = [segment.get_counts(al) for al in BAM_al]

        # build a record array whose columns are the count vectors;
        # columns MUST be accessed by BAM name, NOT by index
        count_array = np.core.records.fromarrays(count_vec, names=bamIDs)

        # add tuple to countVectorData list
        countVectorData.append((segment, count_array))

    print("Completed processing data.")
    return countVectorData
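
A short usage sketch for getCountVectorData; the file names and sample IDs are placeholders:

bams = ["sample1.bam", "sample2.bam"]  # hypothetical indexed BAM files
ids = ["sample1", "sample2"]
data = getCountVectorData("regions.bed", bams, ids)

for segment, counts in data:
    # record-array columns are addressed by BAM name, never by index
    print(segment.get_name(), counts["sample1"].sum(), counts["sample2"].sum())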
Example #3
    def __init__(self,bamfiles,mapping):
        """Create HashedReadBAMGenomeArray

        Parameters
        ----------
        bamfiles : list
            A list of open :py:class:`pysam.AlignmentFile` s. Note: the
            corresponding `BAM`_ files must be sorted and indexed by `samtools`_.

        mapping : func
            Mapping function that determines how each read alignment is mapped
            to a count at a genomic position. It must return a list of
            :py:class:`pysam.AlignedSegment` s and a dict of count vectors
            giving the number of mapped read counts at each position, keyed
            according to a supplied function, and must store its list of valid
            keys as `mapping.read_keys`. Typically generated using
            ReadKeyMapFactory().
        """
        BAMGenomeArray.__init__(self,bamfiles,mapping)
Example #4
    def __init__(self, bamfiles, mapping):
        """Create HashedReadBAMGenomeArray

        Parameters
        ----------
        bamfiles : list
            A list of open :py:class:`pysam.AlignmentFile` s. Note: the
            corresponding `BAM`_ files must be sorted and indexed by `samtools`_.

        mapping : func
            Mapping function that determines how each read alignment is mapped
            to a count at a genomic position. It must return a list of
            :py:class:`pysam.AlignedSegment` s and a dict of count vectors
            giving the number of mapped read counts at each position, keyed
            according to a supplied function, and must store its list of valid
            keys as `mapping.read_keys`. Typically generated using
            ReadKeyMapFactory().
        """
        BAMGenomeArray.__init__(self, bamfiles, mapping=mapping)
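
Neither example shows what a conforming mapping looks like. Below is a toy sketch that satisfies
the documented contract, counting each read's 5' end keyed by read length; the length-based keying
and the read_keys values are assumptions for illustration, not ReadKeyMapFactory's confirmed
behavior:

import numpy as np

def length_keyed_map(alignments, segment):
    # one count vector per valid key, each spanning the query segment
    counts = {k: np.zeros(len(segment)) for k in length_keyed_map.read_keys}
    kept = []
    for read in alignments:
        key = read.query_length  # key reads by their length
        if key in counts:
            # 5' end is the last reference position for reverse-strand reads
            pos = read.reference_end - 1 if read.is_reverse else read.reference_start
            if segment.start <= pos < segment.end:
                counts[key][pos - segment.start] += 1
                kept.append(read)
    return kept, counts

length_keyed_map.read_keys = list(range(26, 32))  # assumed valid read lengths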
Example #5
    def test_variable_stratified_mapping_plus(self):
        offsets = {
            26 : 6,
            27 : 22,
            28 : 13,
            29 : 4,
            30 : 5
        }

        chains = {
            "fw" : SegmentChain(GenomicSegment('chrII',392959,393180,'+'),
                              GenomicSegment('chrII',393510,394742,'+'),
                              GenomicSegment('chrII',394860,394901,'+'),
                              ID='YBR078W_mRNA'),
            "rc" : SegmentChain(GenomicSegment('chrVIII',189061,189749,'-'),
                              GenomicSegment('chrVIII',189850,190017,'-'),
                              ID='YHR041C_mRNA')
        }
        expected = {
            "fw" : numpy.loadtxt(resource_filename("plastid","test/data/stratmap/strat_fw_vec.txt"),delimiter="\t"),
            "rc" : numpy.loadtxt(resource_filename("plastid","test/data/stratmap/strat_rc_vec.txt"),delimiter="\t"),
        }
        ga = BAMGenomeArray([resource_filename("plastid","test/data/stratmap/strat.bam")])
        ga.set_mapping(StratifiedVariableFivePrimeMapFactory(offsets,26,30))
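
The extract ends before any assertions. A plausible completion, assuming the expected vectors
loaded above correspond to each chain's counts under the stratified mapping:

        # assumed completion: counts under the stratified mapping should
        # reproduce the precomputed vectors for each chain
        for name, chain in chains.items():
            counts = numpy.array(chain.get_counts(ga))
            numpy.testing.assert_array_equal(counts, expected[name])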