Exemplo n.º 1
0
def getCountVectorData(bedfile, bamList, bamIDs):
    """Collect count vectors for every region of interest (ROI) in a BED file.

    Parameters
    ----------
    bedfile
        Path of the BED file; it is opened here and parsed with ``BED_Reader``.
    bamList
        Iterable of BAM inputs. Each entry is wrapped in its own
        ``BAMGenomeArray``.
    bamIDs
        Field names for the record array, one per entry of ``bamList`` and in
        the same order. Columns must be addressed by these names, not by index.

    Returns
    -------
    list
        One ``(segment, count_array)`` tuple per ROI, in BED order, where
        ``count_array`` is a NumPy record array whose fields are the count
        vectors returned by ``segment.get_counts`` for each BAM.
    """
    # read the BED file into a list of SegmentChain objects; the context
    # manager guarantees the file handle is released once parsing is done
    with open(bedfile) as bed_handle:
        bed_segmentChains = list(BED_Reader(bed_handle))

    # The genome arrays do not depend on the ROI, so build them once instead
    # of re-creating (and re-opening) every BAM for each region.
    bam_arrays = [BAMGenomeArray(bamfile) for bamfile in bamList]

    # list of tuples (SegmentChain, record array of count vectors)
    countVectorData = []

    # iterate through ROIs
    for index, segment in enumerate(bed_segmentChains):

        # report progress in the terminal every 100 ROIs processed
        if index % 100 == 0:
            print("Processed %s regions of interest" % index)

        # one count vector per BAM, in bamList order
        count_vec = [segment.get_counts(bam_array) for bam_array in bam_arrays]

        # record array whose columns are the count vectors;
        # MUST be accessed by BAM name, NOT by index.
        # np.rec is the public home of fromarrays (np.core is private/deprecated).
        count_array = np.rec.fromarrays(count_vec, names=bamIDs)

        countVectorData.append((segment, count_array))

    print("Completed processing data.")
    return countVectorData
Exemplo n.º 2
0
    def test_variable_stratified_mapping_plus(self):
        """Prepare fixtures for the stratified variable 5' mapping test.

        Builds one multi-segment chain on the '+' strand and one on the '-'
        strand, loads the tab-delimited reference vectors shipped with the
        package, and configures a ``BAMGenomeArray`` over ``strat.bam`` with a
        ``StratifiedVariableFivePrimeMapFactory``.

        NOTE(review): the body visible here only performs setup; ``chains`` and
        ``expected`` are built but not compared against anything.
        """
        # offsets handed to the mapping factory
        # (keys are presumably read lengths -- TODO confirm)
        offset_by_key = {26: 6, 27: 22, 28: 13, 29: 4, 30: 5}

        forward_chain = SegmentChain(
            GenomicSegment('chrII', 392959, 393180, '+'),
            GenomicSegment('chrII', 393510, 394742, '+'),
            GenomicSegment('chrII', 394860, 394901, '+'),
            ID='YBR078W_mRNA'
        )
        reverse_chain = SegmentChain(
            GenomicSegment('chrVIII', 189061, 189749, '-'),
            GenomicSegment('chrVIII', 189850, 190017, '-'),
            ID='YHR041C_mRNA'
        )
        chains = {"fw": forward_chain, "rc": reverse_chain}

        # reference vectors, loaded in the same order as the chains above
        fw_path = resource_filename("plastid", "test/data/stratmap/strat_fw_vec.txt")
        fw_expected = numpy.loadtxt(fw_path, delimiter="\t")
        rc_path = resource_filename("plastid", "test/data/stratmap/strat_rc_vec.txt")
        rc_expected = numpy.loadtxt(rc_path, delimiter="\t")
        expected = {"fw": fw_expected, "rc": rc_expected}

        bam_path = resource_filename("plastid", "test/data/stratmap/strat.bam")
        ga = BAMGenomeArray([bam_path])
        # 26 and 30 are presumably the min/max read lengths covered by the
        # offsets table -- verify against the factory's signature
        mapping = StratifiedVariableFivePrimeMapFactory(offset_by_key, 26, 30)
        ga.set_mapping(mapping)
Exemplo n.º 3
0
def _get_tid_info(tup):
    """Summarize read coverage for every transcript on one chromosome/strand.

    For each transcript, every sub-sequence of length ``fpsize`` is converted
    to a base-4 integer (-1 if the window contains an 'N') and paired with its
    genomic position and the number of reads mapping there. Rows for
    transcripts that pass the read filters are concatenated and written to the
    HDF store ``seq_info_hdf % (chrom, strand)`` under the key
    ``'tid_seq_info'``; nothing is written if no transcript passes.

    Relies on module-level names: ``opts``, ``psite``, ``fpsize``,
    ``bedlinedict``, ``genome``, ``str_dict``, ``seq_info_hdf``, ``log_lock``
    and ``logprint``.

    Args:
        tup: ``(chrom, strand)`` pair; used as the key into ``bedlinedict``.

    Returns:
        pandas DataFrame indexed by transcript ID (index name ``'tid'``) with
        columns ``chrom``, ``strand``, ``n_psite``, ``n_reads``,
        ``peak_reads`` and ``dropped``. ``n_reads``/``peak_reads`` stay -1 for
        transcripts too short to contain a window; ``dropped`` is
        ``'lowreads'`` or ``'peakfrac'`` for filtered transcripts and ``''``
        otherwise.
    """
    (chrom, strand) = tup
    bedlines = bedlinedict[(chrom, strand)]

    inbams = []
    try:
        # Open inside the try block so that handles already opened are still
        # closed if a later file fails to open or processing raises.
        for infile in opts.bamfiles:
            inbams.append(pysam.Samfile(infile, 'rb'))

        # map to roughly the center of each read so that identical sequences
        # that cross different splice sites (on different transcripts) still
        # end up mapping to the same place
        gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
        gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

        tid_seq_info = []
        tid_summary = pd.DataFrame(
            {
                'chrom': chrom,
                'strand': strand,
                'n_psite': -1,
                'n_reads': -1,
                'peak_reads': -1,
                'dropped': ''
            },
            index=pd.Index(bedlines.keys(), name='tid'))

        for (tid, line) in bedlines.items():
            currtrans = SegmentChain.from_bed(line)
            curr_pos_list = currtrans.get_position_list()  # not in stranded order!
            if strand == '-':
                curr_pos_list = curr_pos_list[::-1]

            # number of fpsize-long windows that fit in this transcript
            n_psite = len(curr_pos_list) + 1 - fpsize
            tid_summary.at[tid, 'n_psite'] = n_psite
            if n_psite <= 0:
                continue

            curr_counts = np.array(
                currtrans.get_counts(gnd))[psite:n_psite + psite]
            sumcounts = curr_counts.sum()
            maxcounts = curr_counts.max()
            tid_summary.at[tid, 'n_reads'] = sumcounts
            tid_summary.at[tid, 'peak_reads'] = maxcounts

            # The filters are written as negations of the accept conditions so
            # that any non-comparable value (e.g. NaN) is still rejected.
            if not (sumcounts >= opts.minreads):
                tid_summary.at[tid, 'dropped'] = 'lowreads'
                continue
            if not (maxcounts < sumcounts * opts.peakfrac):
                tid_summary.at[tid, 'dropped'] = 'peakfrac'
                continue

            # translated transcript sequence; each window is parsed as base 4
            curr_seq = currtrans.get_sequence(genome).upper().translate(
                str_dict)
            windows = [curr_seq[i:i + fpsize] for i in range(n_psite)]
            seq_codes = np.array(
                [int(window, 4) if 'N' not in window else -1
                 for window in windows],
                dtype=np.int64)
            tid_seq_info.append(
                pd.DataFrame({
                    'tid': tid,
                    'genpos': curr_pos_list[psite:n_psite + psite],
                    'seq': seq_codes,
                    'reads': curr_counts
                }))

        if tid_seq_info:  # don't bother saving anything if there's nothing to save
            pd.concat(tid_seq_info,
                      ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand),
                                                'tid_seq_info',
                                                format='t',
                                                data_columns=True,
                                                complevel=1,
                                                complib='blosc')

        if opts.verbose > 1:
            with log_lock:
                logprint('%s (%s strand) complete' % (chrom, strand))
    finally:
        for inbam in inbams:
            inbam.close()

    return tid_summary