def _get_annotated_counts_by_chrom(chrom_to_do):
    """Accumulate counts from annotated CDSs into a metagene profile.
    Only the longest CDS in each transcript family will be included, and only if it meets the
    minimum number-of-reads requirement. Reads are normalized by gene, so every gene included
    contributes equally to the final metagene."""
    found_cds = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r',
                            where="chrom == '%s' and orftype == 'annotated' and tstop > 0 and tcoord > %d and AAlen > %d"
                                  % (chrom_to_do, -startnt[0], min_AAlen),
                            columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen']) \
        .sort_values('AAlen', ascending=False) \
        .drop_duplicates('tfam')  # use the longest annotated CDS in each transcript family
    num_cds_incl = 0  # number of CDSs included from this chromosome
    startprof = np.zeros((len(rdlens), startlen))
    cdsprof = np.zeros((len(rdlens), 3))
    stopprof = np.zeros((len(rdlens), stoplen))
    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))

    for (tid, tcoord, tstop) in found_cds[['tid', 'tcoord', 'tstop']].itertuples(False):
        curr_trans = SegmentChain.from_bed(bedlinedict[tid])
        tlen = curr_trans.get_length()
        if tlen >= tstop + stopnt[1]:  # need to guarantee that the 3' UTR is sufficiently long
            curr_hashed_counts = get_hashed_counts(curr_trans, gnd)
            cdslen = tstop + stopnt[1] - tcoord - startnt[0]  # cds length, plus the extra bases...
            curr_counts = np.zeros((len(rdlens), cdslen))
            for (i, rdlen) in enumerate(rdlens):
                for nmis in range(opts.max5mis + 1):
                    curr_counts[i, :] += curr_hashed_counts[(rdlen, nmis)][tcoord + startnt[0]:tstop + stopnt[1]]
            # curr_counts is limited to the CDS plus any extra requested nucleotides on either side
            if curr_counts.sum() >= opts.mincdsreads:
                curr_counts /= curr_counts.mean()  # normalize by mean of counts across all readlengths and positions within the CDS
                startprof += curr_counts[:, :startlen]
                cdsprof += curr_counts[:, startlen:cdslen - stoplen].reshape((len(rdlens), -1, 3)).mean(1)
                stopprof += curr_counts[:, cdslen - stoplen:cdslen]
                num_cds_incl += 1

    for inbam in inbams:
        inbam.close()

    return startprof, cdsprof, stopprof, num_cds_incl
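# Illustrative sketch, not part of the original pipeline: because
# _get_annotated_counts_by_chrom() returns partial sums for a single chromosome,
# the full metagene can be assembled by mapping it over the chromosome list and
# adding the pieces. The helper below is hypothetical; `chroms` (an iterable of
# chromosome names present in the ORF store) and `numproc` (a worker count) are
# assumed inputs, not names defined in this module.
def _example_aggregate_metagene(chroms, numproc=1):
    """Usage sketch: combine per-chromosome metagene pieces (hypothetical helper)."""
    import multiprocessing as mp

    workers = mp.Pool(numproc)
    results = workers.map(_get_annotated_counts_by_chrom, chroms)
    workers.close()
    workers.join()

    # Each result tuple is (startprof, cdsprof, stopprof, num_cds_incl); the arrays
    # are aligned by read length and position, so elementwise sums are valid.
    startprof = sum(res[0] for res in results)
    cdsprof = sum(res[1] for res in results)
    stopprof = sum(res[2] for res in results)
    num_cds_incl = sum(res[3] for res in results)

    # Every included CDS was normalized to mean 1 before being accumulated, so
    # dividing the summed profiles by num_cds_incl would give an average profile
    # in which each gene counts equally.
    return startprof, cdsprof, stopprof, num_cds_incl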
def _regress_tfam(orf_set, gnd):
    """Perform non-negative least squares regression on all of the ORFs in a transcript family,
    using profiles constructed via _orf_profile().
    Also calculate Wald statistics for each ORF and start codon, and for each stop codon if
    opts.startonly is False."""
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for (i, tid) in enumerate(tids):
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}
    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)  # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1 + opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFs for which there is at least some minimum number of reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt * np.arange(len(rdlens)), 3)  # offsets for each cond, expecting three positions to check for each
        orf_set = orf_set[[(counts[(start_idxes.repeat(len(rdlens)) + offsetmat)].sum() >= opts.startcount)
                           for start_idxes in [tid_indices[tid][tcoord - 1:tcoord + 2]
                                               for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)
    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']  # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord'] + 3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)
    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']  # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)

    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord + startnt[0] < 0:
                startadj = -startnt[0] - tcoord  # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop + stopnt[1] > tlen:
                stopadj = tstop + stopnt[1] - tlen  # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord + startnt[0] + startadj:tstop + stopnt[1] - stopadj]
            orf_profs.append(_orf_profile(tstop - tcoord)[:, startadj:tstop - tcoord + stopnt[1] - startnt[0] - stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop - 6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        indices.append(np.concatenate([nnt * i + curr_indices for i in xrange(len(rdlens))]))
        # need to tile the indices for each read length
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0] + [len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt * len(rdlens), len(orf_strength_df)))
    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]  # don't bother fitting ORFs with zero reads throughout their entire length
    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    orf_strength_df = orf_strength_df[usable_orfs]
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    covmat = resid * resid * np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray()) / (nnt * len(rdlens) - len(orf_strength_df))
    # homoscedastic version (assume equal variance at all positions)

    # resids = counts - orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength'] * orf_strength_df['orf_strength'] / np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])
    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        if not include_starts.any():
            return failure_return  # no need to keep going if there weren't any useful starts
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        if not elongating_orfs.any():
            return failure_return
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]
    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    start_strength_df['W_start'] = pd.Series({gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                              .dot(orf_strs_starts[rownums])
                                              for (gcoord, rownums) in gcoord_grps.indices.iteritems()})

    if not opts.startonly:  # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series({gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                                .dot(orf_strs_stops[rownums])
                                                for (gstop, rownums) in gstop_grps.indices.iteritems()})
        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
        #                                                  .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})
        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df
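# Worked example, not part of the original module: the core statistics in
# _regress_tfam() are (1) a non-negative least squares fit of ORF profiles to the
# observed counts, (2) a homoscedastic covariance estimate
# resid**2 * inv(P'P) / (n - k), and (3) Wald statistics W = b**2 / Var(b), with
# grouped starts/stops tested via b' * inv(cov_sub) * b over the relevant rows.
# The self-contained toy below reproduces that arithmetic on a small dense
# problem; every name in it is local to the example.
def _example_nnls_wald():
    """Minimal sketch of the NNLS fit and Wald statistics used above."""
    import numpy as np
    from scipy.optimize import nnls

    rng = np.random.RandomState(0)
    (n, k) = (60, 3)  # n profile positions, k candidate ORFs
    P = rng.rand(n, k)  # stand-in for the (dense) ORF profile matrix
    b_true = np.array([2.0, 0.0, 1.5])  # the middle ORF contributes nothing
    y = P.dot(b_true) + 0.1 * rng.randn(n)  # noisy stand-in for footprint counts

    (b_hat, resid) = nnls(P, y)  # resid is the Euclidean norm of the residual vector
    covmat = resid * resid * np.linalg.inv(P.T.dot(P)) / (n - k)  # homoscedastic, as in covmat above
    W_orf = b_hat * b_hat / np.diag(covmat)  # per-ORF Wald statistic, as in W_orf above

    # Grouped Wald statistic over a subset of ORFs sharing a start (cf. W_start/W_stop):
    rows = np.array([0, 2])
    W_group = b_hat[rows].dot(np.linalg.inv(covmat[np.ix_(rows, rows)])).dot(b_hat[rows])
    return b_hat, W_orf, W_group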