def _quantify_tfam(orf_set, gnds):
    """Performs non-negative least squares regression to quantify all of the ORFs in a transcript family, using a simplified
    profile consisting of the same three numbers tiled across each ORF. All readlengths are treated identically. Regions
    around start and stop codons are masked in accordance with startmask and stopmask"""
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for (i, tid) in enumerate(tids):
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}
    orf_matrix = np.zeros((nnt, len(orf_set)))
    ignore_coords = []
    for (orf_num, (tid, tcoord, tstop, AAlen)) in enumerate(orf_set[['tid', 'tcoord', 'tstop', 'AAlen']].itertuples(False)):
        orf_matrix[tid_indices[tid][tcoord:tstop], orf_num] = np.tile(cdsprof, AAlen + 1)
        ignore_coords.append(tid_indices[tid][max(tcoord+startmask[0], 0):tcoord+startmask[1]])
        ignore_coords.append(tid_indices[tid][max(tstop+stopmask[0], 0):tstop+stopmask[1]])
    ignore_coords = np.unique(np.concatenate(ignore_coords))
    orf_matrix[ignore_coords, :] = 0  # mask out all positions within the mask region around starts and stops
    # require at least one valid position, and if >1 ORFs are identical, only include one of them
    valid_orfs = np.array([(orf_matrix[:, i] > 0).any() and (orf_matrix.T[i, :] != orf_matrix.T[:i, :]).any(1).all()
                           for i in xrange(len(orf_set))])
    orf_matrix[:, ~valid_orfs] = 0  # completely ignore these ORFs
    valid_nts = (orf_matrix > 0).any(1)  # only bother checking nucleotides where there is a valid ORF
    orf_res = orf_set.copy()
    if valid_nts.any():
        orf_matrix = orf_matrix[valid_nts, :]
        valid_nt_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos[valid_nts])))
        orf_res['nts_quantified'] = (orf_matrix > 0).sum(0)  # the number of nucleotides included in the quantification
        for colname, gnd in zip(colnames, gnds):
            # gnd is a HashedReadBAMGenomeArray, but it still works with get_counts(), which will collapse all read lengths to a single array
            orf_res[colname] = nnls(orf_matrix, valid_nt_segs.get_counts(gnd))[0]
        return orf_res
    else:
        orf_res['nts_quantified'] = 0
        for colname in colnames:
            orf_res[colname] = 0.
        return orf_res
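
# --- Illustrative sketch (not part of the pipeline) ---
# A minimal, self-contained example of the quantification idea used by _quantify_tfam above:
# each ORF contributes one column of a design matrix in which a fixed three-number codon
# profile is tiled along the ORF, masked positions are zeroed out, and per-ORF abundances
# are recovered with non-negative least squares. The helper name (_toy_quantify), the toy
# profile, ORF coordinates, and mask below are assumptions for illustration only; the real
# function uses the module-level cdsprof, startmask, and stopmask.
import numpy as np
from scipy.optimize import nnls


def _toy_quantify(nnt, orfs, codon_profile, counts, mask=()):
    """orfs: list of (start, stop) transcript intervals, with stop-start divisible by 3."""
    design = np.zeros((nnt, len(orfs)))
    for j, (start, stop) in enumerate(orfs):
        ncodons = (stop - start) // 3
        design[start:stop, j] = np.tile(codon_profile, ncodons)  # same three numbers per codon
    design[list(mask), :] = 0  # ignore positions near starts/stops
    strengths, _ = nnls(design, counts)  # non-negative least squares fit
    return strengths

# Example call (toy numbers): two overlapping ORFs on a 30-nt transcript family,
# masking the first six positions around the upstream start codon:
# strengths = _toy_quantify(30, [(0, 30), (9, 30)], [0.6, 0.3, 0.1], counts, mask=range(0, 6))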
    # gene_name_lookup = pd.read_csv(opts.genenames,sep='\t',header=None,names=['tid','tfam']).set_index('tid')['tfam'].to_dict()
else:
    gene_name_lookup = {}

new_tfams = {}
multi_names = defaultdict(lambda: int(1))
for tfam_val in tfams.itervalues():
    geneset = {gene_name_lookup[tid] for tid in tfam_val[0] if tid in gene_name_lookup}
    if not geneset:
        geneset = set(tfam_val[0])  # if no gene names available, just use the tids themselves
    genename = _choose_name(geneset)
    if genename in new_tfams:
        multi_names[genename] += 1
        genename = '%s_%d' % (genename, multi_names[genename])
    new_tfams[genename] = tfam_val
for (genename, num_appearances) in multi_names.iteritems():
    sys.stderr.write('WARNING: Gene name %s appears %d independent times\n' % (genename, num_appearances))

if opts.verbose:
    logprint('Saving results')

with open(outbedname, 'w') as outbed:
    with open(outtxtname, 'w') as outtxt:
        for tfam, (tids, (chrom, strand), genpos) in new_tfams.iteritems():
            outbed.write(SegmentChain(*positionlist_to_segments(chrom, strand, list(genpos)), ID=tfam).as_bed())
            for tid in tids:
                outtxt.write('%s\t%s\n' % (tid, tfam))

if opts.verbose:
    logprint('Tasks complete')
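
# --- Illustrative sketch (not part of the pipeline) ---
# The collision handling above keeps the first transcript family under the bare gene name
# and renames later families name_2, name_3, and so on. A minimal standalone version of
# that behavior, with a hypothetical helper name and hypothetical gene names:
from collections import defaultdict


def _disambiguate(names):
    seen = {}
    multi = defaultdict(lambda: 1)
    out = []
    for name in names:
        if name in seen:
            multi[name] += 1                      # second occurrence becomes name_2, etc.
            name = '%s_%d' % (name, multi[name])
        seen[name] = True
        out.append(name)
    return out

# _disambiguate(['ATF4', 'GAPDH', 'ATF4', 'ATF4']) -> ['ATF4', 'GAPDH', 'ATF4_2', 'ATF4_3']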
def _regress_tfam(orf_set, gnd):
    """Performs non-negative least squares regression on all of the ORFs in a transcript family, using profiles constructed via _orf_profile()
    Also calculates Wald statistics for each ORF and start codon, and for each stop codon if opts.startonly is False"""
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()
    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for (i, tid) in enumerate(tids):
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}
    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)  # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()

    if opts.startcount:
        # Only include ORFs for which there is at least some minimum reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)  # offsets for each cond, expecting three positions to check for each
        # try:
        orf_set = orf_set[[(counts[(start_idxes.repeat(len(rdlens))+offsetmat)].sum() >= opts.startcount)
                           for start_idxes in [tid_indices[tid][tcoord-1:tcoord+2]
                                               for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)

    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']  # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)

    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']  # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)

    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord  # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen  # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        indices.append(np.concatenate([nnt*i+curr_indices for i in xrange(len(rdlens))]))  # need to tile the indices for each read length
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')

    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))
    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]  # don't bother fitting ORFs with zero reads throughout their entire length
    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    orf_strength_df = orf_strength_df[usable_orfs]
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    # homoscedastic version (assume equal variance at all positions)
    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))

    # resids = counts-orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)

    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])
    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        if not include_starts.any():
            return failure_return  # no need to keep going if there weren't any useful starts
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        if not elongating_orfs.any():
            return failure_return
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]
    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    start_strength_df['W_start'] = pd.Series({gcoord: orf_strs_starts[rownums]
                                              .dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                              .dot(orf_strs_starts[rownums])
                                              for (gcoord, rownums) in gcoord_grps.indices.iteritems()})

    if not opts.startonly:  # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series({gstop: orf_strs_stops[rownums]
                                                .dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                                .dot(orf_strs_stops[rownums])
                                                for (gstop, rownums) in gstop_grps.indices.iteritems()})

        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop: orf_strs_stops[rownums]
        #                                                  .dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
        #                                                  .dot(orf_strs_stops[rownums])
        #                                                  for (gstop, rownums) in gstop_grps.indices.iteritems()})
        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df
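
# --- Illustrative sketch (not part of the pipeline) ---
# The Wald statistics in _regress_tfam follow the usual least-squares recipe: with the
# homoscedastic covariance estimate Sigma = sigma^2 * inv(X'X), where sigma^2 is the
# squared residual norm divided by (n - p), each ORF gets W = beta^2 / Var(beta), and a
# group of ORFs sharing a start (or stop) codon gets W = beta_g' * inv(Sigma_g) * beta_g.
# Everything below (design matrix, true strengths, noise level) is toy data for illustration.
import numpy as np
from scipy.optimize import nnls

rng = np.random.RandomState(0)
X = np.abs(rng.normal(size=(60, 3)))            # toy design matrix: three ORF profiles
beta_true = np.array([5.0, 0.0, 2.0])
y = X.dot(beta_true) + rng.normal(scale=0.5, size=60)

beta, resid = nnls(X, y)                        # resid is the residual 2-norm
sigma2 = resid * resid / (X.shape[0] - X.shape[1])
covmat = sigma2 * np.linalg.inv(X.T.dot(X))     # homoscedastic covariance estimate

W_each = beta * beta / np.diag(covmat)          # per-ORF Wald statistics
group = [0, 2]                                  # e.g. two ORFs sharing a start codon
W_group = beta[group].dot(np.linalg.inv(covmat[np.ix_(group, group)])).dot(beta[group])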