def _get_tid_info(tup):
    """For each transcript on this chromosome/strand, identifies every sub-sequence of the appropriate length (fpsize),
    converts it to an integer, identifies the number of reads mapping to that position, and outputs all of that
    information to a pandas HDF store.

    tup is a (chrom, strand) pair, which is also the key used to look up the transcripts in bedlinedict.

    Returns a DataFrame indexed by 'tid' with columns 'chrom', 'strand', 'n_psite', 'n_reads', 'peak_reads', and
    'dropped'. 'n_reads' and 'peak_reads' remain -1 for transcripts shorter than fpsize; 'dropped' is 'lowreads' or
    'peakfrac' for transcripts that were counted but not saved, and '' otherwise.

    The BAM files are closed before returning, even if processing raises."""
    (chrom, strand) = tup
    bedlines = bedlinedict[(chrom, strand)]
    tid_seq_info = []
    tid_summary = pd.DataFrame(
        {'chrom': chrom, 'strand': strand, 'n_psite': -1, 'n_reads': -1, 'peak_reads': -1, 'dropped': ''},
        index=pd.Index(bedlines.keys(), name='tid'))

    inbams = []
    try:
        # open inside the try block so that already-opened files are closed if a later one fails to open
        for infile in opts.bamfiles:
            inbams.append(pysam.Samfile(infile, 'rb'))
        gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
        # map to roughly the center of each read so that identical sequences that cross different splice sites
        # (on different transcripts) still end up mapping to the same place
        gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

        for (tid, line) in bedlines.iteritems():
            currtrans = SegmentChain.from_bed(line)
            curr_pos_list = currtrans.get_position_list()  # not in stranded order!
            if strand == '-':
                curr_pos_list = curr_pos_list[::-1]
            n_psite = len(curr_pos_list) + 1 - fpsize
            tid_summary.at[tid, 'n_psite'] = n_psite
            if n_psite <= 0:
                continue  # transcript is shorter than fpsize: nothing to count

            curr_counts = np.array(currtrans.get_counts(gnd))[psite:n_psite + psite]
            sumcounts = curr_counts.sum()
            maxcounts = curr_counts.max()
            tid_summary.at[tid, 'n_reads'] = sumcounts
            tid_summary.at[tid, 'peak_reads'] = maxcounts
            if sumcounts >= opts.minreads:
                if maxcounts < sumcounts * opts.peakfrac:
                    # translate() already yields the sequence as a string of digits (str_dict is defined elsewhere)
                    curr_seq = currtrans.get_sequence(genome).upper().translate(str_dict)
                    kmers = (curr_seq[i:i + fpsize] for i in xrange(n_psite))
                    tid_seq_info.append(pd.DataFrame({
                        'tid': tid,
                        'genpos': curr_pos_list[psite:n_psite + psite],
                        # each k-mer is read as a base-4 integer; k-mers containing 'N' are flagged with -1
                        'seq': np.array([(int(kmer, 4) if 'N' not in kmer else -1) for kmer in kmers],
                                        dtype=np.int64),
                        'reads': curr_counts}))
                else:
                    tid_summary.at[tid, 'dropped'] = 'peakfrac'
            else:
                tid_summary.at[tid, 'dropped'] = 'lowreads'

        if tid_seq_info:  # don't bother saving anything if there's nothing to save
            pd.concat(tid_seq_info, ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand), 'tid_seq_info',
                                                              format='t', data_columns=True,
                                                              complevel=1, complib='blosc')

        if opts.verbose > 1:
            with log_lock:
                logprint('%s (%s strand) complete' % (chrom, strand))
    finally:
        for inbam in inbams:
            inbam.close()

    return tid_summary
def _quantify_tfam(orf_set, gnds):
    """Performs non-negative least squares regression to quantify all of the ORFs in a transcript family, using a
    simplified profile consisting of the same three numbers tiled across each ORF. All readlengths are treated
    identically. Regions around start and stop codons are masked in accordance with startmask and stopmask

    orf_set is a DataFrame of ORFs; the columns 'strand', 'chrom', 'tid', 'tcoord', 'tstop', and 'AAlen' are read,
    and strand and chrom are taken from the first row.
    gnds is a sequence of genome arrays, paired element-by-element with the module-level colnames.

    Returns a copy of orf_set with an added 'nts_quantified' column (number of nucleotides used to quantify each ORF)
    and one column per entry in colnames holding the NNLS coefficients (all zero if no nucleotide is usable)."""
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    # collect the genomic positions covered by each transcript, and their union across the family
    all_tfam_genpos = set()
    tid_genpos = {}
    for tid in tids:
        curr_pos_set = SegmentChain.from_bed(bedlinedict[tid]).get_position_set()
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]  # stranded order
    nnt = len(all_tfam_genpos)

    # for each transcript, the row indices (into all_tfam_genpos) of the positions it covers
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.iteritems()}

    orf_matrix = np.zeros((nnt, len(orf_set)))
    ignore_coords = []
    for (orf_num, (tid, tcoord, tstop, AAlen)) in enumerate(orf_set[['tid', 'tcoord', 'tstop', 'AAlen']].itertuples(False)):
        curr_indices = tid_indices[tid]
        orf_matrix[curr_indices[tcoord:tstop], orf_num] = np.tile(cdsprof, AAlen + 1)
        ignore_coords.append(curr_indices[max(tcoord+startmask[0], 0):tcoord+startmask[1]])
        ignore_coords.append(curr_indices[max(tstop+stopmask[0], 0):tstop+stopmask[1]])
    ignore_coords = np.unique(np.concatenate(ignore_coords))
    orf_matrix[ignore_coords, :] = 0  # mask out all positions within the mask region around starts and stops

    # require at least one valid position, and if >1 ORFs are identical, only include one of them
    valid_orfs = np.array([(orf_matrix[:, i] > 0).any() and (orf_matrix.T[i, :] != orf_matrix.T[:i, :]).any(1).all()
                           for i in xrange(len(orf_set))])
    orf_matrix[:, ~valid_orfs] = 0  # completely ignore these positions
    valid_nts = (orf_matrix > 0).any(1)  # only bother checking nucleotides where there is a valid ORF

    orf_res = orf_set.copy()
    if valid_nts.any():
        orf_matrix = orf_matrix[valid_nts, :]
        valid_nt_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos[valid_nts])))
        orf_res['nts_quantified'] = (orf_matrix > 0).sum(0)  # the number of nucleotides included in the quantification
        for colname, gnd in zip(colnames, gnds):
            # gnd is a HashedReadBAMGenomeArray, but it still works with get_counts(), which will collapse all read
            # lengths to a single array
            orf_res[colname] = nnls(orf_matrix, valid_nt_segs.get_counts(gnd))[0]
    else:
        orf_res['nts_quantified'] = 0
        for colname in colnames:
            orf_res[colname] = 0.
    return orf_res
def _get_annotated_counts_by_chrom(chrom_to_do):
    """Accumulate counts from annotated CDSs into a metagene profile. Only the longest CDS in each transcript family
    will be included, and only if it meets the minimum number-of-reads requirement. Reads are normalized by gene, so
    every gene included contributes equally to the final metagene.

    chrom_to_do is the chromosome name used to select ORFs from the 'all_orfs' table in opts.orfstore.

    Returns (startprof, cdsprof, stopprof, num_cds_incl): three arrays with one row per read length in rdlens
    (startlen, 3, and stoplen columns respectively) holding the summed normalized counts, and the number of CDSs that
    contributed to them.

    The BAM files are closed before returning, even if processing raises."""
    found_cds = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r',
                            where="chrom == '%s' and orftype == 'annotated' and tstop > 0 and tcoord > %d and AAlen > %d"
                                  % (chrom_to_do, -startnt[0], min_AAlen),
                            columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen']) \
        .sort_values('AAlen', ascending=False).drop_duplicates('tfam')
    # use the longest annotated CDS in each transcript family
    num_cds_incl = 0  # number of CDSs included from this chromosome
    startprof = np.zeros((len(rdlens), startlen))
    cdsprof = np.zeros((len(rdlens), 3))
    stopprof = np.zeros((len(rdlens), stoplen))

    inbams = []
    try:
        # open inside the try block so that already-opened files are closed if a later one fails to open
        for infile in opts.bamfiles:
            inbams.append(pysam.Samfile(infile, 'rb'))
        gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))

        for (tid, tcoord, tstop) in found_cds[['tid', 'tcoord', 'tstop']].itertuples(False):
            curr_trans = SegmentChain.from_bed(bedlinedict[tid])
            tlen = curr_trans.get_length()
            if tlen < tstop + stopnt[1]:
                continue  # need to guarantee that the 3' UTR is sufficiently long

            curr_hashed_counts = get_hashed_counts(curr_trans, gnd)
            cdslen = tstop+stopnt[1]-tcoord-startnt[0]  # cds length, plus the extra bases...
            curr_counts = np.zeros((len(rdlens), cdslen))
            for (i, rdlen) in enumerate(rdlens):
                for nmis in range(opts.max5mis+1):
                    curr_counts[i, :] += curr_hashed_counts[(rdlen, nmis)][tcoord+startnt[0]:tstop+stopnt[1]]
                    # curr_counts is limited to the CDS plus any extra requested nucleotides on either side

            total_counts = curr_counts.sum()
            # A CDS without reads cannot be normalized: dividing by a zero mean would add NaN to every profile.
            # This only matters when opts.mincdsreads <= 0; otherwise the threshold already excludes it.
            if total_counts > 0 and total_counts >= opts.mincdsreads:
                curr_counts /= curr_counts.mean()
                # normalize by mean of counts across all readlengths and positions within the CDS
                startprof += curr_counts[:, :startlen]
                cdsprof += curr_counts[:, startlen:cdslen-stoplen].reshape((len(rdlens), -1, 3)).mean(1)
                stopprof += curr_counts[:, cdslen-stoplen:cdslen]
                num_cds_incl += 1
    finally:
        for inbam in inbams:
            inbam.close()

    return startprof, cdsprof, stopprof, num_cds_incl
def _get_annotated_counts_by_chrom(chrom_to_do):
    """Accumulate counts from annotated CDSs into a metagene profile. Only the longest CDS in each transcript family
    will be included, and only if it meets the minimum number-of-reads requirement. Reads are normalized by gene, so
    every gene included contributes equally to the final metagene.

    chrom_to_do is the chromosome name used to select ORFs from the 'all_orfs' table in opts.orfstore.

    Returns (startprof, cdsprof, stopprof, num_cds_incl): three arrays with one row per read length in rdlens
    (startlen, 3, and stoplen columns respectively) holding the summed normalized counts, and the number of CDSs that
    contributed to them.

    The BAM files are closed before returning, even if processing raises."""
    found_cds = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r',
                            where="chrom == '%s' and orftype == 'annotated' and tstop > 0 and tcoord > %d and AAlen > %d"
                                  % (chrom_to_do, -startnt[0], min_AAlen),
                            columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen']) \
        .sort_values('AAlen', ascending=False).drop_duplicates('tfam')
    # use the longest annotated CDS in each transcript family
    num_cds_incl = 0  # number of CDSs included from this chromosome
    startprof = np.zeros((len(rdlens), startlen))
    cdsprof = np.zeros((len(rdlens), 3))
    stopprof = np.zeros((len(rdlens), stoplen))

    inbams = []
    try:
        # open inside the try block so that already-opened files are closed if a later one fails to open
        for infile in opts.bamfiles:
            inbams.append(pysam.Samfile(infile, 'rb'))
        gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))

        for (tid, tcoord, tstop) in found_cds[['tid', 'tcoord', 'tstop']].itertuples(False):
            curr_trans = SegmentChain.from_bed(bedlinedict[tid])
            tlen = curr_trans.get_length()
            if tlen < tstop + stopnt[1]:
                continue  # need to guarantee that the 3' UTR is sufficiently long

            curr_hashed_counts = get_hashed_counts(curr_trans, gnd)
            cdslen = tstop+stopnt[1]-tcoord-startnt[0]  # cds length, plus the extra bases...
            curr_counts = np.zeros((len(rdlens), cdslen))
            for (i, rdlen) in enumerate(rdlens):
                for nmis in range(opts.max5mis+1):
                    curr_counts[i, :] += curr_hashed_counts[(rdlen, nmis)][tcoord+startnt[0]:tstop+stopnt[1]]
                    # curr_counts is limited to the CDS plus any extra requested nucleotides on either side

            total_counts = curr_counts.sum()
            # A CDS without reads cannot be normalized: dividing by a zero mean would add NaN to every profile.
            # This only matters when opts.mincdsreads <= 0; otherwise the threshold already excludes it.
            if total_counts > 0 and total_counts >= opts.mincdsreads:
                curr_counts /= curr_counts.mean()
                # normalize by mean of counts across all readlengths and positions within the CDS
                startprof += curr_counts[:, :startlen]
                cdsprof += curr_counts[:, startlen:cdslen-stoplen].reshape((len(rdlens), -1, 3)).mean(1)
                stopprof += curr_counts[:, cdslen-stoplen:cdslen]
                num_cds_incl += 1
    finally:
        for inbam in inbams:
            inbam.close()

    return startprof, cdsprof, stopprof, num_cds_incl
def _quantify_tfam(orf_set, gnds):
    """Performs non-negative least squares regression to quantify all of the ORFs in a transcript family, using a
    simplified profile consisting of the same three numbers tiled across each ORF. All readlengths are treated
    identically. Regions around start and stop codons are masked in accordance with startmask and stopmask

    orf_set is a DataFrame of ORFs; the columns 'strand', 'chrom', 'tid', 'tcoord', 'tstop', and 'AAlen' are read,
    and strand and chrom are taken from the first row.
    gnds is a sequence of genome arrays, paired element-by-element with the module-level colnames.

    Returns a copy of orf_set with an added 'nts_quantified' column (number of nucleotides used to quantify each ORF)
    and one column per entry in colnames holding the NNLS coefficients (all zero if no nucleotide is usable)."""
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    # collect the genomic positions covered by each transcript, and their union across the family
    all_tfam_genpos = set()
    tid_genpos = {}
    for tid in tids:
        curr_pos_set = SegmentChain.from_bed(bedlinedict[tid]).get_position_set()
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]  # stranded order
    nnt = len(all_tfam_genpos)

    # for each transcript, the row indices (into all_tfam_genpos) of the positions it covers
    tid_indices = {
        tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
        for (tid, curr_tid_genpos) in tid_genpos.iteritems()
    }

    orf_matrix = np.zeros((nnt, len(orf_set)))
    ignore_coords = []
    for (orf_num, (tid, tcoord, tstop, AAlen)) in enumerate(orf_set[['tid', 'tcoord', 'tstop', 'AAlen']].itertuples(False)):
        curr_indices = tid_indices[tid]
        orf_matrix[curr_indices[tcoord:tstop], orf_num] = np.tile(cdsprof, AAlen + 1)
        ignore_coords.append(curr_indices[max(tcoord + startmask[0], 0):tcoord + startmask[1]])
        ignore_coords.append(curr_indices[max(tstop + stopmask[0], 0):tstop + stopmask[1]])
    ignore_coords = np.unique(np.concatenate(ignore_coords))
    orf_matrix[ignore_coords, :] = 0  # mask out all positions within the mask region around starts and stops

    # require at least one valid position, and if >1 ORFs are identical, only include one of them
    valid_orfs = np.array([
        (orf_matrix[:, i] > 0).any() and (orf_matrix.T[i, :] != orf_matrix.T[:i, :]).any(1).all()
        for i in xrange(len(orf_set))
    ])
    orf_matrix[:, ~valid_orfs] = 0  # completely ignore these positions
    valid_nts = (orf_matrix > 0).any(1)  # only bother checking nucleotides where there is a valid ORF

    orf_res = orf_set.copy()
    if valid_nts.any():
        orf_matrix = orf_matrix[valid_nts, :]
        valid_nt_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos[valid_nts])))
        orf_res['nts_quantified'] = (orf_matrix > 0).sum(0)  # the number of nucleotides included in the quantification
        for colname, gnd in zip(colnames, gnds):
            # gnd is a HashedReadBAMGenomeArray, but it still works with get_counts(), which will collapse all read
            # lengths to a single array
            orf_res[colname] = nnls(orf_matrix, valid_nt_segs.get_counts(gnd))[0]
    else:
        orf_res['nts_quantified'] = 0
        for colname in colnames:
            orf_res[colname] = 0.
    return orf_res
def _identify_tfam_orfs((tfam, tids)):
    """Identify all of the possible ORFs within a family of transcripts. Relevant information such as genomic start and
    stop positions, amino acid length, and initiation codon will be collected for each ORF. Additionally, each ORF will
    be assigned a unique 'orfname', such that if it occurs on multiple transcripts, it can be recognized as the same ORF.

    The single argument is a (tfam, tids) tuple: the transcript family ID (a key of tfambedlines) and the transcript
    IDs (keys of bedlinedict) belonging to that family.

    Returns a DataFrame with one row per (ORF, transcript) and columns 'tfam', 'tid', 'tcoord', 'tstop', 'chrom',
    'gcoord', 'gstop', 'strand', 'codon', 'AAlen', 'orfname', 'annot_start', 'annot_stop', and 'orftype', or None if
    no ORFs were found on any transcript. ORFs lacking a stop codon (tstop == 0) get orftype 'nonstop'. If the family
    has no usable annotated CDS, all other ORFs keep orftype 'new'; otherwise each is assigned a type by the cascade
    of checks below, in which earlier categories take precedence over later ones."""
    currtfam = SegmentChain.from_bed(tfambedlines[tfam])
    chrom = currtfam.chrom
    strand = currtfam.strand
    tfam_genpos = np.array(currtfam.get_position_list())
    if strand == '-':
        tfam_genpos = tfam_genpos[::-1]  # put the family's genomic positions in stranded order
    tmask = np.empty((len(tids), len(tfam_genpos)), dtype=np.bool)  # True if transcript covers that position, False if not
    tfam_orfs = []
    tidx_lookup = {}  # maps tid to its row in tmask
    for tidx, tid in enumerate(tids):
        tidx_lookup[tid] = tidx
        curr_trans = Transcript.from_bed(bedlinedict[tid])
        tmask[tidx, :] = np.in1d(tfam_genpos, curr_trans.get_position_list(), assume_unique=True)
        trans_orfs = _find_all_orfs(curr_trans.get_sequence(genome).upper())
        if trans_orfs:
            (startpos, stoppos, codons) = zip(*trans_orfs)
            startpos = np.array(startpos, dtype='i4')
            stoppos = np.array(stoppos, dtype='i4')
            gcoords = np.array([curr_trans.get_genomic_coordinate(x)[1] for x in startpos], dtype='i4')
            stop_present = (stoppos > 0)  # a stop position of 0 marks an ORF with no stop codon
            gstops = np.zeros(len(trans_orfs), dtype='i4')
            gstops[stop_present] = \
                np.array([curr_trans.get_genomic_coordinate(x - 1)[1] for x in stoppos[stop_present]]) + (1 if strand == '+' else -1)
            # the decrementing/incrementing stuff preserves half-openness regardless of strand
            AAlens = np.zeros(len(trans_orfs), dtype='i4')
            AAlens[stop_present] = (stoppos[stop_present] - startpos[stop_present])/3 - 1
            tfam_orfs.append(pd.DataFrame.from_items([('tfam', tfam), ('tid', tid), ('tcoord', startpos), ('tstop', stoppos), ('chrom', chrom), ('gcoord', gcoords), ('gstop', gstops), ('strand', strand), ('codon', codons), ('AAlen', AAlens), ('orfname', '')]))
    if any(x is not None for x in tfam_orfs):
        orf_pos_dict = {}  # cache of genomic positions (stranded order) keyed by orfname
        tfam_orfs = pd.concat(tfam_orfs, ignore_index=True)
        for ((gcoord, AAlen), gcoord_grp) in tfam_orfs.groupby(['gcoord', 'AAlen']):  # group by genomic start position and length
            if len(gcoord_grp) == 1:
                tfam_orfs.loc[gcoord_grp.index, 'orfname'] = _name_orf(tfam, gcoord, AAlen)
            else:
                # one row per grouped ORF, holding the indices into tfam_genpos of the positions it covers
                orf_gcoords = np.vstack(np.flatnonzero(tmask[tidx_lookup[tid], :])[tcoord:tstop] for (tid, tcoord, tstop) in gcoord_grp[['tid', 'tcoord', 'tstop']].itertuples(False))
                if (orf_gcoords == orf_gcoords[0, :]).all():  # all of the grouped ORFs are identical, so should receive the same name
                    orfname = _name_orf(tfam, gcoord, AAlen)
                    tfam_orfs.loc[gcoord_grp.index, 'orfname'] = orfname
                    orf_pos_dict[orfname] = tfam_genpos[orf_gcoords[0, :]]
                else:
                    # same start and length but different positions in between: give each distinct variant a
                    # numbered suffix
                    named_so_far = 0
                    unnamed = np.ones(len(gcoord_grp), dtype=np.bool)
                    basename = _name_orf(tfam, gcoord, AAlen)
                    while unnamed.any():
                        next_gcoords = orf_gcoords[unnamed, :][0, :]
                        identicals = (orf_gcoords == next_gcoords).all(1)
                        orfname = '%s_%d' % (basename, named_so_far)
                        tfam_orfs.loc[gcoord_grp.index[identicals], 'orfname'] = orfname
                        orf_pos_dict[orfname] = tfam_genpos[next_gcoords]
                        unnamed[identicals] = False
                        named_so_far += 1

        # Now that the ORFs have been found and named, figure out their orftype
        tfam_orfs['annot_start'] = False
        tfam_orfs['annot_stop'] = False  # start out assuming all are False; replace with True as needed
        tfam_orfs['orftype'] = 'new'
        tfam_orfs['untyped'] = tfam_orfs['tstop'] > 0
        tfam_orfs.loc[~tfam_orfs['untyped'], 'orftype'] = 'nonstop'  # no stop codon
        if tfam in tfams_with_annots:
            cds_info = []
            all_annot_pos = set()
            for (annot_fidx, (annot_tfam_lookup, annot_tid_lookup)) in enumerate(zip(annot_tfam_lookups, annot_tid_lookups)):
                if tfam in annot_tfam_lookup:
                    for (annot_tidx, annot_tid) in enumerate(annot_tfam_lookup[tfam]):
                        curr_trans = Transcript.from_bed(annot_tid_lookup[annot_tid])
                        if curr_trans.cds_start is not None and curr_trans.cds_end is not None:
                            curr_cds_pos_set = curr_trans.get_cds().get_position_set()
                            curr_len = len(curr_cds_pos_set)
                            if curr_len % 3 == 0:
                                curr_gcoord = curr_trans.get_genomic_coordinate(curr_trans.cds_start)[1]
                                curr_gstop = curr_trans.get_genomic_coordinate(curr_trans.cds_end - 1)[1] + (1 if strand == '+' else -1)
                                in_tfam = curr_cds_pos_set.issubset(tfam_genpos)
                                cds_info.append((curr_gcoord, curr_gstop, (curr_len-3)/3, in_tfam, annot_fidx, annot_tid, curr_cds_pos_set))
                                # NOTE(review): nesting reconstructed from whitespace-collapsed source; this update is
                                # assumed to apply only to CDSs whose length is a multiple of 3 - confirm
                                all_annot_pos.update(curr_cds_pos_set)
            if cds_info:  # False means no annotated CDSs or none are multiples of 3 in length
                cds_info = pd.DataFrame(cds_info, columns=['gcoord', 'gstop', 'AAlen', 'in_tfam', 'annot_fidx', 'annot_tid', 'pos']) \
                    .groupby(['gcoord', 'gstop', 'AAlen', 'in_tfam'], as_index=False) \
                    .apply(lambda x: x if len(x) == 1 else x[[not any(pos == x['pos'].iat[j] for j in xrange(i)) for (i, pos) in enumerate(x['pos'])]]) \
                    .set_index(['annot_fidx', 'annot_tid'])
                # this operation organizes cds_info into a dataframe and effectively drops duplicates
                # pandas drop_duplicates() is incompatible with sets so have to do it this manual way
                # the combination of annot_fidx (the number of the file if more than one annotation file provided) and annot_tid should be a unique ID
                tfam_orfs['annot_start'] = tfam_orfs['gcoord'].isin(cds_info['gcoord'])
                tfam_orfs['annot_stop'] = tfam_orfs['gstop'].isin(cds_info['gstop'])

                def _get_orf_pos(orfname, tid=None, tcoord=None, tstop=None):
                    """Helper function that identifies the genomic coordinates of an ORF (in stranded order) and caches them by orfname"""
                    if orfname in orf_pos_dict:
                        return orf_pos_dict[orfname]
                    else:
                        if tid is None or tcoord is None or tstop is None:
                            # look up the first transcript carrying this ORF
                            (tid, tcoord, tstop) = tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['tid', 'tcoord', 'tstop']].iloc[0]
                        res = tfam_genpos[np.flatnonzero(tmask[tidx_lookup[tid], :])[tcoord:tstop]]
                        orf_pos_dict[orfname] = res
                        return res

                # ANNOTATED and XISO
                cds_info['found'] = False
                possible_annot = tfam_orfs.drop_duplicates('orfname').merge(cds_info[cds_info['in_tfam']].reset_index())
                # merges on gcoord, gstop, and len - need to reset_index to preserve annot_fidx and annot_tid
                for ((orfname, tid, tcoord, tstop), cds_grp) in possible_annot.groupby(['orfname', 'tid', 'tcoord', 'tstop']):
                    orf_pos = _get_orf_pos(orfname, tid, tcoord, tstop)
                    for (annot_fidx, annot_tid, cds_pos_set) in cds_grp[['annot_fidx', 'annot_tid', 'pos']].itertuples(False):
                        if cds_pos_set.issubset(orf_pos):
                            tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['annotated', False]
                            cds_info.loc[(annot_fidx, annot_tid), 'found'] = True
                            break
                    else:
                        # for/else: reached only if no CDS in the group matched (the loop never hit `break`)
                        tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['Xiso', False]
                        # matching start and stop but differing in between
                if tfam_orfs['untyped'].any():
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(tfam_orfs[tfam_orfs['untyped']].merge(cds_info[['gcoord', 'gstop']])['orfname']), ['orftype', 'untyped']] = ['Xiso', False]
                    # matching start and stop, but must differ somewhere, otherwise would have been identified as annotated (Xiso => "exact isoform")

                # SISO
                tfam_orfs.loc[tfam_orfs['annot_start'] & tfam_orfs['annot_stop'] & tfam_orfs['untyped'], ['orftype', 'untyped']] = ['Siso', False]
                # start and stop each match at least one CDS, but not the same one (Siso => "spliced isoform")

                # CISO
                tfam_orfs.loc[tfam_orfs['annot_start'] & tfam_orfs['untyped'], ['orftype', 'untyped']] = ['Ciso', False]
                # start is annotated, but stop is not - so must be on a new transcript (Ciso => "C-terminal isoform")

                # TRUNCATION
                if tfam_orfs['untyped'].any():
                    found_matched_stop = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on=['tid', 'tstop'], suffixes=('', '_annot'))
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(found_matched_stop.loc[found_matched_stop['tcoord'] > found_matched_stop['tcoord_annot'], 'orfname']), ['orftype', 'untyped']] = ['truncation', False]
                    # on the same transcript with an annotated CDS, with matching stop codon, initiating downstream - must be a truncation
                # still some missing truncations, if the original CDS was not on a transcript in the present transcriptome
                if tfam_orfs['untyped'].any() and not cds_info['found'].all():
                    possible_truncs = tfam_orfs[tfam_orfs['untyped']].drop_duplicates('orfname') \
                        .merge(cds_info.loc[~cds_info['found'], ['gstop', 'pos', 'AAlen']], on='gstop', suffixes=('', '_annot'))
                    possible_truncs = possible_truncs[possible_truncs['AAlen'] < possible_truncs['AAlen_annot']]
                    for ((orfname, tid, tcoord, tstop, gcoord), cds_pos_sets) in \
                            possible_truncs.groupby(['orfname', 'tid', 'tcoord', 'tstop', 'gcoord'])['pos']:
                        orf_pos = _get_orf_pos(orfname, tid, tcoord, tstop)
                        if strand == '-':
                            if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if pos <= gcoord) for cds_pos_set in cds_pos_sets):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['truncation', False]
                        else:
                            if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if pos >= gcoord) for cds_pos_set in cds_pos_sets):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['truncation', False]
                        # matching stop codon, contained within, and all positions in the annotation past the orf start codon are included in the orf

                # EXTENSION
                if tfam_orfs['untyped'].any():
                    found_matched_stop = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on=['tid', 'tstop'], suffixes=('', '_annot'))
                    assert (found_matched_stop['tcoord'] < found_matched_stop['tcoord_annot']).all()  # other possibilities should be done by now
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(found_matched_stop['orfname']), ['orftype', 'untyped']] = ['extension', False]
                    # on the same transcript with an annotated CDS, with matching stop codon, initiating upstream - must be an extension
                    # no possibility for an "unfound" extension - if the extension is in the transcriptome, the CDS it comes from must be as well
                    # (except for a few edge cases e.g. annotated CDS is a CUG initiator, but not considering CUG ORFs)

                # NISO
                tfam_orfs.loc[tfam_orfs['annot_stop'] & (tfam_orfs['untyped']), ['orftype', 'untyped']] = ['Niso', False]
                # stop is annotated, but start is not, and it's not a truncation or extension - so must be an isoform (Niso => "N-terminal isoform")

                # NCISO
                if tfam_orfs['untyped'].any():
                    orf_codons = []
                    for (orfname, tid, tcoord, tstop) in \
                            tfam_orfs.loc[tfam_orfs['untyped'], ['orfname', 'tid', 'tcoord', 'tstop']].drop_duplicates('orfname').itertuples(False):
                        # one row per codon: the three genomic positions it covers, tagged with the ORF's name
                        orf_codons.append(pd.DataFrame(_get_orf_pos(orfname, tid, tcoord, tstop).reshape((-1, 3))))
                        orf_codons[-1]['orfname'] = orfname
                    orf_codons = pd.concat(orf_codons, ignore_index=True)
                    if strand == '-':
                        annot_codons = pd.DataFrame(np.vstack([np.reshape(sorted(cds_pos_set, reverse=True), (-1, 3)) for cds_pos_set in cds_info['pos'] if len(cds_pos_set) % 3 == 0])).drop_duplicates()
                    else:
                        annot_codons = pd.DataFrame(np.vstack([np.reshape(sorted(cds_pos_set, reverse=False), (-1, 3)) for cds_pos_set in cds_info['pos'] if len(cds_pos_set) % 3 == 0])).drop_duplicates()
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(orf_codons.merge(annot_codons)['orfname']), ['orftype', 'untyped']] = ['NCiso', False]
                    # ORFs that have at least one full codon overlapping (in-frame) with a CDS are isoforms (NCiso => "N- and C-terminal isoform")
                    # Note that these must already differ at N- and C- termini, otherwise they would already have been classified

                # INTERNAL
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                    sametrans_internal = (sametrans['tcoord'] > sametrans['tcoord_annot']) & (sametrans['tstop'] < sametrans['tstop_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_internal, 'orfname']), ['orftype', 'untyped']] = ['internal', False]
                    # ORFs completely contained within a CDS on the same transcript, and not containing any full codon overlaps, must be internal
                # Still could be other ORFs internal to a CDS on a transcript not in the current transcriptome - need to check manually
                if tfam_orfs['untyped'].any() and not cds_info['found'].all():
                    for (orfname, gcoord, gstop) in \
                            tfam_orfs.loc[tfam_orfs['untyped'], ['orfname', 'gcoord', 'gstop']].drop_duplicates('orfname').itertuples(False):
                        orf_pos = _get_orf_pos(orfname)  # should be cached by now
                        if strand == '-':
                            if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if gcoord >= pos > gstop) for cds_pos_set in cds_info.loc[(~cds_info['found']) & (cds_info['gcoord'] > gcoord) & (cds_info['gstop'] < gstop), 'pos']):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['internal', False]
                        else:
                            if any(cds_pos_set.issuperset(orf_pos) and all(pos in orf_pos for pos in cds_pos_set if gcoord <= pos < gstop) for cds_pos_set in cds_info.loc[(~cds_info['found']) & (cds_info['gcoord'] < gcoord) & (cds_info['gstop'] > gstop), 'pos']):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['internal', False]

                # STOP_OVERLAP
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                    sametrans_stopover = (sametrans['tcoord'] > sametrans['tcoord_annot']) & (sametrans['tcoord'] < sametrans['tstop_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_stopover, 'orfname']), ['orftype', 'untyped']] = ['stop_overlap', False]
                    # starts within a CDS and not an internal - must be a stop_overlap
                    # do not need to check for unfounds - requiring that stop_overlap must be on same transcript as cds

                # START_OVERLAP
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                    sametrans_startover = (sametrans['tstop'] > sametrans['tcoord_annot']) & (sametrans['tstop'] < sametrans['tstop_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_startover, 'orfname']), ['orftype', 'untyped']] = ['start_overlap', False]
                    # ends within a CDS and not an internal - must be a start_overlap
                    # do not need to check for unfounds - requiring that start_overlap must be on same transcript as cds

                # LOOF
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                    sametrans_loof = (sametrans['tcoord'] < sametrans['tcoord_annot']) & (sametrans['tstop'] > sametrans['tstop_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_loof, 'orfname']), ['orftype', 'untyped']] = ['LOOF', False]
                    # starts upstream of a CDS and ends downstream of it - must be a LOOF (long out-of-frame)
                    # don't need to check for unfounds because the CDS must be on the same transcript as the ORF if the ORF completely contains it

                # UPSTREAM
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                    sametrans_upstream = (sametrans['tstop'] <= sametrans['tcoord_annot'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_upstream, 'orfname']), ['orftype', 'untyped']] = ['upstream', False]
                    # ends upstream of a CDS - must be an upstream (uORF)
                    # cannot check manually for unfounds because those are not on well-defined transcripts

                # DOWNSTREAM
                if tfam_orfs['untyped'].any():
                    sametrans = tfam_orfs[tfam_orfs['untyped']].merge(tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                    sametrans_downstream = (sametrans['tstop_annot'] <= sametrans['tcoord'])
                    tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_downstream, 'orfname']), ['orftype', 'untyped']] = ['downstream', False]
                    # starts downstream of a CDS - must be a downstream ORF
                    # cannot check manually for unfounds because those are not on well-defined transcripts

                # NEW_ISO and GISO
                for orfname in tfam_orfs.loc[tfam_orfs['untyped'], 'orfname'].drop_duplicates():
                    if all_annot_pos.isdisjoint(_get_orf_pos(orfname)):
                        tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['new_iso', False]
                        # no overlaps whatsoever with any annotated CDS, but in a tfam that has annotations: new_iso
                    else:
                        tfam_orfs.loc[tfam_orfs['orfname'] == orfname, ['orftype', 'untyped']] = ['Giso', False]
                        # overlaps out-of-frame with a CDS, and not on the same transcript with a CDS: Giso => "genomic isoform"
                assert not tfam_orfs['untyped'].any()  # every ORF with a stop codon must have received a type by now

        return tfam_orfs.drop('untyped', axis=1)  # 'untyped' is only bookkeeping for the cascade above
    else:
        return None
def _get_tid_info(tup):
    """For each transcript on this chromosome/strand, identifies every sub-sequence of the appropriate length (fpsize),
    converts it to an integer, identifies the number of reads mapping to that position, and outputs all of that
    information to a pandas HDF store.

    tup is a (chrom, strand) pair, which is also the key used to look up the transcripts in bedlinedict.

    Returns a DataFrame indexed by 'tid' with columns 'chrom', 'strand', 'n_psite', 'n_reads', 'peak_reads', and
    'dropped'. 'n_reads' and 'peak_reads' remain -1 for transcripts shorter than fpsize; 'dropped' is 'lowreads' or
    'peakfrac' for transcripts that were counted but not saved, and '' otherwise.

    The BAM files are closed before returning, even if processing raises."""
    (chrom, strand) = tup
    bedlines = bedlinedict[(chrom, strand)]
    tid_seq_info = []
    tid_summary = pd.DataFrame(
        {
            'chrom': chrom,
            'strand': strand,
            'n_psite': -1,
            'n_reads': -1,
            'peak_reads': -1,
            'dropped': ''
        },
        index=pd.Index(bedlines.keys(), name='tid'))

    inbams = []
    try:
        # open inside the try block so that already-opened files are closed if a later one fails to open
        for infile in opts.bamfiles:
            inbams.append(pysam.Samfile(infile, 'rb'))
        gnd = BAMGenomeArray(inbams, mapping=FivePrimeMapFactory(psite))
        # map to roughly the center of each read so that identical sequences that cross different splice sites
        # (on different transcripts) still end up mapping to the same place
        gnd.add_filter('size', SizeFilterFactory(opts.minlen, opts.maxlen))

        for (tid, line) in bedlines.iteritems():
            currtrans = SegmentChain.from_bed(line)
            curr_pos_list = currtrans.get_position_list()  # not in stranded order!
            if strand == '-':
                curr_pos_list = curr_pos_list[::-1]
            n_psite = len(curr_pos_list) + 1 - fpsize
            tid_summary.at[tid, 'n_psite'] = n_psite
            if n_psite <= 0:
                continue  # transcript is shorter than fpsize: nothing to count

            curr_counts = np.array(currtrans.get_counts(gnd))[psite:n_psite + psite]
            sumcounts = curr_counts.sum()
            maxcounts = curr_counts.max()
            tid_summary.at[tid, 'n_reads'] = sumcounts
            tid_summary.at[tid, 'peak_reads'] = maxcounts
            if sumcounts >= opts.minreads:
                if maxcounts < sumcounts * opts.peakfrac:
                    # translate() already yields the sequence as a string of digits (str_dict is defined elsewhere)
                    curr_seq = currtrans.get_sequence(genome).upper().translate(str_dict)
                    kmers = (curr_seq[i:i + fpsize] for i in xrange(n_psite))
                    tid_seq_info.append(
                        pd.DataFrame({
                            'tid': tid,
                            'genpos': curr_pos_list[psite:n_psite + psite],
                            # each k-mer is read as a base-4 integer; k-mers containing 'N' are flagged with -1
                            'seq': np.array([(int(kmer, 4) if 'N' not in kmer else -1) for kmer in kmers],
                                            dtype=np.int64),
                            'reads': curr_counts
                        }))
                else:
                    tid_summary.at[tid, 'dropped'] = 'peakfrac'
            else:
                tid_summary.at[tid, 'dropped'] = 'lowreads'

        if tid_seq_info:  # don't bother saving anything if there's nothing to save
            pd.concat(tid_seq_info, ignore_index=True).to_hdf(seq_info_hdf % (chrom, strand),
                                                              'tid_seq_info',
                                                              format='t',
                                                              data_columns=True,
                                                              complevel=1,
                                                              complib='blosc')

        if opts.verbose > 1:
            with log_lock:
                logprint('%s (%s strand) complete' % (chrom, strand))
    finally:
        for inbam in inbams:
            inbam.close()

    return tid_summary
def _regress_tfam(orf_set, gnd):
    """Performs non-negative least squares regression on all of the ORFs in a transcript family, using profiles
    constructed via _orf_profile(). Also calculates Wald statistics for each ORF and start codon, and for each
    stop codon if opts.startonly is False.

    NOTE: a second definition of _regress_tfam appears later in this file and replaces this one when the
    module is loaded.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        ORFs belonging to one transcript family. The columns read here are 'tfam', 'chrom', 'strand', 'tid',
        'tcoord', 'tstop', 'gcoord', 'gstop', 'orfname' and 'codon'.
    gnd
        Read-count source, passed unchanged to get_hashed_counts().

    Returns
    -------
    The module-level ``failure_return`` when nothing can be fit (no ORF passes the start-count filter, no ORF
    overlaps any reads, every fitted strength is ~0, or no ORF qualifies for start-strength aggregation).
    Otherwise ``(orf_strength_df, start_strength_df)`` if opts.startonly is set, else
    ``(orf_strength_df, start_strength_df, stop_strength_df)``.

    Raises
    ------
    AssertionError
        If an ORF profile and its index vector have different lengths.
    """
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for tid in tids:
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    # for each transcript, the indices (into the stranded family coordinate array) of the positions it covers
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.items()}

    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)
    # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()  # read length i occupies counts[nnt*i:nnt*(i+1)]

    if opts.startcount:
        # Only include ORFs for which there is at least some minimum reads within one nucleotide of the start codon
        rd_offsets = nnt*np.arange(len(rdlens))  # offset of each read length's stretch within the raveled counts

        def _reads_near_start(tid, tcoord):
            # Clamp the window at the first nucleotide of the transcript: for tcoord == 0 the slice
            # [tcoord-1:tcoord+2] starts at -1 and comes back empty, which used to raise a broadcasting
            # error. The offsets are built per window so that any window length is handled.
            start_idxes = tid_indices[tid][max(tcoord-1, 0):tcoord+2]
            return counts[np.add.outer(start_idxes, rd_offsets).ravel()].sum()

        orf_set = orf_set[[_reads_near_start(tid, tcoord) >= opts.startcount
                           for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)

    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']
    # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)

    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']
        # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)

    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord
                # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen
                # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(
                _orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        # need to tile the indices for each read length
        indices.append(np.concatenate([nnt*i+curr_indices for i in range(len(rdlens))]))
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')

    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))

    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]
    # don't bother fitting ORFs with zero reads throughout their entire length

    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    # explicit copy: new columns are assigned below, and assigning into a slice triggers SettingWithCopyWarning
    orf_strength_df = orf_strength_df[usable_orfs].copy()
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))
    # homoscedastic version (assume equal variance at all positions)

    # resids = counts-orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])

    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        if not include_starts.any():
            return failure_return  # no need to keep going if there weren't any useful starts
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        if not elongating_orfs.any():
            return failure_return
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]

    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    # Wald statistic per start codon: s' * inv(cov) * s over the ORFs sharing that start
    start_strength_df['W_start'] = pd.Series(
        {gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                         .dot(orf_strs_starts[rownums])
         for (gcoord, rownums) in gcoord_grps.indices.items()})

    if not opts.startonly:
        # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series(
            {gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                           .dot(orf_strs_stops[rownums])
             for (gstop, rownums) in gstop_grps.indices.items()})

        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop:orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums,rownums)]))
        #                                                  .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df
def _regress_tfam(orf_set, gnd):
    """Performs non-negative least squares regression on all of the ORFs in a transcript family, using profiles
    constructed via _orf_profile(). Also calculates Wald statistics for each ORF and start codon, and for each
    stop codon if opts.startonly is False.

    Parameters
    ----------
    orf_set : pandas.DataFrame
        ORFs belonging to one transcript family. The columns read here are 'tfam', 'chrom', 'strand', 'tid',
        'tcoord', 'tstop', 'gcoord', 'gstop', 'orfname' and 'codon'.
    gnd
        Read-count source, passed unchanged to get_hashed_counts().

    Returns
    -------
    The module-level ``failure_return`` when nothing can be fit (no ORF passes the start-count filter, no ORF
    overlaps any reads, every fitted strength is ~0, or no ORF qualifies for start-strength aggregation).
    Otherwise ``(orf_strength_df, start_strength_df)`` if opts.startonly is set, else
    ``(orf_strength_df, start_strength_df, stop_strength_df)``.

    Raises
    ------
    AssertionError
        If an ORF profile and its index vector have different lengths.
    """
    tfam = orf_set['tfam'].iat[0]
    strand = orf_set['strand'].iat[0]
    chrom = orf_set['chrom'].iat[0]
    tids = orf_set['tid'].drop_duplicates().tolist()

    all_tfam_genpos = set()
    tid_genpos = {}
    tlens = {}
    for tid in tids:
        currtrans = SegmentChain.from_bed(bedlinedict[tid])
        curr_pos_set = currtrans.get_position_set()
        tlens[tid] = len(curr_pos_set)
        tid_genpos[tid] = curr_pos_set
        all_tfam_genpos.update(curr_pos_set)
    tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
    all_tfam_genpos = np.array(sorted(all_tfam_genpos))
    if strand == '-':
        all_tfam_genpos = all_tfam_genpos[::-1]
    nnt = len(all_tfam_genpos)
    # for each transcript, the indices (into the stranded family coordinate array) of the positions it covers
    tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
                   for (tid, curr_tid_genpos) in tid_genpos.items()}

    hashed_counts = get_hashed_counts(tfam_segs, gnd)
    counts = np.zeros((len(rdlens), nnt), dtype=np.float64)
    # even though they are integer-valued, will need to do float arithmetic
    for (i, rdlen) in enumerate(rdlens):
        for nmis in range(1+opts.max5mis):
            counts[i, :] += hashed_counts[(rdlen, nmis)]
    counts = counts.ravel()  # read length i occupies counts[nnt*i:nnt*(i+1)]

    if opts.startcount:
        # Only include ORFs for which there is at least some minimum reads within one nucleotide of the start codon
        rd_offsets = nnt*np.arange(len(rdlens))  # offset of each read length's stretch within the raveled counts

        def _reads_near_start(tid, tcoord):
            # Clamp the window at the first nucleotide of the transcript: for tcoord == 0 the slice
            # [tcoord-1:tcoord+2] starts at -1 and comes back empty, which used to raise a broadcasting
            # error. The offsets are built per window so that any window length is handled.
            start_idxes = tid_indices[tid][max(tcoord-1, 0):tcoord+2]
            return counts[np.add.outer(start_idxes, rd_offsets).ravel()].sum()

        orf_set = orf_set[[_reads_near_start(tid, tcoord) >= opts.startcount
                           for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]]
        if orf_set.empty:
            return failure_return

    orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)

    abort_set = orf_set.drop_duplicates('gcoord').copy()
    abort_set['gstop'] = abort_set['gcoord']
    # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
    abort_set['tstop'] = abort_set['tcoord']+3  # stop after the first codon
    abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
    orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)

    if not opts.startonly:  # if marking full ORFs, include histop model
        stop_set = orf_set.drop_duplicates('gstop').copy()
        stop_set['gcoord'] = stop_set['gstop']  # this is an easy flag
        stop_set['tcoord'] = stop_set['tstop']
        # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
        stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
        orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)

    orf_profs = []
    indices = []
    for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
        if tcoord != tstop:  # not a histop
            tlen = tlens[tid]
            if tcoord+startnt[0] < 0:
                startadj = -startnt[0]-tcoord
                # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
            else:
                startadj = 0
            if tstop+stopnt[1] > tlen:
                stopadj = tstop+stopnt[1]-tlen
                # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
            else:
                stopadj = 0
            curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
            orf_profs.append(
                _orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
        else:  # histop
            curr_indices = tid_indices[tid][tstop-6:tstop]
            orf_profs.append(stopprof[:, -6:].ravel())
        # need to tile the indices for each read length
        indices.append(np.concatenate([nnt*i+curr_indices for i in range(len(rdlens))]))
        if len(indices[-1]) != len(orf_profs[-1]):
            raise AssertionError('ORF length does not match index length')

    # better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
    orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
                                          np.concatenate(indices),
                                          np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
                                         shape=(nnt*len(rdlens), len(orf_strength_df)))

    nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
    if len(nonzero_orfs) == 0:  # no possibility of anything coming up
        return failure_return
    orf_matrix = orf_matrix[:, nonzero_orfs]
    orf_strength_df = orf_strength_df.iloc[nonzero_orfs]
    # don't bother fitting ORFs with zero reads throughout their entire length

    (orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
    min_str = 1e-6  # allow for machine rounding error
    usable_orfs = orf_strs > min_str
    if not usable_orfs.any():
        return failure_return
    # explicit copy: new columns are assigned below, and assigning into a slice triggers SettingWithCopyWarning
    orf_strength_df = orf_strength_df[usable_orfs].copy()
    orf_matrix = orf_matrix[:, usable_orfs]  # remove entries for zero-strength ORFs or transcripts
    orf_strs = orf_strs[usable_orfs]
    orf_strength_df['orf_strength'] = orf_strs

    covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))
    # homoscedastic version (assume equal variance at all positions)

    # resids = counts-orf_matrix.dot(orf_strs)
    # simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
    # covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
    #                            .dot(orf_matrix).dot(simple_covmat))
    # # heteroscedastic version (Eicker-Huber-White robust estimator)

    orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
    orf_strength_df.set_index('orfname', inplace=True)
    elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])

    if opts.startonly:  # count abortive initiation events towards start strength in this case
        include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
        if not include_starts.any():
            return failure_return  # no need to keep going if there weren't any useful starts
        gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
        # even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
        covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
        orf_strs_starts = orf_strs[include_starts.values]
    else:
        if not elongating_orfs.any():
            return failure_return
        gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
        covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        orf_strs_starts = orf_strs[elongating_orfs.values]

    start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                 ('chrom', orf_set['chrom'].iloc[0]),
                                                 ('strand', orf_set['strand'].iloc[0]),
                                                 ('codon', gcoord_grps['codon'].first()),
                                                 ('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
    # Wald statistic per start codon: s' * inv(cov) * s over the ORFs sharing that start
    start_strength_df['W_start'] = pd.Series(
        {gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
                                         .dot(orf_strs_starts[rownums])
         for (gcoord, rownums) in gcoord_grps.indices.items()})

    if not opts.startonly:
        # count histop towards the stop codon - but still exclude abinit
        include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
        gstop_grps = orf_strength_df[include_stops].groupby('gstop')
        covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
        orf_strs_stops = orf_strs[include_stops.values]
        stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
                                                    ('chrom', orf_set['chrom'].iloc[0]),
                                                    ('strand', orf_set['strand'].iloc[0]),
                                                    ('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
        stop_strength_df['W_stop'] = pd.Series(
            {gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
                                           .dot(orf_strs_stops[rownums])
             for (gstop, rownums) in gstop_grps.indices.items()})

        # # nohistop
        # gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
        # covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
        # orf_strs_stops = orf_strs[elongating_orfs.values]
        # stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
        # stop_strength_df['W_stop_nohistop'] = pd.Series({gstop:orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums,rownums)]))
        #                                                  .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})

        return orf_strength_df, start_strength_df, stop_strength_df
    else:
        return orf_strength_df, start_strength_df
def _identify_tfam_orfs(tup):
    """Identify all of the possible ORFs within a family of transcripts. Relevant information such as genomic
    start and stop positions, amino acid length, and initiation codon will be collected for each ORF.
    Additionally, each ORF will be assigned a unique 'orfname', such that if it occurs on multiple transcripts,
    it can be recognized as the same ORF.

    Parameters
    ----------
    tup : tuple
        ``(tfam, tids)``: the transcript family name (a key of the module-level ``tfambedlines``) and the
        transcript IDs in that family (keys of ``bedlinedict``).

    Returns
    -------
    pandas.DataFrame or None
        One row per (ORF, transcript) occurrence with columns 'tfam', 'tid', 'tcoord', 'tstop', 'chrom',
        'gcoord', 'gstop', 'strand', 'codon', 'AAlen', 'orfname', 'annot_start', 'annot_stop' and 'orftype';
        None if no ORF was found on any transcript of the family.
    """
    (tfam, tids) = tup
    currtfam = SegmentChain.from_bed(tfambedlines[tfam])
    chrom = currtfam.chrom
    strand = currtfam.strand
    tfam_genpos = np.array(currtfam.get_position_list())
    if strand == '-':
        tfam_genpos = tfam_genpos[::-1]
    # builtin bool instead of the np.bool alias, which has been removed from current NumPy
    tmask = np.empty((len(tids), len(tfam_genpos)), dtype=bool)
    # True if transcript covers that position, False if not
    tfam_orfs = []
    tidx_lookup = {}
    for tidx, tid in enumerate(tids):
        tidx_lookup[tid] = tidx
        curr_trans = Transcript.from_bed(bedlinedict[tid])
        tmask[tidx, :] = np.in1d(tfam_genpos, curr_trans.get_position_list(), assume_unique=True)
        trans_orfs = _find_all_orfs(curr_trans.get_sequence(genome).upper())
        if trans_orfs:
            (startpos, stoppos, codons) = zip(*trans_orfs)
            startpos = np.array(startpos, dtype='i4')
            stoppos = np.array(stoppos, dtype='i4')
            gcoords = np.array([curr_trans.get_genomic_coordinate(x)[1] for x in startpos], dtype='i4')
            stop_present = (stoppos > 0)
            gstops = np.zeros(len(trans_orfs), dtype='i4')
            gstops[stop_present] = \
                np.array([curr_trans.get_genomic_coordinate(x - 1)[1] for x in stoppos[stop_present]]) \
                + (1 if strand == '+' else -1)
            # the decrementing/incrementing stuff preserves half-openness regardless of strand
            AAlens = np.zeros(len(trans_orfs), dtype='i4')
            # floor division keeps the codon count an integer whether or not true division is in effect
            AAlens[stop_present] = (stoppos[stop_present] - startpos[stop_present]) // 3 - 1
            tfam_orfs.append(pd.DataFrame.from_items([('tfam', tfam),
                                                      ('tid', tid),
                                                      ('tcoord', startpos),
                                                      ('tstop', stoppos),
                                                      ('chrom', chrom),
                                                      ('gcoord', gcoords),
                                                      ('gstop', gstops),
                                                      ('strand', strand),
                                                      ('codon', codons),
                                                      ('AAlen', AAlens),
                                                      ('orfname', '')]))

    if not tfam_orfs:  # no ORFs on any transcript in this family
        return None

    orf_pos_dict = {}  # cache: orfname -> genomic positions of the ORF, in stranded order
    tfam_orfs = pd.concat(tfam_orfs, ignore_index=True)
    for ((gcoord, AAlen), gcoord_grp) in tfam_orfs.groupby(['gcoord', 'AAlen']):
        # group by genomic start position and length
        if len(gcoord_grp) == 1:
            tfam_orfs.loc[gcoord_grp.index, 'orfname'] = _name_orf(tfam, gcoord, AAlen)
        else:
            # a list (not a generator) is passed because np.vstack requires a sequence of arrays
            orf_gcoords = np.vstack([np.flatnonzero(tmask[tidx_lookup[tid], :])[tcoord:tstop]
                                     for (tid, tcoord, tstop)
                                     in gcoord_grp[['tid', 'tcoord', 'tstop']].itertuples(False)])
            if (orf_gcoords == orf_gcoords[0, :]).all():
                # all of the grouped ORFs are identical, so should receive the same name
                orfname = _name_orf(tfam, gcoord, AAlen)
                tfam_orfs.loc[gcoord_grp.index, 'orfname'] = orfname
                orf_pos_dict[orfname] = tfam_genpos[orf_gcoords[0, :]]
            else:
                # same start and length but different exon structure: number the distinct variants
                named_so_far = 0
                unnamed = np.ones(len(gcoord_grp), dtype=bool)
                basename = _name_orf(tfam, gcoord, AAlen)
                while unnamed.any():
                    next_gcoords = orf_gcoords[unnamed, :][0, :]
                    identicals = (orf_gcoords == next_gcoords).all(1)
                    orfname = '%s_%d' % (basename, named_so_far)
                    tfam_orfs.loc[gcoord_grp.index[identicals], 'orfname'] = orfname
                    orf_pos_dict[orfname] = tfam_genpos[next_gcoords]
                    unnamed[identicals] = False
                    named_so_far += 1

    # Now that the ORFs have been found and named, figure out their orftype
    tfam_orfs['annot_start'] = False
    tfam_orfs['annot_stop'] = False  # start out assuming all are False; replace with True as needed
    tfam_orfs['orftype'] = 'new'
    tfam_orfs['untyped'] = tfam_orfs['tstop'] > 0
    tfam_orfs.loc[~tfam_orfs['untyped'], 'orftype'] = 'nonstop'  # no stop codon

    if tfam in tfams_with_annots:
        cds_info = []
        all_annot_pos = set()
        for (annot_fidx, (annot_tfam_lookup, annot_tid_lookup)) in enumerate(zip(annot_tfam_lookups,
                                                                                 annot_tid_lookups)):
            if tfam in annot_tfam_lookup:
                for (annot_tidx, annot_tid) in enumerate(annot_tfam_lookup[tfam]):
                    curr_trans = Transcript.from_bed(annot_tid_lookup[annot_tid])
                    if curr_trans.cds_start is not None and curr_trans.cds_end is not None:
                        curr_cds_pos_set = curr_trans.get_cds().get_position_set()
                        curr_len = len(curr_cds_pos_set)
                        if curr_len % 3 == 0:
                            curr_gcoord = curr_trans.get_genomic_coordinate(curr_trans.cds_start)[1]
                            curr_gstop = curr_trans.get_genomic_coordinate(curr_trans.cds_end - 1)[1] \
                                + (1 if strand == '+' else -1)
                            in_tfam = curr_cds_pos_set.issubset(tfam_genpos)
                            cds_info.append((curr_gcoord, curr_gstop, (curr_len - 3) // 3, in_tfam,
                                             annot_fidx, annot_tid, curr_cds_pos_set))
                            # NOTE(review): the original indentation of the next statement was lost; it is
                            # assumed to sit alongside cds_info.append (only CDSs whose length is a multiple
                            # of 3 contribute) - TODO confirm against the upstream source.
                            all_annot_pos.update(curr_cds_pos_set)

        if cds_info:  # False means no annotated CDSs or none are multiples of 3 in length
            cds_info = pd.DataFrame(cds_info, columns=['gcoord', 'gstop', 'AAlen', 'in_tfam',
                                                       'annot_fidx', 'annot_tid', 'pos']) \
                .groupby(['gcoord', 'gstop', 'AAlen', 'in_tfam'], as_index=False) \
                .apply(lambda x: x if len(x) == 1 else
                       x[[not any(pos == x['pos'].iat[j] for j in range(i))
                          for (i, pos) in enumerate(x['pos'])]]) \
                .set_index(['annot_fidx', 'annot_tid'])
            # this operation organizes cds_info into a dataframe and effectively drops duplicates
            # pandas drop_duplicates() is incompatible with sets so have to do it this manual way
            # the combination of annot_fidx (the number of the file if more than one annotation file provided)
            # and annot_tid should be a unique ID
            tfam_orfs['annot_start'] = tfam_orfs['gcoord'].isin(cds_info['gcoord'])
            tfam_orfs['annot_stop'] = tfam_orfs['gstop'].isin(cds_info['gstop'])

            def _get_orf_pos(orfname, tid=None, tcoord=None, tstop=None):
                """Helper function that identifies the genomic coordinates of an ORF (in stranded order)
                and caches them by orfname"""
                if orfname in orf_pos_dict:
                    return orf_pos_dict[orfname]
                else:
                    if tid is None or tcoord is None or tstop is None:
                        (tid, tcoord, tstop) = tfam_orfs.loc[tfam_orfs['orfname'] == orfname,
                                                             ['tid', 'tcoord', 'tstop']].iloc[0]
                    res = tfam_genpos[np.flatnonzero(tmask[tidx_lookup[tid], :])[tcoord:tstop]]
                    orf_pos_dict[orfname] = res
                    return res

            # ANNOTATED and XISO
            cds_info['found'] = False
            possible_annot = tfam_orfs.drop_duplicates('orfname').merge(
                cds_info[cds_info['in_tfam']].reset_index())
            # merges on gcoord, gstop, and len - need to reset_index to preserve annot_fidx and annot_tid
            for ((orfname, tid, tcoord, tstop), cds_grp) in possible_annot.groupby(
                    ['orfname', 'tid', 'tcoord', 'tstop']):
                orf_pos = _get_orf_pos(orfname, tid, tcoord, tstop)
                for (annot_fidx, annot_tid, cds_pos_set) in cds_grp[['annot_fidx', 'annot_tid',
                                                                    'pos']].itertuples(False):
                    if cds_pos_set.issubset(orf_pos):
                        tfam_orfs.loc[tfam_orfs['orfname'] == orfname,
                                      ['orftype', 'untyped']] = ['annotated', False]
                        cds_info.loc[(annot_fidx, annot_tid), 'found'] = True
                        break
                else:  # no break: none of the candidate CDSs matched position-for-position
                    tfam_orfs.loc[tfam_orfs['orfname'] == orfname,
                                  ['orftype', 'untyped']] = ['Xiso', False]
                    # matching start and stop but differing in between
            if tfam_orfs['untyped'].any():
                tfam_orfs.loc[tfam_orfs['orfname'].isin(
                    tfam_orfs[tfam_orfs['untyped']].merge(cds_info[['gcoord', 'gstop']])['orfname']),
                    ['orftype', 'untyped']] = ['Xiso', False]
                # matching start and stop, but must differ somewhere, otherwise would have been identified
                # as annotated (Xiso => "exact isoform")

            # SISO
            tfam_orfs.loc[tfam_orfs['annot_start'] & tfam_orfs['annot_stop'] & tfam_orfs['untyped'],
                          ['orftype', 'untyped']] = ['Siso', False]
            # start and stop each match at least one CDS, but not the same one (Siso => "spliced isoform")

            # CISO
            tfam_orfs.loc[tfam_orfs['annot_start'] & tfam_orfs['untyped'],
                          ['orftype', 'untyped']] = ['Ciso', False]
            # start is annotated, but stop is not - so must be on a new transcript
            # (Ciso => "C-terminal isoform")

            # TRUNCATION
            if tfam_orfs['untyped'].any():
                found_matched_stop = tfam_orfs[tfam_orfs['untyped']].merge(
                    tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                    on=['tid', 'tstop'], suffixes=('', '_annot'))
                tfam_orfs.loc[tfam_orfs['orfname'].isin(
                    found_matched_stop.loc[found_matched_stop['tcoord'] > found_matched_stop['tcoord_annot'],
                                           'orfname']),
                    ['orftype', 'untyped']] = ['truncation', False]
                # on the same transcript with an annotated CDS, with matching stop codon, initiating
                # downstream - must be a truncation
                # still some missing truncations, if the original CDS was not on a transcript in the
                # present transcriptome
                if tfam_orfs['untyped'].any() and not cds_info['found'].all():
                    possible_truncs = tfam_orfs[tfam_orfs['untyped']].drop_duplicates('orfname') \
                        .merge(cds_info.loc[~cds_info['found'], ['gstop', 'pos', 'AAlen']],
                               on='gstop', suffixes=('', '_annot'))
                    possible_truncs = possible_truncs[possible_truncs['AAlen'] < possible_truncs['AAlen_annot']]
                    for ((orfname, tid, tcoord, tstop, gcoord), cds_pos_sets) in \
                            possible_truncs.groupby(['orfname', 'tid', 'tcoord', 'tstop', 'gcoord'])['pos']:
                        orf_pos = _get_orf_pos(orfname, tid, tcoord, tstop)
                        if strand == '-':
                            if any(cds_pos_set.issuperset(orf_pos) and
                                   all(pos in orf_pos for pos in cds_pos_set if pos <= gcoord)
                                   for cds_pos_set in cds_pos_sets):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname,
                                              ['orftype', 'untyped']] = ['truncation', False]
                        else:
                            if any(cds_pos_set.issuperset(orf_pos) and
                                   all(pos in orf_pos for pos in cds_pos_set if pos >= gcoord)
                                   for cds_pos_set in cds_pos_sets):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname,
                                              ['orftype', 'untyped']] = ['truncation', False]
                        # matching stop codon, contained within, and all positions in the annotation past
                        # the orf start codon are included in the orf

            # EXTENSION
            if tfam_orfs['untyped'].any():
                found_matched_stop = tfam_orfs[tfam_orfs['untyped']].merge(
                    tfam_orfs[tfam_orfs['orftype'] == 'annotated'],
                    on=['tid', 'tstop'], suffixes=('', '_annot'))
                assert (found_matched_stop['tcoord'] < found_matched_stop['tcoord_annot']).all()
                # other possibilities should be done by now
                tfam_orfs.loc[tfam_orfs['orfname'].isin(found_matched_stop['orfname']),
                              ['orftype', 'untyped']] = ['extension', False]
                # on the same transcript with an annotated CDS, with matching stop codon, initiating
                # upstream - must be an extension
                # no possibility for an "unfound" extension - if the extension is in the transcriptome,
                # the CDS it comes from must be as well
                # (except for a few edge cases e.g. annotated CDS is a CUG initiator, but not considering
                # CUG ORFs)

            # NISO
            tfam_orfs.loc[tfam_orfs['annot_stop'] & (tfam_orfs['untyped']),
                          ['orftype', 'untyped']] = ['Niso', False]
            # stop is annotated, but start is not, and it's not a truncation or extension - so must be an
            # isoform (Niso => "N-terminal isoform")

            # NCISO
            if tfam_orfs['untyped'].any():
                orf_codons = []
                for (orfname, tid, tcoord, tstop) in \
                        tfam_orfs.loc[tfam_orfs['untyped'], ['orfname', 'tid', 'tcoord', 'tstop']] \
                        .drop_duplicates('orfname').itertuples(False):
                    orf_codons.append(pd.DataFrame(_get_orf_pos(orfname, tid, tcoord, tstop).reshape((-1, 3))))
                    orf_codons[-1]['orfname'] = orfname
                orf_codons = pd.concat(orf_codons, ignore_index=True)
                if strand == '-':
                    annot_codons = pd.DataFrame(np.vstack(
                        [np.reshape(sorted(cds_pos_set, reverse=True), (-1, 3))
                         for cds_pos_set in cds_info['pos'] if len(cds_pos_set) % 3 == 0])).drop_duplicates()
                else:
                    annot_codons = pd.DataFrame(np.vstack(
                        [np.reshape(sorted(cds_pos_set, reverse=False), (-1, 3))
                         for cds_pos_set in cds_info['pos'] if len(cds_pos_set) % 3 == 0])).drop_duplicates()
                tfam_orfs.loc[tfam_orfs['orfname'].isin(orf_codons.merge(annot_codons)['orfname']),
                              ['orftype', 'untyped']] = ['NCiso', False]
                # ORFs that have at least one full codon overlapping (in-frame) with a CDS are isoforms
                # (NCiso => "N- and C-terminal isoform")
                # Note that these must already differ at N- and C- termini, otherwise they would already
                # have been classified

            # INTERNAL
            if tfam_orfs['untyped'].any():
                sametrans = tfam_orfs[tfam_orfs['untyped']].merge(
                    tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                sametrans_internal = (sametrans['tcoord'] > sametrans['tcoord_annot']) & \
                                     (sametrans['tstop'] < sametrans['tstop_annot'])
                tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_internal, 'orfname']),
                              ['orftype', 'untyped']] = ['internal', False]
                # ORFs completely contained within a CDS on the same transcript, and not containing any
                # full codon overlaps, must be internal
                # Still could be other ORFs internal to a CDS on a transcript not in the current
                # transcriptome - need to check manually
                if tfam_orfs['untyped'].any() and not cds_info['found'].all():
                    for (orfname, gcoord, gstop) in \
                            tfam_orfs.loc[tfam_orfs['untyped'], ['orfname', 'gcoord', 'gstop']] \
                            .drop_duplicates('orfname').itertuples(False):
                        orf_pos = _get_orf_pos(orfname)  # should be cached by now
                        if strand == '-':
                            if any(cds_pos_set.issuperset(orf_pos) and
                                   all(pos in orf_pos for pos in cds_pos_set if gcoord >= pos > gstop)
                                   for cds_pos_set in cds_info.loc[(~cds_info['found']) &
                                                                   (cds_info['gcoord'] > gcoord) &
                                                                   (cds_info['gstop'] < gstop), 'pos']):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname,
                                              ['orftype', 'untyped']] = ['internal', False]
                        else:
                            if any(cds_pos_set.issuperset(orf_pos) and
                                   all(pos in orf_pos for pos in cds_pos_set if gcoord <= pos < gstop)
                                   for cds_pos_set in cds_info.loc[(~cds_info['found']) &
                                                                   (cds_info['gcoord'] < gcoord) &
                                                                   (cds_info['gstop'] > gstop), 'pos']):
                                tfam_orfs.loc[tfam_orfs['orfname'] == orfname,
                                              ['orftype', 'untyped']] = ['internal', False]
                        # contained within, and all positions in the annotation between the orf start and
                        # stop codons are included in the orf

            # STOP_OVERLAP
            if tfam_orfs['untyped'].any():
                sametrans = tfam_orfs[tfam_orfs['untyped']].merge(
                    tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                sametrans_stopover = (sametrans['tcoord'] > sametrans['tcoord_annot']) & \
                                     (sametrans['tcoord'] < sametrans['tstop_annot'])
                tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_stopover, 'orfname']),
                              ['orftype', 'untyped']] = ['stop_overlap', False]
                # starts within a CDS and not an internal - must be a stop_overlap
                # do not need to check for unfounds - requiring that stop_overlap must be on same
                # transcript as cds

            # START_OVERLAP
            if tfam_orfs['untyped'].any():
                sametrans = tfam_orfs[tfam_orfs['untyped']].merge(
                    tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                sametrans_startover = (sametrans['tstop'] > sametrans['tcoord_annot']) & \
                                      (sametrans['tstop'] < sametrans['tstop_annot'])
                tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_startover, 'orfname']),
                              ['orftype', 'untyped']] = ['start_overlap', False]
                # ends within a CDS and not an internal - must be a start_overlap
                # do not need to check for unfounds - requiring that start_overlap must be on same
                # transcript as cds

            # LOOF
            if tfam_orfs['untyped'].any():
                sametrans = tfam_orfs[tfam_orfs['untyped']].merge(
                    tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                sametrans_loof = (sametrans['tcoord'] < sametrans['tcoord_annot']) & \
                                 (sametrans['tstop'] > sametrans['tstop_annot'])
                tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_loof, 'orfname']),
                              ['orftype', 'untyped']] = ['LOOF', False]
                # starts upstream of a CDS and ends downstream of it - must be a LOOF ("long out-of-frame")
                # don't need to check for unfounds because the CDS must be on the same transcript as the
                # ORF if the ORF completely contains it

            # UPSTREAM
            if tfam_orfs['untyped'].any():
                sametrans = tfam_orfs[tfam_orfs['untyped']].merge(
                    tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                sametrans_upstream = (sametrans['tstop'] <= sametrans['tcoord_annot'])
                tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_upstream, 'orfname']),
                              ['orftype', 'untyped']] = ['upstream', False]
                # ends upstream of a CDS - must be an upstream (uORF)
                # cannot check manually for unfounds because those are not on well-defined transcripts

            # DOWNSTREAM
            if tfam_orfs['untyped'].any():
                sametrans = tfam_orfs[tfam_orfs['untyped']].merge(
                    tfam_orfs[tfam_orfs['orftype'] == 'annotated'], on='tid', suffixes=('', '_annot'))
                sametrans_downstream = (sametrans['tstop_annot'] <= sametrans['tcoord'])
                tfam_orfs.loc[tfam_orfs['orfname'].isin(sametrans.loc[sametrans_downstream, 'orfname']),
                              ['orftype', 'untyped']] = ['downstream', False]
                # starts downstream of a CDS - must be a downstream ORF
                # cannot check manually for unfounds because those are not on well-defined transcripts

            # NEW_ISO and GISO
            for orfname in tfam_orfs.loc[tfam_orfs['untyped'], 'orfname'].drop_duplicates():
                if all_annot_pos.isdisjoint(_get_orf_pos(orfname)):
                    tfam_orfs.loc[tfam_orfs['orfname'] == orfname,
                                  ['orftype', 'untyped']] = ['new_iso', False]
                    # no overlaps whatsoever with any annotated CDS, but in a tfam that has annotations:
                    # new_iso
                else:
                    tfam_orfs.loc[tfam_orfs['orfname'] == orfname,
                                  ['orftype', 'untyped']] = ['Giso', False]
                    # overlaps out-of-frame with a CDS, and not on the same transcript with a CDS:
                    # Giso => "genomic isoform"

            # Every ORF in an annotated family must have been typed by now. This check sits inside the
            # annotated branch because ORFs in families without usable annotations legitimately stay
            # 'untyped' (with orftype 'new').
            assert not tfam_orfs['untyped'].any()

    return tfam_orfs.drop('untyped', axis=1)