def from_transcript(t, ref=None):
    """Create a MatchStats record for transcript ``t``.

    The transcript's id, gene id, locus string, length and intron count
    are always filled in.  When ``ref`` is given, the matching ``ref_*``
    fields are populated from the reference transcript as well.

    Returns the new MatchStats instance.
    """
    def describe_locus(tx):
        # chrom:start-end[strand]
        return '%s:%d-%d[%s]' % (tx.chrom, tx.start, tx.end,
                                 strand_int_to_str(tx.strand))

    def first_present(attrs, keys, fallback):
        # value of the first key found in attrs, otherwise the fallback
        for key in keys:
            if key in attrs:
                return attrs[key]
        return fallback

    stats = MatchStats()
    stats.transcript_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
    stats.gene_id = t.attrs[GTFAttr.GENE_ID]
    stats.locus = describe_locus(t)
    stats.length = t.length
    stats.num_introns = len(t.exons) - 1
    if ref is None:
        return stats

    ref_attrs = ref.attrs
    stats.ref_transcript_id = ref_attrs[GTFAttr.TRANSCRIPT_ID]
    stats.ref_gene_id = ref_attrs[GTFAttr.GENE_ID]
    stats.ref_locus = describe_locus(ref)
    stats.ref_length = ref.length
    stats.ref_num_introns = len(ref.exons) - 1
    stats.ref_orig_gene_id = ref_attrs.get('orig_gene_id', stats.ref_gene_id)
    stats.ref_source = ref_attrs.get('source', 'NA')
    # gene name falls back to the transcript name, then to the gene id
    stats.ref_gene_name = first_present(
        ref_attrs, ('gene_name', 'transcript_name'), stats.ref_gene_id)
    # gene type falls back through the alternative attribute spellings
    stats.ref_gene_type = first_present(
        ref_attrs, ('gene_type', 'gene_biotype', 'transcript_type'), 'None')
    return stats
def write_bed(chrom, name, strand, score, exons, flank, chrom_length):
    """Return the BED fields (list of strings) for a flanked transcript.

    ``exons`` must be ordered so that the first exon has the smallest
    start and the last exon the largest end.  The transcript interval is
    extended by ``flank`` on both sides and clamped to
    ``[0, chrom_length[chrom]]``; block starts are relative to the
    flanked start.
    """
    first_exon, last_exon = exons[0], exons[-1]
    assert all(first_exon.start < other.start for other in exons[1:])
    assert all(last_exon.end > other.end for other in exons[:-1])
    # extend by the flank without running off either end of the chromosome
    upper_bound = chrom_length[chrom]
    tx_start = max(0, first_exon.start - flank)
    tx_end = min(upper_bound, last_exon.end + flank)
    block_starts = [exon.start - tx_start for exon in exons]
    block_sizes = [exon.end - exon.start for exon in exons]
    # make bed fields (columns 7 and 8 are both set to the start)
    return [
        chrom,
        str(tx_start),
        str(tx_end),
        str(name),
        str(score),
        strand_int_to_str(strand),
        str(tx_start),
        str(tx_start),
        '0',
        str(len(exons)),
        ','.join(str(size) for size in block_sizes) + ',',
        ','.join(str(offset) for offset in block_starts) + ',',
    ]
def find_first_orf(t, ref_fa):
    """Return an ORFInfo for the ORF starting at the first 'ATG' in ``t``.

    The transcript sequence is fetched with get_transcript_dna_sequence.
    If it contains no 'ATG', the returned ORFInfo is empty: start and end
    are both ``t.start``, strand is '.', and exons/seq are empty.
    """
    orf = ORFInfo()
    orf.transcript_id = t.attrs['transcript_id']
    orf.gene_id = t.attrs['gene_id']
    orf.chrom = t.chrom
    # get transcript sequence and locate the first start codon
    dna = get_transcript_dna_sequence(t, ref_fa)
    codon_pos = dna.find('ATG')
    if codon_pos == -1:
        # no start codon: report an empty ORF anchored at the transcript start
        orf.start = t.start
        orf.end = t.start
        orf.strand = '.'
        orf.exons = []
        orf.seq = ''
        return orf
    protein = translate_orf(dna[codon_pos:])
    # inclusive index of the last translated nucleotide
    last_pos = codon_pos + 3 * len(protein) - 1
    genome_start, genome_end, genome_exons = orf_to_genome(t, codon_pos, last_pos)
    orf.start = genome_start
    orf.end = genome_end
    orf.strand = strand_int_to_str(t.strand)
    orf.exons = genome_exons
    orf.seq = protein
    return orf
def write_bed(chrom, name, strand, score, exons):
    """Return the BED fields (list of strings) describing a transcript.

    ``exons`` must be ordered so that the first exon has the smallest
    start and the last exon the largest end.  Block starts are relative
    to the start of the first exon.
    """
    first_exon, last_exon = exons[0], exons[-1]
    assert all(first_exon.start < other.start for other in exons[1:])
    assert all(last_exon.end > other.end for other in exons[:-1])
    tx_start = first_exon.start
    tx_end = last_exon.end
    block_starts = [exon.start - tx_start for exon in exons]
    block_sizes = [exon.end - exon.start for exon in exons]
    # make bed fields (columns 7 and 8 are both set to the start)
    return [
        chrom,
        str(tx_start),
        str(tx_end),
        str(name),
        str(score),
        strand_int_to_str(strand),
        str(tx_start),
        str(tx_start),
        "0",
        str(len(exons)),
        ",".join(str(size) for size in block_sizes) + ",",
        ",".join(str(offset) for offset in block_starts) + ",",
    ]
def assemble_locus(transcripts, locus_id_value_obj, gene_id_value_obj,
                   tss_id_value_obj, t_id_value_obj, config, gtf_fileh,
                   bed_fileh, bedgraph_filehs):
    """Assemble every transcript graph found in one locus.

    Transcripts are filtered, turned into transcript graphs by
    create_transcript_graphs, and each graph is handed to assemble_gene
    together with the shared id counters and output file handles.
    """
    # gather properties of locus (start is taken from the first transcript)
    head = transcripts[0]
    chrom = head.chrom
    start = head.start
    end = max(tx.end for tx in transcripts)
    logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                  (chrom, start, end, len(transcripts)))
    locus_id_str = "L%d" % (locus_id_value_obj.next())
    # filter transcripts
    logging.debug("\tFiltering transcripts")
    kept = filter_transcripts(transcripts,
                              config.min_transcript_length,
                              config.guided)
    # build transcript graphs
    graph_options = dict(min_trim_length=config.min_trim_length,
                         trim_utr_fraction=config.trim_utr_fraction,
                         trim_intron_fraction=config.trim_intron_fraction,
                         create_bedgraph=config.create_bedgraph,
                         bedgraph_filehs=bedgraph_filehs)
    for graph in create_transcript_graphs(chrom, kept, **graph_options):
        summary = (chrom, start, end, strand_int_to_str(graph.strand),
                   len(graph.Gsub), len(graph.partial_paths))
        logging.debug("Subgraph %s:%d-%d(%s) %d nodes %d paths" % summary)
        # assemble subgraph
        assemble_gene(chrom, locus_id_str, gene_id_value_obj,
                      tss_id_value_obj, t_id_value_obj, graph.Gsub,
                      graph.strand, graph.partial_paths, config,
                      gtf_fileh, bed_fileh)
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, score, frac):
    """Yield GTF features for one assembled transcript.

    The first feature yielded is the 'transcript' record spanning the
    first exon's start to the last exon's end; it is followed by one
    'exon' record per exon, numbered from 1.  Every feature carries the
    locus/gene/tss/transcript id attributes.
    """
    span_start = exons[0].start
    span_end = exons[-1].end
    strand_str = strand_int_to_str(strand)
    shared_attrs = {"locus_id": locus_id, "gene_id": gene_id,
                    "tss_id": tss_id, "transcript_id": transcript_id}

    def build(feature_type, start, end, feature_score, attrs):
        # fill in the fields common to transcript and exon records
        feat = GTFFeature()
        feat.seqid = chrom
        feat.source = "assemblyline"
        feat.feature_type = feature_type
        feat.start = start
        feat.end = end
        feat.score = feature_score
        feat.strand = strand_str
        feat.phase = "."
        feat.attrs = attrs
        feat.attrs.update(shared_attrs)
        return feat

    # NOTE(review): frac is rounded to an integer before scaling, so the
    # transcript score is a multiple of 1000.0 -- confirm this is intended
    yield build("transcript", span_start, span_end,
                1000.0 * int(round(frac)),
                {"score": "%.3f" % score, "frac": "%.3f" % frac})
    for exon_number, exon in enumerate(exons, 1):
        yield build("exon", exon.start, exon.end, int(round(frac)),
                    {"exon_number": exon_number})
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, score, frac):
    """Yield GTF features for one assembled transcript.

    The first feature yielded is the 'transcript' record spanning the
    first exon's start to the last exon's end; it is followed by one
    'exon' record per exon, numbered from 1.  Every feature carries the
    locus/gene/tss/transcript id attributes.
    """
    span_start = exons[0].start
    span_end = exons[-1].end
    strand_str = strand_int_to_str(strand)
    shared_attrs = {'locus_id': locus_id, 'gene_id': gene_id,
                    'tss_id': tss_id, 'transcript_id': transcript_id}

    def build(feature_type, start, end, feature_score, attrs):
        # fill in the fields common to transcript and exon records
        feat = GTFFeature()
        feat.seqid = chrom
        feat.source = 'assemblyline'
        feat.feature_type = feature_type
        feat.start = start
        feat.end = end
        feat.score = feature_score
        feat.strand = strand_str
        feat.phase = '.'
        feat.attrs = attrs
        feat.attrs.update(shared_attrs)
        return feat

    # NOTE(review): frac is rounded to an integer before scaling, so the
    # transcript score is a multiple of 1000.0 -- confirm this is intended
    yield build('transcript', span_start, span_end,
                1000.0 * int(round(frac)),
                {'score': '%.3f' % score, 'frac': '%.3f' % frac})
    for exon_number, exon in enumerate(exons, 1):
        yield build('exon', exon.start, exon.end, int(round(frac)),
                    {'exon_number': exon_number})
def assemble_locus(
    transcripts,
    locus_id_value_obj,
    gene_id_value_obj,
    tss_id_value_obj,
    t_id_value_obj,
    config,
    gtf_fileh,
    bed_fileh,
    bedgraph_filehs,
):
    """Assemble every pruned subgraph found in one locus.

    Transcripts are filtered and grouped into per-strand graphs.  When
    ``config.create_bedgraph`` is set, node scores are written to the
    bedgraph handle of the graph's strand.  Each subgraph produced by
    prune_transcript_graph is then passed to assemble_gene.
    """
    # gather properties of locus (start is taken from the first transcript)
    head = transcripts[0]
    chrom = head.chrom
    start = head.start
    end = max(tx.end for tx in transcripts)
    logging.debug("[LOCUS] %s:%d-%d %d transcripts" % (chrom, start, end, len(transcripts)))
    locus_id_str = "L%d" % (locus_id_value_obj.next())
    # filter transcripts
    logging.debug("\tFiltering transcripts")
    kept = filter_transcripts(transcripts, config.min_transcript_length, config.guided)
    # build transcript graphs
    for G, strand, strand_transcripts in create_transcript_graphs(kept):
        # output bedgraph
        if config.create_bedgraph:
            for node in sorted(G.nodes()):
                # nodes with a negative start are not written out
                if n_is_negative(node):
                    continue
                row = (chrom, node.start, node.end, G.node[node][NODE_SCORE])
                print >>bedgraph_filehs[strand], "\t".join(map(str, row))
        # process transcript graphs
        pruned = prune_transcript_graph(
            G,
            strand,
            strand_transcripts,
            config.min_trim_length,
            config.trim_utr_fraction,
            config.trim_intron_fraction,
        )
        for Gsub, sub_strand, partial_paths in pruned:
            summary = (chrom, start, end, strand_int_to_str(sub_strand), len(Gsub), len(partial_paths))
            logging.debug("Subgraph %s:%d-%d(%s) %d nodes %d paths" % summary)
            # assemble subgraph
            assemble_gene(
                chrom,
                locus_id_str,
                gene_id_value_obj,
                tss_id_value_obj,
                t_id_value_obj,
                Gsub,
                sub_strand,
                partial_paths,
                config,
                gtf_fileh,
                bed_fileh,
            )


def n_is_negative(node):
    """Return True when the node's start coordinate is below zero."""
    return node.start < 0
def main(): logging.basicConfig( level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") # parse command line parser = argparse.ArgumentParser() parser.add_argument('-u', type=int, dest='upstream', default=1000) parser.add_argument('-d', type=int, dest='downstream', default=0) parser.add_argument('gtf_file') parser.add_argument('chrom_sizes') args = parser.parse_args() upstream = args.upstream downstream = args.downstream chrom_sizes_file = args.chrom_sizes # check command line parameters if not os.path.exists(args.gtf_file): parser.error("GTF file %s not found" % (args.gtf_file)) chrom_sizes = {} with open(chrom_sizes_file) as fileh: for line in fileh: fields = line.strip().split('\t') chrom_sizes[fields[0]] = int(fields[1]) # parse for locus_transcripts in parse_gtf(open(args.gtf_file)): locus_chrom = locus_transcripts[0].chrom locus_start = locus_transcripts[0].start locus_end = max(t.end for t in locus_transcripts) logging.debug( "[LOCUS] %s:%d-%d %d transcripts" % (locus_chrom, locus_start, locus_end, len(locus_transcripts))) tss_ids = set() for t in locus_transcripts: if t.strand == NO_STRAND: continue tss_id = t.attrs['tss_id'] if tss_id in tss_ids: continue tss_ids.add(tss_id) if t.strand == POS_STRAND: start = t.exons[0].start - upstream start = max(0, start) end = t.exons[0].start + downstream end = min(t.end, end) else: start = t.exons[-1].end - downstream start = max(t.start, start) end = t.exons[-1].end + upstream end = min(end, chrom_sizes[locus_chrom]) print '\t'.join( map(str, [ locus_chrom, start, end, tss_id, 0, strand_int_to_str(t.strand) ])) return 0
def assemble_locus(transcripts, locus_id_value_obj, gene_id_value_obj,
                   tss_id_value_obj, t_id_value_obj, config, gtf_fileh,
                   bed_fileh, bedgraph_filehs):
    """Assemble every pruned subgraph found in one locus.

    Transcripts are filtered and grouped into per-strand graphs.  When
    ``config.create_bedgraph`` is set, node scores are written to the
    bedgraph handle of the graph's strand.  Each subgraph produced by
    prune_transcript_graph is then passed to assemble_gene.
    """
    # gather properties of locus (start is taken from the first transcript)
    head = transcripts[0]
    chrom = head.chrom
    start = head.start
    end = max(tx.end for tx in transcripts)
    logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                  (chrom, start, end, len(transcripts)))
    locus_id_str = "L%d" % (locus_id_value_obj.next())
    # filter transcripts
    logging.debug("\tFiltering transcripts")
    kept = filter_transcripts(transcripts,
                              config.min_transcript_length,
                              config.guided)
    # build transcript graphs
    for G, strand, strand_transcripts in create_transcript_graphs(kept):
        # output bedgraph
        if config.create_bedgraph:
            for node in sorted(G.nodes()):
                # nodes with a negative start are not written out
                if node.start < 0:
                    continue
                row = (chrom, node.start, node.end, G.node[node][NODE_SCORE])
                print >>bedgraph_filehs[strand], '\t'.join(map(str, row))
        # process transcript graphs
        pruned = prune_transcript_graph(G, strand, strand_transcripts,
                                        config.min_trim_length,
                                        config.trim_utr_fraction,
                                        config.trim_intron_fraction)
        for Gsub, sub_strand, partial_paths in pruned:
            summary = (chrom, start, end, strand_int_to_str(sub_strand),
                       len(Gsub), len(partial_paths))
            logging.debug("Subgraph %s:%d-%d(%s) %d nodes %d paths" % summary)
            # assemble subgraph
            assemble_gene(chrom, locus_id_str, gene_id_value_obj,
                          tss_id_value_obj, t_id_value_obj, Gsub,
                          sub_strand, partial_paths, config,
                          gtf_fileh, bed_fileh)
def main(): # parse command line parser = argparse.ArgumentParser() parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", default=False) parser.add_argument("ref_gtf_file") parser.add_argument("gtf_file") args = parser.parse_args() # set logging level if args.verbose: level = logging.DEBUG else: level = logging.INFO logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") # check command line parameters if not os.path.exists(args.ref_gtf_file): parser.error("GTF file %s not found" % (args.ref_gtf_file)) if not os.path.exists(args.gtf_file): parser.error("GTF file %s not found" % (args.gtf_file)) logging.info("AssemblyLine %s" % (assemblyline.__version__)) logging.info("----------------------------------") # show parameters logging.info("Parameters:") logging.info("verbose logging: %s" % (args.verbose)) logging.info("ref gtf file: %s" % (args.ref_gtf_file)) logging.info("assembly gtf file: %s" % (args.gtf_file)) # find CDS regions if not os.path.exists('tmp.srt.gtf'): with open('tmp.gtf', 'w') as outfileh: logging.info("Reading CDS regions from reference GTF") for f in get_cds_features(args.ref_gtf_file): print >>outfileh, str(f) logging.info("Reading transcripts from assembly GTF") i = 0 for f in GTFFeature.parse(open(args.gtf_file)): print >>outfileh, str(f) i += 1 if i % 100000 == 0: logging.debug("Parsed %d transcripts" % (i)) logging.info("Sorting GTF file") sort_gtf('tmp.gtf', 'tmp.srt.gtf') for locus_transcripts in parse_gtf(open('tmp.srt.gtf')): locus_chrom = locus_transcripts[0].chrom locus_start = locus_transcripts[0].start locus_end = max(t.end for t in locus_transcripts) logging.debug("[LOCUS] %s:%d-%d %d transcripts" % (locus_chrom, locus_start, locus_end, len(locus_transcripts))) for start, end, strand, m, t, c in categorize(locus_transcripts): fields = [locus_chrom, str(start), str(end), '%s|%s|%s' % (m,t,c), '0', strand_int_to_str(strand)] print '\t'.join(fields) return 0
def main(): logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") # parse command line parser = argparse.ArgumentParser() parser.add_argument('-u', type=int, dest='upstream', default=1000) parser.add_argument('-d', type=int, dest='downstream', default=0) parser.add_argument('gtf_file') parser.add_argument('chrom_sizes') args = parser.parse_args() upstream = args.upstream downstream = args.downstream chrom_sizes_file = args.chrom_sizes # check command line parameters if not os.path.exists(args.gtf_file): parser.error("GTF file %s not found" % (args.gtf_file)) chrom_sizes = {} with open(chrom_sizes_file) as fileh: for line in fileh: fields = line.strip().split('\t') chrom_sizes[fields[0]] = int(fields[1]) # parse for locus_transcripts in parse_gtf(open(args.gtf_file)): locus_chrom = locus_transcripts[0].chrom locus_start = locus_transcripts[0].start locus_end = max(t.end for t in locus_transcripts) logging.debug("[LOCUS] %s:%d-%d %d transcripts" % (locus_chrom, locus_start, locus_end, len(locus_transcripts))) tss_ids = set() for t in locus_transcripts: if t.strand == NO_STRAND: continue tss_id = t.attrs['tss_id'] if tss_id in tss_ids: continue tss_ids.add(tss_id) if t.strand == POS_STRAND: start = t.exons[0].start - upstream start = max(0, start) end = t.exons[0].start + downstream end = min(t.end, end) else: start = t.exons[-1].end - downstream start = max(t.start, start) end = t.exons[-1].end + upstream end = min(end, chrom_sizes[locus_chrom]) print '\t'.join(map(str, [locus_chrom, start, end, tss_id, 0, strand_int_to_str(t.strand)])) return 0
def assemble_locus(transcripts, locus_id_value_obj, gene_id_value_obj,
                   tss_id_value_obj, t_id_value_obj, config, gtf_fileh,
                   bed_fileh, bedgraph_filehs):
    """Assemble every transcript graph found in one locus.

    Transcripts are filtered, turned into transcript graphs by
    create_transcript_graphs, and each graph is handed to assemble_gene
    together with the shared id counters and output file handles.
    """
    # gather properties of locus (start is taken from the first transcript)
    head = transcripts[0]
    chrom = head.chrom
    start = head.start
    end = max(tx.end for tx in transcripts)
    logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                  (chrom, start, end, len(transcripts)))
    locus_id_str = "L%d" % (locus_id_value_obj.next())
    # filter transcripts
    logging.debug("\tFiltering transcripts")
    kept = filter_transcripts(transcripts,
                              config.min_transcript_length,
                              config.guided)
    # build transcript graphs
    graph_options = dict(min_trim_length=config.min_trim_length,
                         trim_utr_fraction=config.trim_utr_fraction,
                         trim_intron_fraction=config.trim_intron_fraction,
                         create_bedgraph=config.create_bedgraph,
                         bedgraph_filehs=bedgraph_filehs)
    for graph in create_transcript_graphs(chrom, kept, **graph_options):
        summary = (chrom, start, end, strand_int_to_str(graph.strand),
                   len(graph.Gsub), len(graph.partial_paths))
        logging.debug("Subgraph %s:%d-%d(%s) %d nodes %d paths" % summary)
        # assemble subgraph
        assemble_gene(chrom, locus_id_str, gene_id_value_obj,
                      tss_id_value_obj, t_id_value_obj, graph.Gsub,
                      graph.strand, graph.partial_paths, config,
                      gtf_fileh, bed_fileh)
def get_all_transcript_orfs(t, ref_fa, min_orf_length):
    """Yield an ORFInfo for every sufficiently long ORF of transcript ``t``.

    The transcript is translated in each reading frame by
    translate_transcript; ORFs whose sequence is shorter than
    ``min_orf_length`` are skipped.  Coordinates and exons of each
    reported ORF come from orf_to_genome.
    """
    # get amino acid sequences in all reading frames
    frame_translations = translate_transcript(t, ref_fa)
    # get ORFs
    for frame, protein in enumerate(frame_translations):
        for aa_start, aa_end, orf_seq in find_orfs(protein):
            if len(orf_seq) >= min_orf_length:
                genome_start, genome_end, genome_exons = orf_to_genome(
                    t, frame, aa_start, aa_end)
                info = ORFInfo()
                info.transcript_id = t.attrs['transcript_id']
                info.gene_id = t.attrs['gene_id']
                info.frame = frame
                info.chrom = t.chrom
                info.start = genome_start
                info.end = genome_end
                info.strand = strand_int_to_str(t.strand)
                info.exons = genome_exons
                info.seq = orf_seq
                yield info
def write_bed(chrom, name, strand, score, exons):
    """Return the BED fields (list of strings) describing a transcript.

    ``exons`` must be ordered so that the first exon has the smallest
    start and the last exon the largest end.  Block starts are relative
    to the start of the first exon.
    """
    first_exon, last_exon = exons[0], exons[-1]
    assert all(first_exon.start < other.start for other in exons[1:])
    assert all(last_exon.end > other.end for other in exons[:-1])
    tx_start = first_exon.start
    tx_end = last_exon.end
    block_starts = [exon.start - tx_start for exon in exons]
    block_sizes = [exon.end - exon.start for exon in exons]
    # make bed fields (columns 7 and 8 are both set to the start)
    return [
        chrom,
        str(tx_start),
        str(tx_end),
        str(name),
        str(score),
        strand_int_to_str(strand),
        str(tx_start),
        str(tx_start),
        '0',
        str(len(exons)),
        ','.join(str(size) for size in block_sizes) + ',',
        ','.join(str(offset) for offset in block_starts) + ',',
    ]
def trim_graph(G, strand, min_trim_length, trim_utr_fraction, trim_intron_fraction):
    """Collect the nodes of graph ``G`` that the trimming helpers select.

    The graph is split into chains by get_chains().  Every non-contiguous
    edge is recorded as an intron together with the maximum NODE_SCORE of
    the chains on either side.  Each chain is then passed to trim_intron,
    trim_bidirectional, trim_intronic_utr and/or trim_utr depending on
    its in/out degree and on whether it overlaps a recorded intron.

    Returns the set of all nodes reported by those helpers.  Whether the
    helpers modify ``G`` themselves is not visible from this function.
    """
    # get 'chains' of contiguous non-intron nodes with edge degree of
    # one or less
    node_chain_map, chains = get_chains(G, introns=False)
    # setup dictionaries of predecessors and successors
    successor_dict = {}
    for n, nbrdict in G.adjacency_iter():
        successor_dict[n] = nbrdict.keys()
    predecessor_dict = {}
    # presumably reverses the edges in place so that the adjacency
    # iteration yields predecessors; the second call restores G
    G.reverse(copy=False)
    for n, nbrdict in G.adjacency_iter():
        predecessor_dict[n] = nbrdict.keys()
    G.reverse(copy=False)
    # setup intron data structures
    introns = {}
    intron_tree = IntervalTree()
    reverse = (strand == NEG_STRAND)
    for u, nbrdict in G.adjacency_iter():
        for v in nbrdict:
            # order the edge endpoints by coordinate; on the negative
            # strand the edge source is treated as the right-hand node
            if reverse:
                left, right = v, u
            else:
                left, right = u, v
            # skip contiguous nodes
            if left.end == right.start:
                continue
            # calculate score of the chains
            u_chain_nodes = chains[node_chain_map[u]]
            u_score = max(G.node[n][NODE_SCORE] for n in u_chain_nodes)
            v_chain_nodes = chains[node_chain_map[v]]
            v_score = max(G.node[n][NODE_SCORE] for n in v_chain_nodes)
            # store scores in intron data structures
            introns[(left.end, right.start)] = (u_score, v_score)
            intron_tree.insert_interval(
                Interval(left.end, right.start, value=(u_score, v_score)))
    # trim chains
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        # note: this reverses the chain's node list in place, i.e. it
        # mutates the lists held in 'chains'
        if strand == NEG_STRAND:
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        if ((in_degree == 1) and (out_degree == 1) and
                (parent.start, parent.end) in introns):
            # intron retention - a chain of nodes precisely matches an
            # intron, so we can potentially remove the entire chain
            pred_score, succ_score = introns[(parent.start, parent.end)]
            cutoff_score = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff_score))
        else:
            # determine whether this node chain is intronic. intronic
            # node chains are trimmed more strictly due to intronic
            # pre-mrna
            found_intron = False
            max_pred_score = 0.0
            max_succ_score = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                # ignore contained introns
                if (hit.start > parent.start) and (hit.end < parent.end):
                    continue
                # set intron flag and keep track of highest coverage
                # overlapping intron to make trimming conservative
                found_intron = True
                pred_score, succ_score = hit.value
                if pred_score > max_pred_score:
                    max_pred_score = pred_score
                if succ_score > max_succ_score:
                    max_succ_score = succ_score
            if (in_degree == 0) and (out_degree == 0):
                # chain has neither predecessors nor successors
                if found_intron:
                    cutoff_score = trim_intron_fraction * max(
                        max_pred_score, max_succ_score)
                    trim_nodes.update(trim_intron(G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_bidirectional(G, nodes, min_trim_length,
                                       trim_utr_fraction))
            elif in_degree == 0:
                # chain has no predecessors; trim_utr gets the nodes in
                # reversed order
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_succ_score
                    trim_nodes.update(trim_intronic_utr(
                        G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes[::-1], min_trim_length,
                             trim_utr_fraction))
            elif out_degree == 0:
                # chain has no successors; trim_intronic_utr gets the
                # nodes in reversed order
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_pred_score
                    trim_nodes.update(
                        trim_intronic_utr(G, nodes[::-1], cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes), len(G)))
    return all_trim_nodes
def main(): # parse command line parser = argparse.ArgumentParser() parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", default=False) parser.add_argument("ref_gtf_file") parser.add_argument("gtf_file") args = parser.parse_args() # set logging level if args.verbose: level = logging.DEBUG else: level = logging.INFO logging.basicConfig( level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") # check command line parameters if not os.path.exists(args.ref_gtf_file): parser.error("GTF file %s not found" % (args.ref_gtf_file)) if not os.path.exists(args.gtf_file): parser.error("GTF file %s not found" % (args.gtf_file)) logging.info("AssemblyLine %s" % (assemblyline.__version__)) logging.info("----------------------------------") # show parameters logging.info("Parameters:") logging.info("verbose logging: %s" % (args.verbose)) logging.info("ref gtf file: %s" % (args.ref_gtf_file)) logging.info("assembly gtf file: %s" % (args.gtf_file)) # find CDS regions if not os.path.exists('tmp.srt.gtf'): with open('tmp.gtf', 'w') as outfileh: logging.info("Reading CDS regions from reference GTF") for f in get_cds_features(args.ref_gtf_file): print >> outfileh, str(f) logging.info("Reading transcripts from assembly GTF") i = 0 for f in GTFFeature.parse(open(args.gtf_file)): print >> outfileh, str(f) i += 1 if i % 100000 == 0: logging.debug("Parsed %d transcripts" % (i)) logging.info("Sorting GTF file") sort_gtf('tmp.gtf', 'tmp.srt.gtf') for locus_transcripts in parse_gtf(open('tmp.srt.gtf')): locus_chrom = locus_transcripts[0].chrom locus_start = locus_transcripts[0].start locus_end = max(t.end for t in locus_transcripts) logging.debug( "[LOCUS] %s:%d-%d %d transcripts" % (locus_chrom, locus_start, locus_end, len(locus_transcripts))) for start, end, strand, m, t, c in categorize(locus_transcripts): fields = [ locus_chrom, str(start), str(end), '%s|%s|%s' % (m, t, c), '0', strand_int_to_str(strand) ] print '\t'.join(fields) return 0
def categorize_transcript(t, locus_trees):
    """Annotate transcript ``t`` in place with its annotation category.

    The transcript is intersected with the reference loci of its
    chromosome.  With no overlapping locus it is intergenic and the
    nearest genes come from get_nearest_genes.  Otherwise its exons are
    intersected with the genes of each overlapping locus and the category
    is chosen in priority order: same-strand coding, same-strand
    noncoding, antisense, and finally intronic (no exonic overlap).

    Sets the 'category', 'nearest_gene_ids', 'nearest_gene_names',
    'nearest_dist' and 'annotation_sources' entries of ``t.attrs``.
    """
    strand_str = strand_int_to_str(t.strand)
    overlapping_loci = locus_trees[t.chrom].find(t.start, t.end)
    if len(overlapping_loci) == 0:
        # this is a completely unannotated transcript
        category = CATEGORY_INTERGENIC
        nearest_genes, nearest_dist = get_nearest_genes(
            t.chrom, t.start, t.end, locus_trees)
    else:
        # bucket every gene hit by an exon, keyed by gene id
        sense_coding = {}
        sense_noncoding = {}
        antisense = {}
        for locus in overlapping_loci:
            gene_tree = locus.value
            for exon in t.exons:
                for hit in gene_tree.find(exon.start, exon.end):
                    gene = hit.value
                    if not cmp_strand(gene.strand, strand_str):
                        bucket = antisense
                    elif gene.is_coding:
                        bucket = sense_coding
                    else:
                        bucket = sense_noncoding
                    bucket[gene.gene_id] = gene
        nearest_dist = 0
        if len(sense_coding) > 0:
            category = CATEGORY_PROTEIN
            nearest_genes = sense_coding.values()
        elif len(sense_noncoding) > 0:
            category = CATEGORY_NCRNA
            nearest_genes = sense_noncoding.values()
        elif len(antisense) > 0:
            category = CATEGORY_ANTISENSE
            nearest_genes = antisense.values()
        else:
            # no exon overlaps a gene: report all genes of the hit loci
            category = CATEGORY_INTRONIC
            nearest_genes = []
            for locus in overlapping_loci:
                for hit in locus.value.find(locus.start, locus.end):
                    nearest_genes.append(hit.value)
    # using 'nearest genes' list get gene names and annotation sources
    if len(nearest_genes) == 0:
        gene_ids = gene_names = annotation_sources = "NA"
        nearest_dist = -1
    else:
        id_set = set()
        name_set = set()
        source_set = set()
        for gene in nearest_genes:
            id_set.add(gene.gene_id)
            name_set.update(gene.gene_names)
            source_set.update(gene.annotation_sources)
        gene_ids = ",".join(sorted(id_set))
        gene_names = ",".join(sorted(name_set))
        annotation_sources = ",".join(sorted(source_set))
    # add attributes to original transcripts
    t.attrs["category"] = category
    t.attrs["nearest_gene_ids"] = gene_ids
    t.attrs["nearest_gene_names"] = gene_names
    t.attrs["nearest_dist"] = nearest_dist
    t.attrs["annotation_sources"] = annotation_sources
def trim_graph(G, strand, min_trim_length, trim_utr_fraction, trim_intron_fraction):
    """Collect the nodes of graph ``G`` that the trimming helpers select.

    The graph is split into chains by get_chains().  Every non-contiguous
    edge is recorded as an intron together with the maximum NODE_SCORE of
    the chains on either side.  Each chain is then passed to trim_intron,
    trim_bidirectional, trim_intronic_utr and/or trim_utr depending on
    its in/out degree and on whether it overlaps a recorded intron.

    Returns the set of all nodes reported by those helpers.  Whether the
    helpers modify ``G`` themselves is not visible from this function.
    """
    # get 'chains' of contiguous non-intron nodes with edge degree of
    # one or less
    node_chain_map, chains = get_chains(G, introns=False)
    # setup dictionaries of predecessors and successors
    successor_dict = {}
    for n,nbrdict in G.adjacency_iter():
        successor_dict[n] = nbrdict.keys()
    predecessor_dict = {}
    # presumably reverses the edges in place so that the adjacency
    # iteration yields predecessors; the second call restores G
    G.reverse(copy=False)
    for n,nbrdict in G.adjacency_iter():
        predecessor_dict[n] = nbrdict.keys()
    G.reverse(copy=False)
    # setup intron data structures
    introns = {}
    intron_tree = IntervalTree()
    reverse = (strand == NEG_STRAND)
    for u,nbrdict in G.adjacency_iter():
        for v in nbrdict:
            # order the edge endpoints by coordinate; on the negative
            # strand the edge source is treated as the right-hand node
            if reverse:
                left, right = v, u
            else:
                left, right = u, v
            # skip contiguous nodes
            if left.end == right.start:
                continue
            # calculate score of the chains
            u_chain_nodes = chains[node_chain_map[u]]
            u_score = max(G.node[n][NODE_SCORE] for n in u_chain_nodes)
            v_chain_nodes = chains[node_chain_map[v]]
            v_score = max(G.node[n][NODE_SCORE] for n in v_chain_nodes)
            # store scores in intron data structures
            introns[(left.end,right.start)] = (u_score, v_score)
            intron_tree.insert_interval(Interval(left.end, right.start,
                                                 value=(u_score,v_score)))
    # trim chains
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        # note: this reverses the chain's node list in place, i.e. it
        # mutates the lists held in 'chains'
        if strand == NEG_STRAND:
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        if ((in_degree == 1) and (out_degree == 1) and
            (parent.start, parent.end) in introns):
            # intron retention - a chain of nodes precisely matches an
            # intron, so we can potentially remove the entire chain
            pred_score, succ_score = introns[(parent.start, parent.end)]
            cutoff_score = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff_score))
        else:
            # determine whether this node chain is intronic. intronic
            # node chains are trimmed more strictly due to intronic
            # pre-mrna
            found_intron = False
            max_pred_score = 0.0
            max_succ_score = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                # ignore contained introns
                if (hit.start > parent.start) and (hit.end < parent.end):
                    continue
                # set intron flag and keep track of highest coverage
                # overlapping intron to make trimming conservative
                found_intron = True
                pred_score, succ_score = hit.value
                if pred_score > max_pred_score:
                    max_pred_score = pred_score
                if succ_score > max_succ_score:
                    max_succ_score = succ_score
            if (in_degree == 0) and (out_degree == 0):
                # chain has neither predecessors nor successors
                if found_intron:
                    cutoff_score = trim_intron_fraction * max(max_pred_score, max_succ_score)
                    trim_nodes.update(trim_intron(G, nodes, cutoff_score))
                trim_nodes.update(trim_bidirectional(G, nodes, min_trim_length, trim_utr_fraction))
            elif in_degree == 0:
                # chain has no predecessors; trim_utr gets the nodes in
                # reversed order
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_succ_score
                    trim_nodes.update(trim_intronic_utr(G, nodes, cutoff_score))
                trim_nodes.update(trim_utr(G, nodes[::-1], min_trim_length, trim_utr_fraction))
            elif out_degree == 0:
                # chain has no successors; trim_intronic_utr gets the
                # nodes in reversed order
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_pred_score
                    trim_nodes.update(trim_intronic_utr(G, nodes[::-1], cutoff_score))
                trim_nodes.update(trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes), len(G)))
    return all_trim_nodes