def consensus(lst):
    """Collapse the matches of one test transcript into a single match.

    Matches whose category is ``Category.SAME_STRAND`` are grouped by
    ``ref_gene_id``.  If there are none, the result is whatever
    ``MatchStats.choose_best(lst)`` returns.  Otherwise the best match of
    each reference gene is picked, the genes are ordered with
    ``MatchStats.sort_genome`` and their reference fields are joined with
    commas into a copy of the first hit.  When more than one reference
    gene is involved the category becomes ``Category.READ_THROUGH``.

    lst: sequence of match objects describing the same test transcript.
    Returns the consensus match, or None when ``lst`` is empty.
    """
    if len(lst) == 0:
        return None
    # first check for read through transcripts involving multiple
    # reference genes: group same strand matches by reference gene
    same_strand_hits = collections.defaultdict(list)
    for match in lst:
        if Category.to_int(match.category) == Category.SAME_STRAND:
            same_strand_hits[match.ref_gene_id].append(match)
    # no same strand matches so don't need to worry about
    # read-throughs or multiple gene types
    if len(same_strand_hits) == 0:
        return MatchStats.choose_best(lst)
    # get consensus match from same strand overlapping genes
    shared_introns = 0
    shared_same_strand_bp = 0
    hits = []
    for gene_matches in same_strand_hits.values():
        # work on a copy so the caller's match objects are left alone
        best = MatchStats.choose_best(gene_matches).copy()
        # a gene may be annotated with several types across its
        # transcripts; keep every distinct one
        best.ref_gene_type = ','.join(
            sorted(set(x.ref_gene_type for x in gene_matches)))
        shared_introns += best.shared_introns
        shared_same_strand_bp += best.shared_same_strand_bp
        hits.append(best)
    # sort reference genes by position
    hits = MatchStats.sort_genome(hits)
    # make a new MatchStats object
    hit = hits[0].copy()
    # string-valued reference fields are joined as they are
    for attr in ('ref_transcript_id', 'ref_gene_id', 'ref_orig_gene_id',
                 'ref_gene_name', 'ref_source', 'ref_gene_type',
                 'ref_locus'):
        setattr(hit, attr, ','.join(getattr(x, attr) for x in hits))
    # these fields are converted with str() before joining
    for attr in ('ref_length', 'ref_num_introns'):
        setattr(hit, attr, ','.join(str(getattr(x, attr)) for x in hits))
    hit.shared_same_strand_bp = shared_same_strand_bp
    hit.shared_opp_strand_bp = 0
    hit.shared_introns = shared_introns
    hit.shared_splicing = any(x.shared_splicing for x in hits)
    hit.distance = 0
    if len(same_strand_hits) > 1:
        hit.category = Category.to_str(Category.READ_THROUGH)
    return hit
def consensus(lst):
    """Collapse the matches of one test transcript into a single match.

    Matches whose category is ``Category.SAME_STRAND`` are grouped by
    ``ref_gene_id``.  If there are none, the result is whatever
    ``MatchStats.choose_best(lst)`` returns.  Otherwise the best match of
    each reference gene is picked, the genes are ordered with
    ``MatchStats.sort_genome`` and their reference fields are joined with
    commas into a copy of the first hit.  When more than one reference
    gene is involved the category becomes ``Category.READ_THROUGH``.

    lst: sequence of match objects describing the same test transcript.
    Returns the consensus match, or None when ``lst`` is empty.
    """
    if len(lst) == 0:
        return None
    # first check for read through transcripts involving multiple
    # reference genes: group same strand matches by reference gene
    same_strand_hits = collections.defaultdict(list)
    for match in lst:
        if Category.to_int(match.category) == Category.SAME_STRAND:
            same_strand_hits[match.ref_gene_id].append(match)
    # no same strand matches so don't need to worry about
    # read-throughs or multiple gene types
    if len(same_strand_hits) == 0:
        return MatchStats.choose_best(lst)
    # get consensus match from same strand overlapping genes
    shared_introns = 0
    shared_same_strand_bp = 0
    hits = []
    for gene_matches in same_strand_hits.values():
        # work on a copy so the caller's match objects are left alone
        best = MatchStats.choose_best(gene_matches).copy()
        # a gene may be annotated with several types across its
        # transcripts; keep every distinct one
        best.ref_gene_type = ','.join(
            sorted(set(x.ref_gene_type for x in gene_matches)))
        shared_introns += best.shared_introns
        shared_same_strand_bp += best.shared_same_strand_bp
        hits.append(best)
    # sort reference genes by position
    hits = MatchStats.sort_genome(hits)
    # make a new MatchStats object
    hit = hits[0].copy()
    # string-valued reference fields are joined as they are
    for attr in ('ref_transcript_id', 'ref_gene_id', 'ref_orig_gene_id',
                 'ref_gene_name', 'ref_source', 'ref_gene_type',
                 'ref_locus'):
        setattr(hit, attr, ','.join(getattr(x, attr) for x in hits))
    # these fields are converted with str() before joining
    for attr in ('ref_length', 'ref_num_introns'):
        setattr(hit, attr, ','.join(str(getattr(x, attr)) for x in hits))
    hit.shared_same_strand_bp = shared_same_strand_bp
    hit.shared_opp_strand_bp = 0
    hit.shared_introns = shared_introns
    hit.shared_splicing = any(x.shared_splicing for x in hits)
    hit.distance = 0
    if len(same_strand_hits) > 1:
        hit.category = Category.to_str(Category.READ_THROUGH)
    return hit
def choose_best(lst):
    """Return the highest ranked match in ``lst``.

    Matches are ranked lexicographically by, in order: shared splicing,
    fraction of shared introns, fraction of shared same-strand bases,
    fraction of shared opposite-strand bases, a fixed precedence of
    categories (intronic, interleaving, encompassing, intergenic; same
    strand before opposite strand) and finally the smallest absolute
    distance.

    The match object itself is deliberately NOT part of the ranking key:
    when every criterion ties, the earliest match in ``lst`` wins instead
    of the outcome depending on how two match objects compare.

    lst: sequence of match objects.
    Returns the best match, or None when ``lst`` is empty.
    """
    if len(lst) == 0:
        return None

    def rank(m):
        # shared / union style fractions; larger means more similar
        total_introns = m.num_introns + m.ref_num_introns
        if total_introns == 0:
            intron_frac = 0.0
        else:
            intron_frac = (float(m.shared_introns) /
                           (total_introns - m.shared_introns))
        total_bp = m.length + m.ref_length
        same_strand_frac = (float(m.shared_same_strand_bp) /
                            (total_bp - m.shared_same_strand_bp))
        opp_strand_frac = (float(m.shared_opp_strand_bp) /
                           (total_bp - m.shared_opp_strand_bp))
        category_int = Category.to_int(m.category)
        return (int(m.shared_splicing),
                intron_frac,
                same_strand_frac,
                opp_strand_frac,
                int(category_int == Category.INTRONIC_SAME_STRAND),
                int(category_int == Category.INTRONIC_OPP_STRAND),
                int(category_int == Category.INTERLEAVING_SAME_STRAND),
                int(category_int == Category.INTERLEAVING_OPP_STRAND),
                int(category_int == Category.ENCOMPASSING_SAME_STRAND),
                int(category_int == Category.ENCOMPASSING_OPP_STRAND),
                int(category_int == Category.INTERGENIC),
                # closer is better, so negate the distance
                -abs(m.distance))

    # max() returns the first maximal element, which is the same element
    # a stable descending sort would place first
    return max(lst, key=rank)
def choose_best(lst):
    """Return the highest ranked match in ``lst``.

    Matches are ranked lexicographically by, in order: shared splicing,
    fraction of shared introns, fraction of shared same-strand bases,
    fraction of shared opposite-strand bases, a fixed precedence of
    categories (intronic, interleaving, encompassing, intergenic; same
    strand before opposite strand) and finally the smallest absolute
    distance.

    The match object itself is deliberately NOT part of the ranking key:
    when every criterion ties, the earliest match in ``lst`` wins instead
    of the outcome depending on how two match objects compare.

    lst: sequence of match objects.
    Returns the best match, or None when ``lst`` is empty.
    """
    if len(lst) == 0:
        return None

    def rank(m):
        # shared / union style fractions; larger means more similar
        total_introns = m.num_introns + m.ref_num_introns
        if total_introns == 0:
            intron_frac = 0.0
        else:
            intron_frac = (float(m.shared_introns) /
                           (total_introns - m.shared_introns))
        total_bp = m.length + m.ref_length
        same_strand_frac = (float(m.shared_same_strand_bp) /
                            (total_bp - m.shared_same_strand_bp))
        opp_strand_frac = (float(m.shared_opp_strand_bp) /
                           (total_bp - m.shared_opp_strand_bp))
        category_int = Category.to_int(m.category)
        return (int(m.shared_splicing),
                intron_frac,
                same_strand_frac,
                opp_strand_frac,
                int(category_int == Category.INTRONIC_SAME_STRAND),
                int(category_int == Category.INTRONIC_OPP_STRAND),
                int(category_int == Category.INTERLEAVING_SAME_STRAND),
                int(category_int == Category.INTERLEAVING_OPP_STRAND),
                int(category_int == Category.ENCOMPASSING_SAME_STRAND),
                int(category_int == Category.ENCOMPASSING_OPP_STRAND),
                int(category_int == Category.INTERGENIC),
                # closer is better, so negate the distance
                -abs(m.distance))

    # max() returns the first maximal element, which is the same element
    # a stable descending sort would place first
    return max(lst, key=rank)
def impute_transcript(t, gene_map, transcript_map):
    """Derive the type, category and display name of transcript ``t``.

    t: transcript exposing ``attrs``, ``length`` and ``chrom``.
    gene_map: mapping from a name prefix to a dict of gene id -> number;
        updated in place.
    transcript_map: mapping from a name prefix to a dict of
        transcript id -> number; updated in place.

    Returns a ``(transcript_type, transcript_category, transcript_name)``
    tuple.  Type and category are comma-joined, sorted sets because the
    reference gene type may list several comma-separated gene types.
    """
    def numbered(registry, key):
        # hand out 1-based integers in first-seen order and remember them
        if key in registry:
            return registry[key]
        number = len(registry) + 1
        registry[key] = number
        return number

    attrs = t.attrs
    catstr = attrs['category']
    catint = Category.to_int(catstr)
    length = t.length
    gene_type = attrs.get('gene_type', None)
    # one imputed type per distinct reference gene type
    ref_gene_types = set(attrs['ref_gene_type'].split(','))
    transcript_types = set(
        impute_transcript_type(catint, length, gene_type, ref_type)
        for ref_type in ref_gene_types)
    transcript_categories = set(
        GENCODE_CATEGORY_MAP[ttype] for ttype in transcript_types)
    transcript_type = ','.join(sorted(transcript_types))
    transcript_category = ','.join(sorted(transcript_categories))

    # read-through genes are hyphenated into one long name; upper-casing
    # irons out case differences between source databases
    transcript_name = attrs['ref_gene_name'].replace(',', '-').upper()
    if transcript_name == 'NONE':
        # no reference gene name: fall back to the chromosome
        transcript_name = str(t.chrom)
    if catint != Category.SAME_STRAND:
        transcript_name = '%s.%s' % (transcript_name, catstr)

    # number the gene among all genes sharing this name prefix
    gene_id = attrs['gene_id']
    gene_num = numbered(gene_map[transcript_name], gene_id)
    transcript_name = '%s.%d' % (transcript_name, gene_num)

    # number the isoform among all transcripts sharing the gene-level name
    t_id = attrs['transcript_id']
    t_num = numbered(transcript_map[transcript_name], t_id)
    transcript_name = '%s.%d' % (transcript_name, t_num)
    return transcript_type, transcript_category, transcript_name
def impute_transcript(t, gene_map, transcript_map):
    """Derive the type, category and display name of transcript ``t``.

    t: transcript exposing ``attrs``, ``length`` and ``chrom``.
    gene_map: mapping from a name prefix to a dict of gene id -> number;
        updated in place.
    transcript_map: mapping from a name prefix to a dict of
        transcript id -> number; updated in place.

    Returns a ``(transcript_type, transcript_category, transcript_name)``
    tuple.  Type and category are comma-joined, sorted sets because the
    reference gene type may list several comma-separated gene types.
    """
    def numbered(registry, key):
        # hand out 1-based integers in first-seen order and remember them
        if key in registry:
            return registry[key]
        number = len(registry) + 1
        registry[key] = number
        return number

    attrs = t.attrs
    catstr = attrs['category']
    catint = Category.to_int(catstr)
    length = t.length
    gene_type = attrs.get('gene_type', None)
    # one imputed type per distinct reference gene type
    ref_gene_types = set(attrs['ref_gene_type'].split(','))
    transcript_types = set(
        impute_transcript_type(catint, length, gene_type, ref_type)
        for ref_type in ref_gene_types)
    transcript_categories = set(
        GENCODE_CATEGORY_MAP[ttype] for ttype in transcript_types)
    transcript_type = ','.join(sorted(transcript_types))
    transcript_category = ','.join(sorted(transcript_categories))

    # read-through genes are hyphenated into one long name; upper-casing
    # irons out case differences between source databases
    transcript_name = attrs['ref_gene_name'].replace(',', '-').upper()
    if transcript_name == 'NONE':
        # no reference gene name: fall back to the chromosome
        transcript_name = str(t.chrom)
    if catint != Category.SAME_STRAND:
        transcript_name = '%s.%s' % (transcript_name, catstr)

    # number the gene among all genes sharing this name prefix
    gene_id = attrs['gene_id']
    gene_num = numbered(gene_map[transcript_name], gene_id)
    transcript_name = '%s.%d' % (transcript_name, gene_num)

    # number the isoform among all transcripts sharing the gene-level name
    t_id = attrs['transcript_id']
    t_num = numbered(transcript_map[transcript_name], t_id)
    transcript_name = '%s.%d' % (transcript_name, t_num)
    return transcript_type, transcript_category, transcript_name
def main():
    """Annotate every transcript of a GTF file with an imputed gene type.

    Command line: ``[--rename] gtf_file``.  For each transcript the
    ``gene_type`` and ``gene_category`` attributes are set on all of its
    GTF features, which are printed to standard output.  With
    ``--rename`` the ``gene_name`` attribute is replaced by a name built
    from the reference gene name, the category and running gene and
    isoform numbers; a previous ``gene_name`` is kept as
    ``orig_gene_name``.

    Returns 0.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))

    def numbered(registry, key):
        # hand out 1-based integers in first-seen order and remember them
        if key not in registry:
            registry[key] = len(registry) + 1
        return registry[key]

    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(dict)
    transcript_map = collections.defaultdict(dict)
    # the context manager guarantees the input handle is closed even if
    # parsing or annotation fails part way through
    with open(gtf_file) as gtf_fileh:
        for transcripts in parse_gtf(gtf_fileh):
            for t in transcripts:
                catstr = t.attrs['category']
                catint = Category.to_int(catstr)
                gene_type = t.attrs.get('gene_type', None)
                ref_gene_type = t.attrs['ref_gene_type']
                if catint == Category.SAME_STRAND:
                    # impute gene type
                    new_gene_type = ref_gene_type
                elif gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                elif ref_gene_type == 'protein_coding':
                    # categorize based on overlap with reference
                    new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                else:
                    # reference is also non-coding
                    new_gene_type = 'lincRNA'
                # get gene category
                gene_category = GENCODE_CATEGORY_MAP[new_gene_type]
                new_gene_name = None
                if rename:
                    # resolve upper/lower case issue with gene names from
                    # different databases
                    ref_gene_name = t.attrs['ref_gene_name'].upper()
                    # build new gene name
                    if ref_gene_name == 'NONE':
                        new_gene_name = str(t.attrs['source'])
                    elif catint == Category.SAME_STRAND:
                        new_gene_name = str(ref_gene_name)
                    else:
                        new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                    # gene name string is key to a dictionary that
                    # associates each gene id with an integer number
                    gene_id = t.attrs['gene_id']
                    gene_num = numbered(gene_map[new_gene_name], gene_id)
                    # gene id is also key to dict that associates each
                    # isoform of gene with integer number
                    t_id = t.attrs['transcript_id']
                    t_num = numbered(transcript_map[gene_id], t_id)
                    # append gene/transcript integers to gene name
                    new_gene_name = '%s.%d.%d' % (new_gene_name,
                                                  gene_num, t_num)
                # write new attributes
                for f in t.to_gtf_features(source='assemblyline',
                                           score=1000):
                    f.attrs['gene_type'] = new_gene_type
                    f.attrs['gene_category'] = gene_category
                    if rename:
                        if 'gene_name' in f.attrs:
                            f.attrs['orig_gene_name'] = f.attrs['gene_name']
                        f.attrs['gene_name'] = new_gene_name
                    print(str(f))
    return 0
def compare_assemblies(ref_gtf_file, test_gtf_file, output_dir):
    """Compare a test assembly against a reference and write the results.

    The work is split into stages, each guarded by an empty marker file
    in ``output_dir`` so that a stage whose marker already exists is not
    run again:

    * ``merged.done``      - reference and test GTF merged into one file
    * ``sort.done``        - merged GTF sorted
    * ``overlapping.done`` - test transcripts compared locus by locus;
      transcripts without any match are set aside as intergenic
    * ``intergenic.done``  - nearest reference transcripts found for the
      intergenic transcripts
    * ``done``             - per-stage outputs concatenated into
      ``metadata.txt``, ``metadata.consensus.txt`` and
      ``assembly.cmp.gtf``

    ref_gtf_file: path of the reference GTF file.
    test_gtf_file: path of the test (assembly) GTF file.
    output_dir: directory for all outputs; created if missing.
    """
    def touch(path):
        # create (or truncate) an empty marker file
        open(path, 'w').close()

    def write_line(fileh, obj):
        # str(obj) followed by a newline, like ``print >>fileh, obj``
        fileh.write(str(obj) + '\n')

    def concatenate(in_paths, out_path, header=None):
        # write the optional header line, then every input file in order
        with open(out_path, 'w') as outfile:
            if header is not None:
                write_line(outfile, header)
            for in_path in in_paths:
                with open(in_path) as infile:
                    outfile.writelines(infile)

    # output files
    if not os.path.exists(output_dir):
        logging.info('Creating output dir: %s', output_dir)
        os.makedirs(output_dir)
    # merge step
    merged_gtf_file = os.path.join(output_dir, "merged.gtf")
    merged_sorted_gtf_file = os.path.splitext(merged_gtf_file)[0] + ".srt.gtf"
    merge_done_file = os.path.join(output_dir, 'merged.done')
    sort_done_file = os.path.join(output_dir, 'sort.done')
    if not os.path.exists(merge_done_file):
        # merge and sort ref/test gtf files
        logging.info("Merging reference and test GTF files")
        # make temporary file to store merged ref/test gtf files
        with open(merged_gtf_file, "w") as fileh:
            logging.info("Adding reference GTF file")
            add_gtf_file(ref_gtf_file, fileh, is_ref=True)
            logging.info("Adding test GTF file")
            add_gtf_file(test_gtf_file, fileh, is_ref=False)
        touch(merge_done_file)
    if not os.path.exists(sort_done_file):
        logging.info("Sorting merged GTF file")
        # create temp directory
        tmp_dir = os.path.join(output_dir, 'tmp')
        if not os.path.exists(tmp_dir):
            logging.debug("Creating tmp directory '%s'", tmp_dir)
            os.makedirs(tmp_dir)
        sort_gtf(merged_gtf_file, merged_sorted_gtf_file, tmp_dir=tmp_dir)
        # cleanup
        shutil.rmtree(tmp_dir)
        touch(sort_done_file)
    # compare assemblies
    overlapping_gtf_file = os.path.join(output_dir, 'overlapping.gtf')
    intergenic_tmp_gtf_file = os.path.join(output_dir, 'intergenic.tmp.gtf')
    overlapping_file = os.path.join(output_dir, 'overlapping.tsv')
    overlapping_consensus_file = os.path.join(output_dir,
                                              'overlapping.consensus.tsv')
    overlapping_done_file = os.path.join(output_dir, 'overlapping.done')
    stats_file = os.path.join(output_dir, 'stats.txt')
    stats_obj = GlobalStats()
    if not os.path.exists(overlapping_done_file):
        logging.info("Comparing assemblies")
        # every handle is closed on exit, including on errors, so a
        # failed run never leaves open or half-flushed files behind
        with open(overlapping_gtf_file, 'w') as gtf_fileh, \
                open(intergenic_tmp_gtf_file, 'w') as tmp_gtf_fileh, \
                open(overlapping_file, 'w') as overlapping_fileh, \
                open(overlapping_consensus_file, 'w') as consensus_fileh, \
                open(merged_sorted_gtf_file) as merged_fileh:
            for locus_transcripts in parse_gtf(merged_fileh):
                locus_chrom = locus_transcripts[0].chrom
                locus_start = locus_transcripts[0].start
                locus_end = max(t.end for t in locus_transcripts)
                logging.debug("[LOCUS] %s:%d-%d %d transcripts",
                              locus_chrom, locus_start, locus_end,
                              len(locus_transcripts))
                for t, match_stats in compare_locus(locus_transcripts):
                    if len(match_stats) == 0:
                        # write intergenic transcripts to analyze separately
                        t.attrs['category'] = Category.to_str(
                            Category.INTERGENIC)
                        for f in t.to_gtf_features(source='assembly'):
                            write_line(tmp_gtf_fileh, f)
                    else:
                        # get consensus match information
                        consensus_match = MatchStats.consensus(match_stats)
                        assert consensus_match is not None
                        t.attrs['category'] = consensus_match.category
                        # add gtf attributes and write
                        for f in t.to_gtf_features(source='assembly'):
                            consensus_match.add_gtf_attributes(f)
                            write_line(gtf_fileh, f)
                        # tab-delimited text output
                        write_line(consensus_fileh, consensus_match)
                        for ms in match_stats:
                            write_line(overlapping_fileh, ms)
                # compute global statistics
                stats_obj.compute(locus_transcripts)
        logging.info("Reporting global statistics")
        with open(stats_file, 'w') as f:
            write_line(f, stats_obj.report())
        touch(overlapping_done_file)
    # resolve intergenic transcripts
    intergenic_gtf_file = os.path.join(output_dir, 'intergenic.gtf')
    intergenic_file = os.path.join(output_dir, 'intergenic.tsv')
    intergenic_best_file = os.path.join(output_dir, 'intergenic.best.tsv')
    intergenic_done_file = os.path.join(output_dir, 'intergenic.done')
    if not os.path.exists(intergenic_done_file):
        logging.info("Building interval index")
        locus_trees = build_locus_trees(merged_sorted_gtf_file)
        logging.info('Finding nearest matches to intergenic transcripts')
        with open(intergenic_gtf_file, 'w') as gtf_fileh, \
                open(intergenic_file, 'w') as intergenic_fileh, \
                open(intergenic_best_file, 'w') as intergenic_best_fileh, \
                open(intergenic_tmp_gtf_file) as tmp_gtf_fileh:
            for locus_transcripts in parse_gtf(tmp_gtf_fileh):
                for t in locus_transcripts:
                    # find nearest transcripts
                    nearest_transcripts = find_nearest_transcripts(
                        t.chrom, t.start, t.end, t.strand, locus_trees)
                    match_stats = []
                    best_match = None
                    if len(nearest_transcripts) == 0:
                        # nothing nearby: record a reference-less match
                        best_match = MatchStats.from_transcript(t)
                        best_match.category = Category.to_str(
                            Category.INTERGENIC)
                        match_stats.append(best_match)
                    else:
                        for ref, category, dist in nearest_transcripts:
                            # create a match object
                            ms = MatchStats.from_transcript(t, ref)
                            ms.shared_same_strand_bp = 0
                            ms.shared_opp_strand_bp = 0
                            ms.shared_introns = 0
                            ms.shared_splicing = False
                            ms.category = Category.to_str(category)
                            ms.distance = dist
                            match_stats.append(ms)
                        # choose the consensus match
                        best_match = MatchStats.choose_best(match_stats)
                    # add gtf attributes and write
                    for f in t.to_gtf_features(source='assembly'):
                        best_match.add_gtf_attributes(f)
                        write_line(gtf_fileh, f)
                    # write tab-delimited data
                    write_line(intergenic_best_fileh, best_match)
                    for ms in match_stats:
                        write_line(intergenic_fileh, ms)
        touch(intergenic_done_file)
    # merge overlapping and intergenic results
    logging.info('Merging results')
    metadata_file = os.path.join(output_dir, 'metadata.txt')
    metadata_consensus_file = os.path.join(output_dir,
                                           'metadata.consensus.txt')
    assembly_gtf_file = os.path.join(output_dir, 'assembly.cmp.gtf')
    combine_done_file = os.path.join(output_dir, 'done')
    if not os.path.exists(combine_done_file):
        header = '\t'.join(MatchStats.header_fields())
        concatenate([overlapping_file, intergenic_file],
                    metadata_file, header=header)
        concatenate([overlapping_consensus_file, intergenic_best_file],
                    metadata_consensus_file, header=header)
        concatenate([intergenic_gtf_file, overlapping_gtf_file],
                    assembly_gtf_file)
        touch(combine_done_file)
    # cleanup
    logging.info("Done")
def compare_locus(transcripts):
    """Categorize every test transcript of a locus against its references.

    ``transcripts`` mixes reference and test transcripts; they are told
    apart by the ``GTFAttr.REF`` attribute.  For each test transcript the
    generator yields ``(transcript, match_stats)`` where ``match_stats``
    holds one ``MatchStats`` per reference transcript that shares an exon
    node or an intron with it, or has an intron overlapping one of its
    nodes.  The list is empty when no reference transcript is related.
    """
    # reference lookups; every value is a list of reference transcripts
    refs_by_intron = collections.defaultdict(list)    # (strand,start,end)
    refs_by_node = collections.defaultdict(list)      # exon node
    refs_by_splicing = collections.defaultdict(list)  # tuple of introns
    refs_by_id = {}
    # exon boundaries of the whole locus define the nodes exons split into
    boundaries = find_exon_boundaries(transcripts)
    queries = []
    for t in transcripts:
        if not bool(int(t.attrs[GTFAttr.REF])):
            queries.append(t)
            continue
        refs_by_id[t.attrs[GTFAttr.TRANSCRIPT_ID]] = t
        # nodes on the path of this reference transcript
        for node in split_exons(t, boundaries):
            refs_by_node[node].append(t)
        # introns, individually and as a complete splicing pattern
        ref_introns = []
        for start, end in t.iterintrons():
            key = (t.strand, start, end)
            refs_by_intron[key].append(t)
            ref_introns.append(key)
        if len(ref_introns) > 0:
            refs_by_splicing[tuple(ref_introns)].append(t)

    # index introns for fast intersection
    intron_tree = IntervalTree()
    for (strand, start, end), refs in refs_by_intron.items():
        intron_tree.insert_interval(
            Interval(start, end, strand=strand, value=refs))

    for t in queries:
        nodes = list(split_exons(t, boundaries))
        introns = [(t.strand, start, end) for start, end in t.iterintrons()]
        splicing_pattern = tuple(introns)
        # reference transcript id -> accumulated evidence
        matches = collections.defaultdict(Match)
        for node in nodes:
            # reference transcripts that contain this very node
            for ref in refs_by_node.get(node, ()):
                if cmp_strand(t.strand, ref.strand):
                    c = Category.SAME_STRAND
                else:
                    c = Category.OPP_STRAND
                matches[ref.attrs[GTFAttr.TRANSCRIPT_ID]].nodes[c].append(
                    node)
            # reference introns that overlap this node
            for hit in intron_tree.find(*node):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    matches[ref.attrs[GTFAttr.TRANSCRIPT_ID]].nodes[c].append(
                        node)
        # introns shared exactly with reference transcripts
        for intron in introns:
            for ref in refs_by_intron.get(intron, ()):
                matches[ref.attrs[GTFAttr.TRANSCRIPT_ID]].introns.append(
                    intron)
        # references with an identical splicing pattern
        if len(splicing_pattern) > 0:
            for ref in refs_by_splicing.get(splicing_pattern, ()):
                matches[ref.attrs[GTFAttr.TRANSCRIPT_ID]].splicing = True

        # turn the evidence per reference transcript into a category
        match_stats = []
        for ref_id, m in matches.items():
            ref = refs_by_id[ref_id]
            # calculate coverage
            same_strand_bp = sum(
                (node[1] - node[0]) for node in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum(
                (node[1] - node[0]) for node in m.nodes[Category.OPP_STRAND])
            num_shared_introns = len(m.introns)
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif opp_strand_bp > 0:
                c = Category.OPP_STRAND
            else:
                # only intronic evidence is left at this point
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = len(
                    m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = len(
                    m.nodes[Category.INTRONIC_OPP_STRAND])
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                on_same_strand = num_intronic_same_strand > 0
                if num_intronic == len(nodes):
                    # completely intronic
                    c = (Category.INTRONIC_SAME_STRAND if on_same_strand
                         else Category.INTRONIC_OPP_STRAND)
                else:
                    # interleaving: some nodes intronic, others intergenic
                    c = (Category.INTERLEAVING_SAME_STRAND if on_same_strand
                         else Category.INTERLEAVING_OPP_STRAND)
            # create a match object
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
def main():
    """Annotate every transcript of a GTF file with an imputed gene type.

    Command line: ``[--rename] gtf_file``.  For each transcript the
    ``gene_type`` and ``gene_category`` attributes are set on all of its
    GTF features, which are printed to standard output.  With
    ``--rename`` the ``gene_name`` attribute is replaced by a name built
    from the reference gene name, the category and running gene and
    isoform numbers; a previous ``gene_name`` is kept as
    ``orig_gene_name``.

    Returns 0.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))

    def numbered(registry, key):
        # hand out 1-based integers in first-seen order and remember them
        if key not in registry:
            registry[key] = len(registry) + 1
        return registry[key]

    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(dict)
    transcript_map = collections.defaultdict(dict)
    # the context manager guarantees the input handle is closed even if
    # parsing or annotation fails part way through
    with open(gtf_file) as gtf_fileh:
        for transcripts in parse_gtf(gtf_fileh):
            for t in transcripts:
                catstr = t.attrs['category']
                catint = Category.to_int(catstr)
                gene_type = t.attrs.get('gene_type', None)
                ref_gene_type = t.attrs['ref_gene_type']
                if catint == Category.SAME_STRAND:
                    # impute gene type
                    new_gene_type = ref_gene_type
                elif gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                elif ref_gene_type == 'protein_coding':
                    # categorize based on overlap with reference
                    new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                else:
                    # reference is also non-coding
                    new_gene_type = 'lincRNA'
                # get gene category
                gene_category = GENCODE_CATEGORY_MAP[new_gene_type]
                new_gene_name = None
                if rename:
                    # resolve upper/lower case issue with gene names from
                    # different databases
                    ref_gene_name = t.attrs['ref_gene_name'].upper()
                    # build new gene name
                    if ref_gene_name == 'NONE':
                        new_gene_name = str(t.attrs['source'])
                    elif catint == Category.SAME_STRAND:
                        new_gene_name = str(ref_gene_name)
                    else:
                        new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                    # gene name string is key to a dictionary that
                    # associates each gene id with an integer number
                    gene_id = t.attrs['gene_id']
                    gene_num = numbered(gene_map[new_gene_name], gene_id)
                    # gene id is also key to dict that associates each
                    # isoform of gene with integer number
                    t_id = t.attrs['transcript_id']
                    t_num = numbered(transcript_map[gene_id], t_id)
                    # append gene/transcript integers to gene name
                    new_gene_name = '%s.%d.%d' % (new_gene_name,
                                                  gene_num, t_num)
                # write new attributes
                for f in t.to_gtf_features(source='assemblyline',
                                           score=1000):
                    f.attrs['gene_type'] = new_gene_type
                    f.attrs['gene_category'] = gene_category
                    if rename:
                        if 'gene_name' in f.attrs:
                            f.attrs['orig_gene_name'] = f.attrs['gene_name']
                        f.attrs['gene_name'] = new_gene_name
                    print(str(f))
    return 0
def compare_assemblies(ref_gtf_file, test_gtf_file, output_dir):
    """Compare a test assembly against a reference and write the results.

    The work is split into stages, each guarded by an empty marker file
    in ``output_dir`` so that a stage whose marker already exists is not
    run again:

    * ``merged.done``      - reference and test GTF merged into one file
    * ``sort.done``        - merged GTF sorted
    * ``overlapping.done`` - test transcripts compared locus by locus;
      transcripts without any match are set aside as intergenic
    * ``intergenic.done``  - nearest reference transcripts found for the
      intergenic transcripts
    * ``done``             - per-stage outputs concatenated into
      ``metadata.txt``, ``metadata.consensus.txt`` and
      ``assembly.cmp.gtf``

    ref_gtf_file: path of the reference GTF file.
    test_gtf_file: path of the test (assembly) GTF file.
    output_dir: directory for all outputs; created if missing.
    """
    def touch(path):
        # create (or truncate) an empty marker file
        open(path, 'w').close()

    def write_line(fileh, obj):
        # str(obj) followed by a newline, like ``print >>fileh, obj``
        fileh.write(str(obj) + '\n')

    def concatenate(in_paths, out_path, header=None):
        # write the optional header line, then every input file in order
        with open(out_path, 'w') as outfile:
            if header is not None:
                write_line(outfile, header)
            for in_path in in_paths:
                with open(in_path) as infile:
                    outfile.writelines(infile)

    # output files
    if not os.path.exists(output_dir):
        logging.info('Creating output dir: %s', output_dir)
        os.makedirs(output_dir)
    # merge step
    merged_gtf_file = os.path.join(output_dir, "merged.gtf")
    merged_sorted_gtf_file = os.path.splitext(merged_gtf_file)[0] + ".srt.gtf"
    merge_done_file = os.path.join(output_dir, 'merged.done')
    sort_done_file = os.path.join(output_dir, 'sort.done')
    if not os.path.exists(merge_done_file):
        # merge and sort ref/test gtf files
        logging.info("Merging reference and test GTF files")
        # make temporary file to store merged ref/test gtf files
        with open(merged_gtf_file, "w") as fileh:
            logging.info("Adding reference GTF file")
            add_gtf_file(ref_gtf_file, fileh, is_ref=True)
            logging.info("Adding test GTF file")
            add_gtf_file(test_gtf_file, fileh, is_ref=False)
        touch(merge_done_file)
    if not os.path.exists(sort_done_file):
        logging.info("Sorting merged GTF file")
        # create temp directory
        tmp_dir = os.path.join(output_dir, 'tmp')
        if not os.path.exists(tmp_dir):
            logging.debug("Creating tmp directory '%s'", tmp_dir)
            os.makedirs(tmp_dir)
        sort_gtf(merged_gtf_file, merged_sorted_gtf_file, tmp_dir=tmp_dir)
        # cleanup
        shutil.rmtree(tmp_dir)
        touch(sort_done_file)
    # compare assemblies
    overlapping_gtf_file = os.path.join(output_dir, 'overlapping.gtf')
    intergenic_tmp_gtf_file = os.path.join(output_dir, 'intergenic.tmp.gtf')
    overlapping_file = os.path.join(output_dir, 'overlapping.tsv')
    overlapping_consensus_file = os.path.join(output_dir,
                                              'overlapping.consensus.tsv')
    overlapping_done_file = os.path.join(output_dir, 'overlapping.done')
    stats_file = os.path.join(output_dir, 'stats.txt')
    stats_obj = GlobalStats()
    if not os.path.exists(overlapping_done_file):
        logging.info("Comparing assemblies")
        # every handle is closed on exit, including on errors, so a
        # failed run never leaves open or half-flushed files behind
        with open(overlapping_gtf_file, 'w') as gtf_fileh, \
                open(intergenic_tmp_gtf_file, 'w') as tmp_gtf_fileh, \
                open(overlapping_file, 'w') as overlapping_fileh, \
                open(overlapping_consensus_file, 'w') as consensus_fileh, \
                open(merged_sorted_gtf_file) as merged_fileh:
            for locus_transcripts in parse_gtf(merged_fileh):
                locus_chrom = locus_transcripts[0].chrom
                locus_start = locus_transcripts[0].start
                locus_end = max(t.end for t in locus_transcripts)
                logging.debug("[LOCUS] %s:%d-%d %d transcripts",
                              locus_chrom, locus_start, locus_end,
                              len(locus_transcripts))
                for t, match_stats in compare_locus(locus_transcripts):
                    if len(match_stats) == 0:
                        # write intergenic transcripts to analyze separately
                        t.attrs['category'] = Category.to_str(
                            Category.INTERGENIC)
                        for f in t.to_gtf_features(source='assembly'):
                            write_line(tmp_gtf_fileh, f)
                    else:
                        # get consensus match information
                        consensus_match = MatchStats.consensus(match_stats)
                        assert consensus_match is not None
                        t.attrs['category'] = consensus_match.category
                        # add gtf attributes and write
                        for f in t.to_gtf_features(source='assembly'):
                            consensus_match.add_gtf_attributes(f)
                            write_line(gtf_fileh, f)
                        # tab-delimited text output
                        write_line(consensus_fileh, consensus_match)
                        for ms in match_stats:
                            write_line(overlapping_fileh, ms)
                # compute global statistics
                stats_obj.compute(locus_transcripts)
        logging.info("Reporting global statistics")
        with open(stats_file, 'w') as f:
            write_line(f, stats_obj.report())
        touch(overlapping_done_file)
    # resolve intergenic transcripts
    intergenic_gtf_file = os.path.join(output_dir, 'intergenic.gtf')
    intergenic_file = os.path.join(output_dir, 'intergenic.tsv')
    intergenic_best_file = os.path.join(output_dir, 'intergenic.best.tsv')
    intergenic_done_file = os.path.join(output_dir, 'intergenic.done')
    if not os.path.exists(intergenic_done_file):
        logging.info("Building interval index")
        locus_trees = build_locus_trees(merged_sorted_gtf_file)
        logging.info('Finding nearest matches to intergenic transcripts')
        with open(intergenic_gtf_file, 'w') as gtf_fileh, \
                open(intergenic_file, 'w') as intergenic_fileh, \
                open(intergenic_best_file, 'w') as intergenic_best_fileh, \
                open(intergenic_tmp_gtf_file) as tmp_gtf_fileh:
            for locus_transcripts in parse_gtf(tmp_gtf_fileh):
                for t in locus_transcripts:
                    # find nearest transcripts
                    nearest_transcripts = find_nearest_transcripts(
                        t.chrom, t.start, t.end, t.strand, locus_trees)
                    match_stats = []
                    best_match = None
                    if len(nearest_transcripts) == 0:
                        # nothing nearby: record a reference-less match
                        best_match = MatchStats.from_transcript(t)
                        best_match.category = Category.to_str(
                            Category.INTERGENIC)
                        match_stats.append(best_match)
                    else:
                        for ref, category, dist in nearest_transcripts:
                            # create a match object
                            ms = MatchStats.from_transcript(t, ref)
                            ms.shared_same_strand_bp = 0
                            ms.shared_opp_strand_bp = 0
                            ms.shared_introns = 0
                            ms.shared_splicing = False
                            ms.category = Category.to_str(category)
                            ms.distance = dist
                            match_stats.append(ms)
                        # choose the consensus match
                        best_match = MatchStats.choose_best(match_stats)
                    # add gtf attributes and write
                    for f in t.to_gtf_features(source='assembly'):
                        best_match.add_gtf_attributes(f)
                        write_line(gtf_fileh, f)
                    # write tab-delimited data
                    write_line(intergenic_best_fileh, best_match)
                    for ms in match_stats:
                        write_line(intergenic_fileh, ms)
        touch(intergenic_done_file)
    # merge overlapping and intergenic results
    logging.info('Merging results')
    metadata_file = os.path.join(output_dir, 'metadata.txt')
    metadata_consensus_file = os.path.join(output_dir,
                                           'metadata.consensus.txt')
    assembly_gtf_file = os.path.join(output_dir, 'assembly.cmp.gtf')
    combine_done_file = os.path.join(output_dir, 'done')
    if not os.path.exists(combine_done_file):
        header = '\t'.join(MatchStats.header_fields())
        concatenate([overlapping_file, intergenic_file],
                    metadata_file, header=header)
        concatenate([overlapping_consensus_file, intergenic_best_file],
                    metadata_consensus_file, header=header)
        concatenate([intergenic_gtf_file, overlapping_gtf_file],
                    assembly_gtf_file)
        touch(combine_done_file)
    # cleanup
    logging.info("Done")
def compare_locus(transcripts):
    """Compare the test transcripts of a locus with its reference transcripts.

    A transcript is treated as a reference when its ``GTFAttr.REF``
    attribute converts to a non-zero integer; every other transcript is a
    test transcript.

    This is a generator.  For each test transcript it yields a tuple
    ``(transcript, match_stats)`` where ``match_stats`` is a list holding
    one ``MatchStats`` object per reference transcript that shares a node
    or an intron with the test transcript, or has an intron overlapping
    one of its nodes.  The list is empty when no reference qualifies.
    """
    # lookup tables built from the reference transcripts:
    #   node returned by split_exons -> list of reference transcripts
    #   intron (strand, start, end)  -> list of reference transcripts
    #   tuple of introns             -> list of reference transcripts
    #   transcript id                -> reference transcript
    refs_by_node = collections.defaultdict(list)
    refs_by_intron = collections.defaultdict(list)
    refs_by_pattern = collections.defaultdict(list)
    refs_by_id = {}
    # exon boundaries of all transcripts, used to cut exons into nodes
    boundaries = find_exon_boundaries(transcripts)
    queries = []
    for transcript in transcripts:
        if not bool(int(transcript.attrs[GTFAttr.REF])):
            # not a reference: categorize it later
            queries.append(transcript)
            continue
        refs_by_id[transcript.attrs[GTFAttr.TRANSCRIPT_ID]] = transcript
        # nodes along the path of this reference transcript
        for node in split_exons(transcript, boundaries):
            refs_by_node[node].append(transcript)
        # introns of this reference, also collected as an ordered chain
        chain = []
        for start, end in transcript.iterintrons():
            key = (transcript.strand, start, end)
            refs_by_intron[key].append(transcript)
            chain.append(key)
        # transcripts without introns have no splicing pattern
        if chain:
            refs_by_pattern[tuple(chain)].append(transcript)
    # interval index over the reference introns; each interval carries
    # the list of reference transcripts that contain that intron
    intron_index = IntervalTree()
    for (strand, start, end), intron_refs in refs_by_intron.items():
        intron_index.insert_interval(
            Interval(start, end, strand=strand, value=intron_refs))
    # categorize each test transcript against the references
    for transcript in queries:
        nodes = list(split_exons(transcript, boundaries))
        introns = [(transcript.strand, start, end)
                   for start, end in transcript.iterintrons()]
        pattern = tuple(introns)
        # reference transcript id -> Match accumulating the evidence
        matches = collections.defaultdict(Match)
        for node in nodes:
            # reference transcripts that contain this exact node;
            # .get() keeps the defaultdict from growing on a miss
            for ref in refs_by_node.get(node, ()):
                category = (Category.SAME_STRAND
                            if cmp_strand(transcript.strand, ref.strand)
                            else Category.OPP_STRAND)
                ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                matches[ref_id].nodes[category].append(node)
            # reference introns found by the index for this node
            for hit in intron_index.find(*node):
                category = (Category.INTRONIC_SAME_STRAND
                            if cmp_strand(transcript.strand, hit.strand)
                            else Category.INTRONIC_OPP_STRAND)
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    matches[ref_id].nodes[category].append(node)
        # introns identical to a reference intron
        for intron in introns:
            for ref in refs_by_intron.get(intron, ()):
                ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                matches[ref_id].introns.append(intron)
        # references whose whole intron chain equals this transcript's
        if pattern:
            for ref in refs_by_pattern.get(pattern, ()):
                ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                matches[ref_id].splicing = True
        # turn the collected evidence into one MatchStats per reference
        stats = []
        for ref_id, match in matches.items():
            ref = refs_by_id[ref_id]
            same_nodes = match.nodes[Category.SAME_STRAND]
            opp_nodes = match.nodes[Category.OPP_STRAND]
            # bases covered by shared nodes on each strand
            same_bp = sum((seg[1] - seg[0]) for seg in same_nodes)
            opp_bp = sum((seg[1] - seg[0]) for seg in opp_nodes)
            shared_introns = len(match.introns)
            if match.splicing or (shared_introns > 0) or (same_bp > 0):
                category = Category.SAME_STRAND
            elif opp_bp > 0:
                category = Category.OPP_STRAND
            else:
                # no shared exonic sequence: only intron overlaps remain
                n_same = len(same_nodes)
                n_opp = len(opp_nodes)
                n_intronic_same = len(
                    match.nodes[Category.INTRONIC_SAME_STRAND])
                n_intronic_opp = len(
                    match.nodes[Category.INTRONIC_OPP_STRAND])
                assert n_same == 0
                assert n_opp == 0
                n_intronic = n_intronic_same + n_intronic_opp
                assert n_intronic > 0
                same_strand = n_intronic_same > 0
                if n_intronic == len(nodes):
                    # completely intronic
                    category = (Category.INTRONIC_SAME_STRAND
                                if same_strand
                                else Category.INTRONIC_OPP_STRAND)
                else:
                    # interleaving means some nodes are intronic and
                    # the others intergenic
                    category = (Category.INTERLEAVING_SAME_STRAND
                                if same_strand
                                else Category.INTERLEAVING_OPP_STRAND)
            # record the result for this test/reference pair
            ms = MatchStats.from_transcript(transcript, ref)
            ms.shared_same_strand_bp = same_bp
            ms.shared_opp_strand_bp = opp_bp
            ms.shared_introns = shared_introns
            ms.shared_splicing = match.splicing
            ms.category = Category.to_str(category)
            ms.distance = 0
            stats.append(ms)
        yield (transcript, stats)