def choose_best(lst):
    """Return the match in ``lst`` that ranks highest against its reference.

    Matches are ranked on the criteria below, compared in order, with the
    larger value winning at each step:

    1. ``shared_splicing`` (as 0/1)
    2. ``shared_introns / (num_introns + ref_num_introns - shared_introns)``,
       or 0.0 when neither side has introns
    3. ``shared_same_strand_bp / (length + ref_length - shared_same_strand_bp)``
    4. ``shared_opp_strand_bp / (length + ref_length - shared_opp_strand_bp)``
    5. one 0/1 flag per category, in the order intronic, interleaving,
       encompassing (same strand before opposite strand each time),
       then intergenic
    6. ``-abs(distance)``, so that closer matches win

    Only these criteria are compared.  The match objects themselves are
    never compared, so a complete tie resolves to the match that appears
    first in ``lst`` instead of depending on object ordering.

    Raises:
        IndexError: if ``lst`` is empty.
    """
    candidates = list(lst)
    if not candidates:
        raise IndexError('choose_best() requires at least one match')

    # order in which the category flags appear in the ranking key
    category_priority = (
        Category.INTRONIC_SAME_STRAND,
        Category.INTRONIC_OPP_STRAND,
        Category.INTERLEAVING_SAME_STRAND,
        Category.INTERLEAVING_OPP_STRAND,
        Category.ENCOMPASSING_SAME_STRAND,
        Category.ENCOMPASSING_OPP_STRAND,
        Category.INTERGENIC,
    )

    def rank(m):
        # ranking key for a single match; larger tuples are better
        total_introns = m.num_introns + m.ref_num_introns
        if total_introns == 0:
            intron_frac = 0.0
        else:
            intron_frac = (float(m.shared_introns) /
                           (total_introns - m.shared_introns))
        same_strand_frac = (
            float(m.shared_same_strand_bp) /
            (m.length + m.ref_length - m.shared_same_strand_bp))
        opp_strand_frac = (
            float(m.shared_opp_strand_bp) /
            (m.length + m.ref_length - m.shared_opp_strand_bp))
        category_int = Category.to_int(m.category)
        category_flags = tuple(int(category_int == c)
                               for c in category_priority)
        return (int(m.shared_splicing), intron_frac, same_strand_frac,
                opp_strand_frac, category_flags, -abs(m.distance))

    # max() returns the first maximal element, so ties keep input order
    return max(candidates, key=rank)
def choose_best(lst):
    """Return the match in ``lst`` that ranks highest against its reference.

    Matches are ranked on the criteria below, compared in order, with the
    larger value winning at each step:

    1. ``shared_splicing`` (as 0/1)
    2. ``shared_introns / (num_introns + ref_num_introns - shared_introns)``,
       or 0.0 when neither side has introns
    3. ``shared_same_strand_bp / (length + ref_length - shared_same_strand_bp)``
    4. ``shared_opp_strand_bp / (length + ref_length - shared_opp_strand_bp)``
    5. one 0/1 flag per category, in the order intronic, interleaving,
       encompassing (same strand before opposite strand each time),
       then intergenic
    6. ``-abs(distance)``, so that closer matches win

    Only these criteria are compared.  The match objects themselves are
    never compared, so a complete tie resolves to the match that appears
    first in ``lst`` instead of depending on object ordering.

    Raises:
        IndexError: if ``lst`` is empty.
    """
    candidates = list(lst)
    if not candidates:
        raise IndexError('choose_best() requires at least one match')

    # order in which the category flags appear in the ranking key
    category_priority = (
        Category.INTRONIC_SAME_STRAND,
        Category.INTRONIC_OPP_STRAND,
        Category.INTERLEAVING_SAME_STRAND,
        Category.INTERLEAVING_OPP_STRAND,
        Category.ENCOMPASSING_SAME_STRAND,
        Category.ENCOMPASSING_OPP_STRAND,
        Category.INTERGENIC,
    )

    def rank(m):
        # ranking key for a single match; larger tuples are better
        total_introns = m.num_introns + m.ref_num_introns
        if total_introns == 0:
            intron_frac = 0.0
        else:
            intron_frac = (float(m.shared_introns) /
                           (total_introns - m.shared_introns))
        same_strand_frac = (
            float(m.shared_same_strand_bp) /
            (m.length + m.ref_length - m.shared_same_strand_bp))
        opp_strand_frac = (
            float(m.shared_opp_strand_bp) /
            (m.length + m.ref_length - m.shared_opp_strand_bp))
        category_int = Category.to_int(m.category)
        category_flags = tuple(int(category_int == c)
                               for c in category_priority)
        return (int(m.shared_splicing), intron_frac, same_strand_frac,
                opp_strand_frac, category_flags, -abs(m.distance))

    # max() returns the first maximal element, so ties keep input order
    return max(candidates, key=rank)
def impute_transcript(t, gene_map, transcript_map):
    """Derive the type, category and display name for transcript ``t``.

    Args:
        t: transcript exposing ``attrs`` (dict-like), ``length`` and
            ``chrom``.
        gene_map: mapping from a name prefix to a dict of
            ``gene_id -> number``.  It is indexed directly with keys that
            may be new, so a missing key must yield a dict (e.g. a
            ``collections.defaultdict``).  Updated in place.
        transcript_map: same shape as ``gene_map`` but keyed by the
            gene-numbered name and holding ``transcript_id -> number``.
            Updated in place.

    Returns:
        A ``(transcript_type, transcript_category, transcript_name)``
        tuple.  Type and category are comma-joined, sorted, unique values;
        the name has the form ``<NAME>[.<category>].<gene#>.<isoform#>``.
    """
    attrs = t.attrs
    category_str = attrs['category']
    category_int = Category.to_int(category_str)
    tx_length = t.length
    own_gene_type = attrs.get('gene_type', None)

    # the reference gene type may hold several comma-separated types;
    # impute one transcript type per unique reference type
    imputed_types = set()
    for ref_type in set(attrs['ref_gene_type'].split(',')):
        imputed_types.add(
            impute_transcript_type(category_int, tx_length,
                                   own_gene_type, ref_type))
    imputed_categories = set()
    for tx_type in imputed_types:
        imputed_categories.add(GENCODE_CATEGORY_MAP[tx_type])

    # sort and join the unique values into conglomerated assignments
    transcript_type = ','.join(sorted(imputed_types))
    transcript_category = ','.join(sorted(imputed_categories))

    # hyphenate read-through gene names and upper-case the result so the
    # same gene spelled differently across databases maps to one name
    name = attrs['ref_gene_name'].replace(',', '-').upper()
    if name == 'NONE':
        # no reference gene name: use the chromosome instead
        name = str(t.chrom)
    if category_int != Category.SAME_STRAND:
        name = '%s.%s' % (name, category_str)

    # number each distinct gene id seen under this name prefix, starting
    # at 1 in order of first appearance
    gene_id = attrs['gene_id']
    gene_numbers = gene_map[name]
    gene_num = gene_numbers.setdefault(gene_id, len(gene_numbers) + 1)
    name = '%s.%d' % (name, gene_num)

    # number each distinct transcript id seen under the gene-numbered name
    transcript_id = attrs['transcript_id']
    isoform_numbers = transcript_map[name]
    isoform_num = isoform_numbers.setdefault(transcript_id,
                                             len(isoform_numbers) + 1)
    name = '%s.%d' % (name, isoform_num)

    return transcript_type, transcript_category, name
def consensus(lst):
    """Collapse a list of matches into a single consensus match.

    If none of the matches has the same-strand category, the result is
    simply ``MatchStats.choose_best(lst)``.  Otherwise the same-strand
    matches are grouped by ``ref_gene_id``, the best match of each group
    is taken, and the per-gene winners are merged into a copy of the
    first one (after ``MatchStats.sort_genome``):

    * the ``ref_*`` fields become comma-joined strings over all genes
      (including ``ref_length`` and ``ref_num_introns``)
    * ``shared_introns`` and ``shared_same_strand_bp`` are summed
    * ``shared_opp_strand_bp`` and ``distance`` are set to 0
    * ``shared_splicing`` is true if any per-gene winner shares splicing
    * ``category`` becomes read-through when more than one gene is hit

    Args:
        lst: sequence of match objects.

    Returns:
        A match object, or ``None`` when ``lst`` is empty.
    """
    if not lst:
        return None

    # group same-strand matches by reference gene so that read-through
    # transcripts spanning several reference genes can be detected
    same_strand_hits = collections.defaultdict(list)
    for m in lst:
        if Category.to_int(m.category) == Category.SAME_STRAND:
            same_strand_hits[m.ref_gene_id].append(m)

    # no same-strand matches, so there are no read-throughs or multiple
    # gene types to reconcile
    if not same_strand_hits:
        return MatchStats.choose_best(lst)

    # take the best match per reference gene and accumulate shared totals
    shared_introns = 0
    shared_same_strand_bp = 0
    hits = []
    for genelst in same_strand_hits.values():
        best = MatchStats.choose_best(genelst).copy()
        # record every gene type seen for this reference gene
        best.ref_gene_type = ','.join(
            sorted(set(x.ref_gene_type for x in genelst)))
        shared_introns += best.shared_introns
        shared_same_strand_bp += best.shared_same_strand_bp
        hits.append(best)

    # sort reference genes by position
    hits = MatchStats.sort_genome(hits)

    # build the consensus on a copy so no per-gene winner is modified
    hit = hits[0].copy()
    hit.ref_transcript_id = ','.join(x.ref_transcript_id for x in hits)
    hit.ref_gene_id = ','.join(x.ref_gene_id for x in hits)
    hit.ref_orig_gene_id = ','.join(x.ref_orig_gene_id for x in hits)
    hit.ref_gene_name = ','.join(x.ref_gene_name for x in hits)
    hit.ref_source = ','.join(x.ref_source for x in hits)
    hit.ref_gene_type = ','.join(x.ref_gene_type for x in hits)
    hit.ref_locus = ','.join(x.ref_locus for x in hits)
    hit.ref_length = ','.join(str(x.ref_length) for x in hits)
    hit.ref_num_introns = ','.join(str(x.ref_num_introns) for x in hits)
    hit.shared_same_strand_bp = shared_same_strand_bp
    hit.shared_opp_strand_bp = 0
    hit.shared_introns = shared_introns
    hit.shared_splicing = any(x.shared_splicing for x in hits)
    hit.distance = 0
    if len(same_strand_hits) > 1:
        hit.category = Category.to_str(Category.READ_THROUGH)
    return hit
def impute_transcript(t, gene_map, transcript_map):
    """Derive the type, category and display name for transcript ``t``.

    Args:
        t: transcript exposing ``attrs`` (dict-like), ``length`` and
            ``chrom``.
        gene_map: mapping from a name prefix to a dict of
            ``gene_id -> number``.  It is indexed directly with keys that
            may be new, so a missing key must yield a dict (e.g. a
            ``collections.defaultdict``).  Updated in place.
        transcript_map: same shape as ``gene_map`` but keyed by the
            gene-numbered name and holding ``transcript_id -> number``.
            Updated in place.

    Returns:
        A ``(transcript_type, transcript_category, transcript_name)``
        tuple.  Type and category are comma-joined, sorted, unique values;
        the name has the form ``<NAME>[.<category>].<gene#>.<isoform#>``.
    """
    attrs = t.attrs
    category_str = attrs['category']
    category_int = Category.to_int(category_str)
    tx_length = t.length
    own_gene_type = attrs.get('gene_type', None)

    # the reference gene type may hold several comma-separated types;
    # impute one transcript type per unique reference type
    imputed_types = set()
    for ref_type in set(attrs['ref_gene_type'].split(',')):
        imputed_types.add(
            impute_transcript_type(category_int, tx_length,
                                   own_gene_type, ref_type))
    imputed_categories = set()
    for tx_type in imputed_types:
        imputed_categories.add(GENCODE_CATEGORY_MAP[tx_type])

    # sort and join the unique values into conglomerated assignments
    transcript_type = ','.join(sorted(imputed_types))
    transcript_category = ','.join(sorted(imputed_categories))

    # hyphenate read-through gene names and upper-case the result so the
    # same gene spelled differently across databases maps to one name
    name = attrs['ref_gene_name'].replace(',', '-').upper()
    if name == 'NONE':
        # no reference gene name: use the chromosome instead
        name = str(t.chrom)
    if category_int != Category.SAME_STRAND:
        name = '%s.%s' % (name, category_str)

    # number each distinct gene id seen under this name prefix, starting
    # at 1 in order of first appearance
    gene_id = attrs['gene_id']
    gene_numbers = gene_map[name]
    gene_num = gene_numbers.setdefault(gene_id, len(gene_numbers) + 1)
    name = '%s.%d' % (name, gene_num)

    # number each distinct transcript id seen under the gene-numbered name
    transcript_id = attrs['transcript_id']
    isoform_numbers = transcript_map[name]
    isoform_num = isoform_numbers.setdefault(transcript_id,
                                             len(isoform_numbers) + 1)
    name = '%s.%d' % (name, isoform_num)

    return transcript_type, transcript_category, name
def consensus(lst):
    """Collapse a list of matches into a single consensus match.

    If none of the matches has the same-strand category, the result is
    simply ``MatchStats.choose_best(lst)``.  Otherwise the same-strand
    matches are grouped by ``ref_gene_id``, the best match of each group
    is taken, and the per-gene winners are merged into a copy of the
    first one (after ``MatchStats.sort_genome``):

    * the ``ref_*`` fields become comma-joined strings over all genes
      (including ``ref_length`` and ``ref_num_introns``)
    * ``shared_introns`` and ``shared_same_strand_bp`` are summed
    * ``shared_opp_strand_bp`` and ``distance`` are set to 0
    * ``shared_splicing`` is true if any per-gene winner shares splicing
    * ``category`` becomes read-through when more than one gene is hit

    Args:
        lst: sequence of match objects.

    Returns:
        A match object, or ``None`` when ``lst`` is empty.
    """
    if not lst:
        return None

    # group same-strand matches by reference gene so that read-through
    # transcripts spanning several reference genes can be detected
    same_strand_hits = collections.defaultdict(list)
    for m in lst:
        if Category.to_int(m.category) == Category.SAME_STRAND:
            same_strand_hits[m.ref_gene_id].append(m)

    # no same-strand matches, so there are no read-throughs or multiple
    # gene types to reconcile
    if not same_strand_hits:
        return MatchStats.choose_best(lst)

    # take the best match per reference gene and accumulate shared totals
    shared_introns = 0
    shared_same_strand_bp = 0
    hits = []
    for genelst in same_strand_hits.values():
        best = MatchStats.choose_best(genelst).copy()
        # record every gene type seen for this reference gene
        best.ref_gene_type = ','.join(
            sorted(set(x.ref_gene_type for x in genelst)))
        shared_introns += best.shared_introns
        shared_same_strand_bp += best.shared_same_strand_bp
        hits.append(best)

    # sort reference genes by position
    hits = MatchStats.sort_genome(hits)

    # build the consensus on a copy so no per-gene winner is modified
    hit = hits[0].copy()
    hit.ref_transcript_id = ','.join(x.ref_transcript_id for x in hits)
    hit.ref_gene_id = ','.join(x.ref_gene_id for x in hits)
    hit.ref_orig_gene_id = ','.join(x.ref_orig_gene_id for x in hits)
    hit.ref_gene_name = ','.join(x.ref_gene_name for x in hits)
    hit.ref_source = ','.join(x.ref_source for x in hits)
    hit.ref_gene_type = ','.join(x.ref_gene_type for x in hits)
    hit.ref_locus = ','.join(x.ref_locus for x in hits)
    hit.ref_length = ','.join(str(x.ref_length) for x in hits)
    hit.ref_num_introns = ','.join(str(x.ref_num_introns) for x in hits)
    hit.shared_same_strand_bp = shared_same_strand_bp
    hit.shared_opp_strand_bp = 0
    hit.shared_introns = shared_introns
    hit.shared_splicing = any(x.shared_splicing for x in hits)
    hit.distance = 0
    if len(same_strand_hits) > 1:
        hit.category = Category.to_str(Category.READ_THROUGH)
    return hit
def main():
    """Annotate a GTF file with imputed gene types and print it to stdout.

    Command line: ``[--rename] gtf_file``.

    For every transcript parsed from ``gtf_file`` a ``gene_type`` and
    ``gene_category`` are chosen and written onto each of its GTF
    features, which are then printed.  With ``--rename`` the features
    also get a new ``gene_name`` of the form
    ``<NAME>[.<category>].<gene#>.<isoform#>``; any existing name is kept
    under ``orig_gene_name``.

    Returns:
        0 on success.  A missing input file is reported through
        ``parser.error``.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))

    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(dict)
    transcript_map = collections.defaultdict(dict)

    # the context manager closes the input file even if parsing fails
    with open(gtf_file) as gtf_handle:
        for transcripts in parse_gtf(gtf_handle):
            for t in transcripts:
                catstr = t.attrs['category']
                catint = Category.to_int(catstr)
                gene_type = t.attrs.get('gene_type', None)
                ref_gene_type = t.attrs['ref_gene_type']

                # impute gene type
                if catint == Category.SAME_STRAND:
                    new_gene_type = ref_gene_type
                elif gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                elif ref_gene_type == 'protein_coding':
                    # categorize based on overlap with reference
                    new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                else:
                    # reference is also non-coding
                    new_gene_type = 'lincRNA'
                # get gene category
                gene_category = GENCODE_CATEGORY_MAP[new_gene_type]

                new_gene_name = None
                if rename:
                    # resolve upper/lower case issue with gene names
                    # from different databases
                    ref_gene_name = t.attrs['ref_gene_name'].upper()
                    # build new gene name
                    if ref_gene_name == 'NONE':
                        new_gene_name = str(t.attrs['source'])
                    elif catint == Category.SAME_STRAND:
                        new_gene_name = str(ref_gene_name)
                    else:
                        new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                    # number each distinct gene id seen under this gene
                    # name, starting at 1 in order of first appearance
                    gene_id = t.attrs['gene_id']
                    gene_dict = gene_map[new_gene_name]
                    gene_num = gene_dict.setdefault(gene_id,
                                                    len(gene_dict) + 1)
                    # number each distinct isoform seen for this gene id
                    t_id = t.attrs['transcript_id']
                    t_dict = transcript_map[gene_id]
                    t_num = t_dict.setdefault(t_id, len(t_dict) + 1)
                    # append gene/transcript integers to gene name
                    new_gene_name = '%s.%d.%d' % (new_gene_name,
                                                  gene_num, t_num)

                # write new attributes
                for f in t.to_gtf_features(source='assemblyline',
                                           score=1000):
                    f.attrs['gene_type'] = new_gene_type
                    f.attrs['gene_category'] = gene_category
                    if rename:
                        if 'gene_name' in f.attrs:
                            f.attrs['orig_gene_name'] = f.attrs['gene_name']
                        f.attrs['gene_name'] = new_gene_name
                    # single parenthesized argument: prints the same text
                    # whether print is a statement or a function
                    print(str(f))
    return 0
def main():
    """Annotate a GTF file with imputed gene types and print it to stdout.

    Command line: ``[--rename] gtf_file``.

    For every transcript parsed from ``gtf_file`` a ``gene_type`` and
    ``gene_category`` are chosen and written onto each of its GTF
    features, which are then printed.  With ``--rename`` the features
    also get a new ``gene_name`` of the form
    ``<NAME>[.<category>].<gene#>.<isoform#>``; any existing name is kept
    under ``orig_gene_name``.

    Returns:
        0 on success.  A missing input file is reported through
        ``parser.error``.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))

    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(dict)
    transcript_map = collections.defaultdict(dict)

    # the context manager closes the input file even if parsing fails
    with open(gtf_file) as gtf_handle:
        for transcripts in parse_gtf(gtf_handle):
            for t in transcripts:
                catstr = t.attrs['category']
                catint = Category.to_int(catstr)
                gene_type = t.attrs.get('gene_type', None)
                ref_gene_type = t.attrs['ref_gene_type']

                # impute gene type
                if catint == Category.SAME_STRAND:
                    new_gene_type = ref_gene_type
                elif gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                elif ref_gene_type == 'protein_coding':
                    # categorize based on overlap with reference
                    new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                else:
                    # reference is also non-coding
                    new_gene_type = 'lincRNA'
                # get gene category
                gene_category = GENCODE_CATEGORY_MAP[new_gene_type]

                new_gene_name = None
                if rename:
                    # resolve upper/lower case issue with gene names
                    # from different databases
                    ref_gene_name = t.attrs['ref_gene_name'].upper()
                    # build new gene name
                    if ref_gene_name == 'NONE':
                        new_gene_name = str(t.attrs['source'])
                    elif catint == Category.SAME_STRAND:
                        new_gene_name = str(ref_gene_name)
                    else:
                        new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                    # number each distinct gene id seen under this gene
                    # name, starting at 1 in order of first appearance
                    gene_id = t.attrs['gene_id']
                    gene_dict = gene_map[new_gene_name]
                    gene_num = gene_dict.setdefault(gene_id,
                                                    len(gene_dict) + 1)
                    # number each distinct isoform seen for this gene id
                    t_id = t.attrs['transcript_id']
                    t_dict = transcript_map[gene_id]
                    t_num = t_dict.setdefault(t_id, len(t_dict) + 1)
                    # append gene/transcript integers to gene name
                    new_gene_name = '%s.%d.%d' % (new_gene_name,
                                                  gene_num, t_num)

                # write new attributes
                for f in t.to_gtf_features(source='assemblyline',
                                           score=1000):
                    f.attrs['gene_type'] = new_gene_type
                    f.attrs['gene_category'] = gene_category
                    if rename:
                        if 'gene_name' in f.attrs:
                            f.attrs['orig_gene_name'] = f.attrs['gene_name']
                        f.attrs['gene_name'] = new_gene_name
                    # single parenthesized argument: prints the same text
                    # whether print is a statement or a function
                    print(str(f))
    return 0