Exemplo n.º 1
0
 def choose_best(lst):
     """Pick the highest-ranked match from ``lst``.

     Each match is ranked by the following fields, compared in order with
     larger values winning:

     1. ``shared_splicing`` (as 0/1)
     2. ``shared_introns / (num_introns + ref_num_introns - shared_introns)``,
        or 0.0 when both intron counts are zero
     3. ``shared_same_strand_bp /
        (length + ref_length - shared_same_strand_bp)``
     4. the same fraction computed with ``shared_opp_strand_bp``
     5. one 0/1 flag per category, in the order intronic, interleaving,
        encompassing (same strand before opposite strand), then intergenic
     6. ``-abs(distance)``, so that smaller distances rank higher

     Matches whose ranking fields are all equal keep their relative order
     from ``lst``, so the earliest one is returned.  The match objects
     themselves are never compared with each other.

     Raises IndexError if ``lst`` is empty.
     """
     hits = []
     for m in lst:
         total_introns = m.num_introns + m.ref_num_introns
         if total_introns == 0:
             # neither side has introns: avoid dividing by zero
             intron_frac = 0.0
         else:
             intron_frac = float(
                 m.shared_introns) / (total_introns - m.shared_introns)
         same_strand_frac = float(m.shared_same_strand_bp) / (
             m.length + m.ref_length - m.shared_same_strand_bp)
         opp_strand_frac = float(m.shared_opp_strand_bp) / (
             m.length + m.ref_length - m.shared_opp_strand_bp)
         category_int = Category.to_int(m.category)
         rank = (int(m.shared_splicing), intron_frac, same_strand_frac,
                 opp_strand_frac,
                 int(category_int == Category.INTRONIC_SAME_STRAND),
                 int(category_int == Category.INTRONIC_OPP_STRAND),
                 int(category_int == Category.INTERLEAVING_SAME_STRAND),
                 int(category_int == Category.INTERLEAVING_OPP_STRAND),
                 int(category_int == Category.ENCOMPASSING_SAME_STRAND),
                 int(category_int == Category.ENCOMPASSING_OPP_STRAND),
                 int(category_int == Category.INTERGENIC),
                 -abs(m.distance))
         hits.append((rank, m))
     # Sort on the ranking tuple only.  Keeping the match object out of the
     # comparison avoids an arbitrary (Python 2) or failing (Python 3)
     # object comparison when two matches have identical ranks; the sort
     # is stable, so tied matches keep their input order.
     hits.sort(key=lambda entry: entry[0], reverse=True)
     return hits[0][1]
Exemplo n.º 2
0
 def choose_best(lst):
     """Return the match from ``lst`` that ranks highest.

     Ranking compares, in this order and with larger values preferred:
     ``shared_splicing`` (as 0/1); the shared-intron fraction
     ``shared_introns / (num_introns + ref_num_introns - shared_introns)``
     (0.0 when both intron counts are zero); the same-strand and then the
     opposite-strand overlap fractions
     ``shared_bp / (length + ref_length - shared_bp)``; one 0/1 flag per
     category in the order listed in ``category_order`` below; and finally
     ``-abs(distance)`` so that nearer matches win.

     Only these ranking fields are compared.  Matches that tie on every
     field keep their order from ``lst`` and the earliest is returned.

     Raises IndexError if ``lst`` is empty.
     """
     # categories in the order their flags appear in the ranking tuple
     category_order = (Category.INTRONIC_SAME_STRAND,
                       Category.INTRONIC_OPP_STRAND,
                       Category.INTERLEAVING_SAME_STRAND,
                       Category.INTERLEAVING_OPP_STRAND,
                       Category.ENCOMPASSING_SAME_STRAND,
                       Category.ENCOMPASSING_OPP_STRAND,
                       Category.INTERGENIC)

     def rank(m):
         # build the tuple of ranking fields for a single match
         total_introns = m.num_introns + m.ref_num_introns
         if total_introns == 0:
             intron_frac = 0.0
         else:
             intron_frac = (float(m.shared_introns) /
                            (total_introns - m.shared_introns))
         same_strand_frac = (float(m.shared_same_strand_bp) /
                             (m.length + m.ref_length -
                              m.shared_same_strand_bp))
         opp_strand_frac = (float(m.shared_opp_strand_bp) /
                            (m.length + m.ref_length -
                             m.shared_opp_strand_bp))
         category_int = Category.to_int(m.category)
         leading = (int(m.shared_splicing), intron_frac,
                    same_strand_frac, opp_strand_frac)
         flags = tuple(int(category_int == c) for c in category_order)
         return leading + flags + (-abs(m.distance),)

     # Sorting with a key keeps the match objects out of the comparison, so
     # ties never fall through to comparing the objects themselves.  The
     # sort is stable, which keeps tied matches in input order.
     ranked = sorted(lst, key=rank, reverse=True)
     return ranked[0]
def impute_transcript(t, gene_map, transcript_map):
    """Impute the type, category and display name of transcript ``t``.

    ``t`` supplies ``attrs`` (with the keys 'category', 'ref_gene_type',
    'ref_gene_name', 'gene_id', 'transcript_id' and optionally
    'gene_type'), ``length`` and ``chrom``.

    ``gene_map`` and ``transcript_map`` map a name to a dict that numbers
    the ids seen under that name, starting at 1.  Both are updated in
    place.  Indexing them with a new name must yield a dict
    (presumably they are defaultdicts -- confirm against the caller).

    Returns a tuple ``(transcript_type, transcript_category,
    transcript_name)``.  The first two are comma-joined, sorted unique
    values.  The name has the form ``NAME[.category].gene_num.t_num``.
    """
    category_name = t.attrs['category']
    category_code = Category.to_int(category_name)
    tx_length = t.length
    own_gene_type = t.attrs.get('gene_type', None)
    # 'ref_gene_type' may list several comma-separated gene types;
    # reduce it to the unique ones
    unique_ref_types = set(t.attrs['ref_gene_type'].split(','))
    imputed_types = set()
    for ref_type in unique_ref_types:
        imputed_types.add(impute_transcript_type(
            category_code, tx_length, own_gene_type, ref_type))
    imputed_categories = set()
    for imputed_type in imputed_types:
        imputed_categories.add(GENCODE_CATEGORY_MAP[imputed_type])
    # sorted, comma-joined unique values form the combined assignments
    transcript_type = ','.join(sorted(imputed_types))
    transcript_category = ','.join(sorted(imputed_categories))

    # hyphenate the (possibly several) reference gene names and upper-case
    # the result to even out case differences between databases
    name = t.attrs['ref_gene_name'].replace(',', '-').upper()
    if name == 'NONE':
        # no reference gene name: use the chromosome instead
        name = str(t.chrom)
    if category_code != Category.SAME_STRAND:
        name = '%s.%s' % (name, category_name)

    # number each gene id seen under this name
    gene_id = t.attrs['gene_id']
    gene_numbers = gene_map[name]
    gene_num = gene_numbers.setdefault(gene_id, len(gene_numbers) + 1)
    name = '%s.%d' % (name, gene_num)

    # number each transcript id seen under the gene-level name
    t_id = t.attrs['transcript_id']
    isoform_numbers = transcript_map[name]
    t_num = isoform_numbers.setdefault(t_id, len(isoform_numbers) + 1)
    name = '%s.%d' % (name, t_num)
    return transcript_type, transcript_category, name
Exemplo n.º 4
0
 def consensus(lst):
     """Collapse the matches in ``lst`` into one consensus match.

     Returns None when ``lst`` is empty.  When no match has category
     SAME_STRAND, the result is ``MatchStats.choose_best(lst)``.

     Otherwise the SAME_STRAND matches are grouped by ``ref_gene_id`` and
     the best match of each group is taken.  The chosen matches are
     ordered with ``MatchStats.sort_genome`` and merged into a copy of the
     first one:

     * the ``ref_*`` fields become comma-joined strings over all chosen
       matches (``ref_length`` and ``ref_num_introns`` included)
     * ``shared_introns`` and ``shared_same_strand_bp`` are summed
     * ``shared_opp_strand_bp`` and ``distance`` are set to 0
     * ``shared_splicing`` is true if any chosen match has it set
     * ``category`` becomes READ_THROUGH when more than one reference
       gene is involved
     """
     if len(lst) == 0:
         return None
     # group same-strand matches by reference gene; more than one gene
     # indicates a read-through transcript
     same_strand_hits = collections.defaultdict(list)
     for m in lst:
         if Category.to_int(m.category) == Category.SAME_STRAND:
             same_strand_hits[m.ref_gene_id].append(m)
     # no same strand matches so don't need to worry about
     # read-throughs or multiple gene types
     if len(same_strand_hits) == 0:
         return MatchStats.choose_best(lst)
     # get consensus match from same strand overlapping genes
     shared_introns = 0
     shared_same_strand_bp = 0
     hits = []
     for genelst in same_strand_hits.values():
         best = MatchStats.choose_best(genelst).copy()
         # record every gene type reported for this reference gene
         best.ref_gene_type = ','.join(
             sorted(set(x.ref_gene_type for x in genelst)))
         shared_introns += best.shared_introns
         shared_same_strand_bp += best.shared_same_strand_bp
         hits.append(best)
     # sort reference genes by position
     hits = MatchStats.sort_genome(hits)
     # make a new MatchStats object
     hit = hits[0].copy()
     hit.ref_transcript_id = ','.join(x.ref_transcript_id for x in hits)
     hit.ref_gene_id = ','.join(x.ref_gene_id for x in hits)
     hit.ref_orig_gene_id = ','.join(x.ref_orig_gene_id for x in hits)
     hit.ref_gene_name = ','.join(x.ref_gene_name for x in hits)
     hit.ref_source = ','.join(x.ref_source for x in hits)
     hit.ref_gene_type = ','.join(x.ref_gene_type for x in hits)
     hit.ref_locus = ','.join(x.ref_locus for x in hits)
     hit.ref_length = ','.join(str(x.ref_length) for x in hits)
     hit.ref_num_introns = ','.join(str(x.ref_num_introns) for x in hits)
     hit.shared_same_strand_bp = shared_same_strand_bp
     hit.shared_opp_strand_bp = 0
     hit.shared_introns = shared_introns
     hit.shared_splicing = any(x.shared_splicing for x in hits)
     hit.distance = 0
     if len(same_strand_hits) > 1:
         hit.category = Category.to_str(Category.READ_THROUGH)
     return hit
def impute_transcript(t, gene_map, transcript_map):
    """Work out the type, category and numbered name for transcript ``t``.

    Reads 'category', 'ref_gene_type', 'ref_gene_name', 'gene_id',
    'transcript_id' and (optionally) 'gene_type' from ``t.attrs``, plus
    ``t.length`` and ``t.chrom``.

    ``gene_map`` and ``transcript_map`` are indexed by name and must yield
    a dict that assigns consecutive integers (from 1) to the ids seen
    under that name.  Both are modified in place.  NOTE(review): a new
    name is indexed directly, so the maps are presumably defaultdicts --
    confirm against the caller.

    Returns ``(transcript_type, transcript_category, transcript_name)``.
    Type and category are comma-joined sorted unique values; the name
    looks like ``NAME[.category].gene_num.t_num``.
    """
    def assign_number(numbering, key):
        # reuse the number already given to ``key``; otherwise hand out
        # the next free one (numbering starts at 1)
        if key in numbering:
            return numbering[key]
        number = len(numbering) + 1
        numbering[key] = number
        return number

    catstr = t.attrs['category']
    catint = Category.to_int(catstr)
    length = t.length
    gene_type = t.attrs.get('gene_type', None)
    ref_gene_type = t.attrs['ref_gene_type']
    # ref_gene_type can hold multiple comma-separated gene types;
    # impute one transcript type per unique reference gene type
    transcript_types = {
        impute_transcript_type(catint, length, gene_type, ref_type)
        for ref_type in set(ref_gene_type.split(','))
    }
    transcript_categories = {
        GENCODE_CATEGORY_MAP[tx_type] for tx_type in transcript_types
    }
    # join the sorted unique values into combined assignments
    type_label = ','.join(sorted(transcript_types))
    category_label = ','.join(sorted(transcript_categories))

    # read-through genes are hyphenated into one long name; upper-casing
    # resolves case differences between gene-name databases
    ref_names = t.attrs['ref_gene_name'].split(',')
    base_name = '-'.join(ref_names).upper()
    if base_name == 'NONE':
        base_name = str(t.chrom)
    # every category except same-strand is spelled out in the name
    if catint != Category.SAME_STRAND:
        base_name = '%s.%s' % (base_name, catstr)

    # the base name keys a dict numbering each gene id
    gene_id = t.attrs['gene_id']
    gene_num = assign_number(gene_map[base_name], gene_id)
    gene_level_name = '%s.%d' % (base_name, gene_num)

    # the gene-level name keys a dict numbering each transcript id
    t_id = t.attrs['transcript_id']
    t_num = assign_number(transcript_map[gene_level_name], t_id)
    full_name = '%s.%d' % (gene_level_name, t_num)
    return type_label, category_label, full_name
Exemplo n.º 6
0
 def consensus(lst):
     """Reduce a list of matches to a single consensus match.

     * Empty ``lst``: returns None.
     * No match with category SAME_STRAND: returns
       ``MatchStats.choose_best(lst)``.
     * Otherwise: the SAME_STRAND matches are grouped by ``ref_gene_id``,
       the best match per gene is chosen, and the chosen matches are
       ordered with ``MatchStats.sort_genome``.  A copy of the first one
       is returned with its ``ref_*`` fields replaced by comma-joined
       strings over all chosen matches, ``shared_introns`` and
       ``shared_same_strand_bp`` summed, ``shared_opp_strand_bp`` and
       ``distance`` set to 0, and ``shared_splicing`` true if any chosen
       match has it set.  If more than one reference gene was hit, the
       category is set to READ_THROUGH.
     """
     if len(lst) == 0:
         return None
     # first check for read through transcripts involving multiple
     # reference genes
     by_ref_gene = collections.defaultdict(list)
     for m in lst:
         category_int = Category.to_int(m.category)
         if category_int == Category.SAME_STRAND:
             by_ref_gene[m.ref_gene_id].append(m)
     # no same strand matches so don't need to worry about
     # read-throughs or multiple gene types
     if len(by_ref_gene) == 0:
         return MatchStats.choose_best(lst)
     # pick the best match of every same-strand reference gene
     hits = []
     for genelst in by_ref_gene.values():
         chosen = MatchStats.choose_best(genelst).copy()
         # keep every gene type reported for this reference gene
         gene_types = set(other.ref_gene_type for other in genelst)
         chosen.ref_gene_type = ','.join(sorted(gene_types))
         hits.append(chosen)
     # totals do not depend on order, so compute them from the chosen hits
     shared_introns = sum(x.shared_introns for x in hits)
     shared_same_strand_bp = sum(x.shared_same_strand_bp for x in hits)
     # sort reference genes by position
     hits = MatchStats.sort_genome(hits)
     # make a new MatchStats object
     hit = hits[0].copy()
     hit.ref_transcript_id = ','.join(x.ref_transcript_id for x in hits)
     hit.ref_gene_id = ','.join(x.ref_gene_id for x in hits)
     hit.ref_orig_gene_id = ','.join(x.ref_orig_gene_id for x in hits)
     hit.ref_gene_name = ','.join(x.ref_gene_name for x in hits)
     hit.ref_source = ','.join(x.ref_source for x in hits)
     hit.ref_gene_type = ','.join(x.ref_gene_type for x in hits)
     hit.ref_locus = ','.join(x.ref_locus for x in hits)
     hit.ref_length = ','.join(str(x.ref_length) for x in hits)
     hit.ref_num_introns = ','.join(str(x.ref_num_introns) for x in hits)
     hit.shared_same_strand_bp = shared_same_strand_bp
     hit.shared_opp_strand_bp = 0
     hit.shared_introns = shared_introns
     hit.shared_splicing = any(x.shared_splicing for x in hits)
     hit.distance = 0
     if len(by_ref_gene) > 1:
         hit.category = Category.to_str(Category.READ_THROUGH)
     return hit
def main():
    """Annotate every transcript in a GTF file and print it to stdout.

    Command line: ``[--rename] gtf_file``.

    For each transcript a ``gene_type`` is imputed from its 'category',
    'gene_type' and 'ref_gene_type' attributes and its length, and the
    matching ``gene_category`` is looked up in GENCODE_CATEGORY_MAP.  With
    ``--rename`` a new ``gene_name`` of the form
    ``NAME[.category].gene_num.t_num`` is also assigned and any existing
    name is kept as ``orig_gene_name``.  Each GTF feature of the
    transcript is printed with the new attributes.

    Returns 0.  Exits through ``parser.error`` when the file is missing.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(dict)
    transcript_map = collections.defaultdict(dict)
    # the context manager closes the GTF file even if parsing fails
    with open(gtf_file) as gtf_handle:
        for transcripts in parse_gtf(gtf_handle):
            for t in transcripts:
                catstr = t.attrs['category']
                catint = Category.to_int(catstr)
                gene_type = t.attrs.get('gene_type', None)
                ref_gene_type = t.attrs['ref_gene_type']
                if catint == Category.SAME_STRAND:
                    # impute gene type from the reference
                    new_gene_type = ref_gene_type
                elif gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                elif ref_gene_type == 'protein_coding':
                    # categorize based on overlap with reference
                    new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                else:
                    # reference is also non-coding
                    new_gene_type = 'lincRNA'
                # get gene category
                gene_category = GENCODE_CATEGORY_MAP[new_gene_type]
                new_gene_name = None
                if rename:
                    # resolve upper/lower case issue with gene names from
                    # different databases
                    ref_gene_name = t.attrs['ref_gene_name'].upper()
                    # build new gene name
                    if ref_gene_name == 'NONE':
                        new_gene_name = str(t.attrs['source'])
                    elif catint == Category.SAME_STRAND:
                        new_gene_name = str(ref_gene_name)
                    else:
                        new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                    # gene name string is key to a dictionary that
                    # associates each gene id with an integer number
                    gene_id = t.attrs['gene_id']
                    gene_dict = gene_map[new_gene_name]
                    if gene_id not in gene_dict:
                        gene_num = len(gene_dict) + 1
                        gene_dict[gene_id] = gene_num
                    else:
                        gene_num = gene_dict[gene_id]
                    # gene id is also key to dict that associates each
                    # isoform of gene with integer number
                    t_id = t.attrs['transcript_id']
                    t_dict = transcript_map[gene_id]
                    if t_id not in t_dict:
                        t_num = len(t_dict) + 1
                        t_dict[t_id] = t_num
                    else:
                        t_num = t_dict[t_id]
                    # append gene/transcript integers to gene name
                    new_gene_name = '%s.%d.%d' % (new_gene_name, gene_num, t_num)
                # write new attributes
                for f in t.to_gtf_features(source='assemblyline', score=1000):
                    f.attrs['gene_type'] = new_gene_type
                    f.attrs['gene_category'] = gene_category
                    if rename:
                        if 'gene_name' in f.attrs:
                            f.attrs['orig_gene_name'] = f.attrs['gene_name']
                        f.attrs['gene_name'] = new_gene_name
                    # the parenthesised form prints the same text under
                    # both the Python 2 statement and the Python 3 function
                    print(str(f))
    return 0
def main():
    """Impute gene types for the transcripts of a GTF file and print them.

    Usage: ``[--rename] gtf_file``.

    Every transcript gets a new ``gene_type`` (derived from its
    'category', 'gene_type' and 'ref_gene_type' attributes and its
    length) and a ``gene_category`` taken from GENCODE_CATEGORY_MAP.  When
    ``--rename`` is given, a ``gene_name`` of the form
    ``NAME[.category].gene_num.t_num`` is assigned as well and a
    pre-existing name is preserved under ``orig_gene_name``.  All GTF
    features of each transcript are printed to stdout.

    Returns 0.  A missing input file is reported via ``parser.error``.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(dict)
    transcript_map = collections.defaultdict(dict)
    # open the input in a with-block so the handle is always closed
    with open(gtf_file) as fileh:
        for transcripts in parse_gtf(fileh):
            for t in transcripts:
                catstr = t.attrs['category']
                catint = Category.to_int(catstr)
                gene_type = t.attrs.get('gene_type', None)
                ref_gene_type = t.attrs['ref_gene_type']
                if catint == Category.SAME_STRAND:
                    # impute gene type
                    new_gene_type = ref_gene_type
                elif gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                elif ref_gene_type == 'protein_coding':
                    # categorize based on overlap with reference
                    new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                else:
                    # reference is also non-coding
                    new_gene_type = 'lincRNA'
                # get gene category
                gene_category = GENCODE_CATEGORY_MAP[new_gene_type]
                new_gene_name = None
                if rename:
                    # resolve upper/lower case issue with gene names from
                    # different databases
                    ref_gene_name = t.attrs['ref_gene_name'].upper()
                    # build new gene name
                    if ref_gene_name == 'NONE':
                        new_gene_name = str(t.attrs['source'])
                    elif catint == Category.SAME_STRAND:
                        new_gene_name = str(ref_gene_name)
                    else:
                        new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                    # gene name string is key to a dictionary that
                    # associates each gene id with an integer number
                    gene_id = t.attrs['gene_id']
                    gene_dict = gene_map[new_gene_name]
                    if gene_id not in gene_dict:
                        gene_num = len(gene_dict) + 1
                        gene_dict[gene_id] = gene_num
                    else:
                        gene_num = gene_dict[gene_id]
                    # gene id is also key to dict that associates each
                    # isoform of gene with integer number
                    t_id = t.attrs['transcript_id']
                    t_dict = transcript_map[gene_id]
                    if t_id not in t_dict:
                        t_num = len(t_dict) + 1
                        t_dict[t_id] = t_num
                    else:
                        t_num = t_dict[t_id]
                    # append gene/transcript integers to gene name
                    new_gene_name = '%s.%d.%d' % (
                        new_gene_name, gene_num, t_num)
                # write new attributes
                for f in t.to_gtf_features(source='assemblyline',
                                           score=1000):
                    f.attrs['gene_type'] = new_gene_type
                    f.attrs['gene_category'] = gene_category
                    if rename:
                        if 'gene_name' in f.attrs:
                            f.attrs['orig_gene_name'] = f.attrs['gene_name']
                        f.attrs['gene_name'] = new_gene_name
                    # parenthesised so the line is valid and prints the
                    # same text under both Python 2 and Python 3
                    print(str(f))
    return 0