def from_transcript(t, ref=None):
    # factory: build a MatchStats record for transcript 't', optionally
    # annotated with fields from the matching reference transcript 'ref'
    self = MatchStats()
    self.transcript_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
    self.gene_id = t.attrs[GTFAttr.GENE_ID]
    self.locus = '%s:%d-%d[%s]' % (t.chrom, t.start, t.end,
                                   strand_int_to_str(t.strand))
    self.length = t.length
    self.num_introns = len(t.exons) - 1
    if ref is not None:
        self.ref_transcript_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
        self.ref_gene_id = ref.attrs[GTFAttr.GENE_ID]
        self.ref_locus = '%s:%d-%d[%s]' % (ref.chrom, ref.start, ref.end,
                                           strand_int_to_str(ref.strand))
        self.ref_length = ref.length
        self.ref_num_introns = len(ref.exons) - 1
        self.ref_orig_gene_id = ref.attrs.get('orig_gene_id',
                                              self.ref_gene_id)
        self.ref_source = ref.attrs.get('source', 'NA')
        # prefer gene_name, fall back to transcript_name, then gene_id
        if 'gene_name' in ref.attrs:
            self.ref_gene_name = ref.attrs['gene_name']
        elif 'transcript_name' in ref.attrs:
            self.ref_gene_name = ref.attrs['transcript_name']
        else:
            self.ref_gene_name = self.ref_gene_id
        # gene_type/gene_biotype/transcript_type vary by annotation source
        if 'gene_type' in ref.attrs:
            self.ref_gene_type = ref.attrs['gene_type']
        elif 'gene_biotype' in ref.attrs:
            self.ref_gene_type = ref.attrs['gene_biotype']
        elif 'transcript_type' in ref.attrs:
            self.ref_gene_type = ref.attrs['transcript_type']
        else:
            self.ref_gene_type = 'None'
    return self
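# The snippets in this file lean on a small strand-encoding helper defined
# elsewhere in the package. A minimal sketch of the assumed convention
# follows (names match their usage below; the exact integer values are an
# assumption, not the package's authoritative definition):
POS_STRAND = 0
NEG_STRAND = 1
NO_STRAND = 2

def strand_int_to_str(strand):
    # map the integer strand codes above to GTF/BED strand symbols
    if strand == POS_STRAND:
        return '+'
    elif strand == NEG_STRAND:
        return '-'
    return '.'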
def write_bed(chrom, name, strand, score, exons, flank, chrom_length):
    # exons must be sorted: first exon starts leftmost, last exon ends rightmost
    assert all(exons[0].start < x.start for x in exons[1:])
    assert all(exons[-1].end > x.end for x in exons[:-1])
    chrom_len = chrom_length[chrom]
    # extend transcript bounds by 'flank' and clamp to the chromosome
    tx_start = max(0, exons[0].start - flank)
    tx_end = min(chrom_len, exons[-1].end + flank)
    block_sizes = []
    block_starts = []
    for e in exons:
        block_starts.append(e.start - tx_start)
        block_sizes.append(e.end - e.start)
    # make BED12 fields (thickStart == thickEnd, so no thick region is
    # drawn; note that with flank > 0 the first block no longer starts at
    # offset 0, which strict BED validators may reject)
    fields = [chrom,
              str(tx_start),
              str(tx_end),
              str(name),
              str(score),
              strand_int_to_str(strand),
              str(tx_start),
              str(tx_start),
              '0',
              str(len(exons)),
              ','.join(map(str, block_sizes)) + ',',
              ','.join(map(str, block_starts)) + ',']
    return fields
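# Hedged usage sketch for write_bed: the Exon namedtuple below is a
# stand-in for the exon objects used by the rest of the package and is not
# part of the original code.
from collections import namedtuple
Exon = namedtuple('Exon', ['start', 'end'])

example_fields = write_bed('chr1', 'TU1', POS_STRAND, 0,
                           [Exon(1000, 1200), Exon(1500, 1700)],
                           flank=50, chrom_length={'chr1': 249250621})
print '\t'.join(example_fields)
# -> chr1 950 1750 TU1 0 + 950 950 0 2 200,200, 50,550,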
def find_first_orf(t, ref_fa):
    orf = ORFInfo()
    orf.transcript_id = t.attrs['transcript_id']
    orf.gene_id = t.attrs['gene_id']
    orf.chrom = t.chrom
    # get transcript sequence
    seq = get_transcript_dna_sequence(t, ref_fa)
    # find first ATG in sequence
    start = seq.find('ATG')
    if start == -1:
        # no start codon found: emit an empty, zero-length ORF
        orf.start = t.start
        orf.end = t.start
        orf.strand = '.'
        orf.exons = []
        orf.seq = ''
    else:
        # translate from the first ATG; 'end' is the transcript offset
        # of the last base of the ORF (inclusive)
        aa_seq = translate_orf(seq[start:])
        end = start + 3 * len(aa_seq) - 1
        orf_start, orf_end, orf_exons = \
            orf_to_genome(t, start, end)
        orf.start = orf_start
        orf.end = orf_end
        orf.strand = strand_int_to_str(t.strand)
        orf.exons = orf_exons
        orf.seq = aa_seq
    return orf
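# Usage sketch (comments only): find_first_orf expects a parsed transcript
# 't' and an indexed genome 'ref_fa'; the helpers it calls
# (get_transcript_dna_sequence, translate_orf, orf_to_genome, ORFInfo) live
# elsewhere in the package. A typical call site might look like:
#   orf = find_first_orf(t, ref_fa)
#   if orf.seq:  # empty when no ATG was found
#       print orf.transcript_id, orf.strand, orf.seq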
def write_bed(chrom, name, strand, score, exons):
    assert all(exons[0].start < x.start for x in exons[1:])
    assert all(exons[-1].end > x.end for x in exons[:-1])
    tx_start = exons[0].start
    tx_end = exons[-1].end
    block_sizes = []
    block_starts = []
    for e in exons:
        block_starts.append(e.start - tx_start)
        block_sizes.append(e.end - e.start)
    # make bed fields
    fields = [
        chrom,
        str(tx_start),
        str(tx_end),
        str(name),
        str(score),
        strand_int_to_str(strand),
        str(tx_start),
        str(tx_start),
        "0",
        str(len(exons)),
        ",".join(map(str, block_sizes)) + ",",
        ",".join(map(str, block_starts)) + ",",
    ]
    return fields
def assemble_locus(transcripts, locus_id_value_obj, gene_id_value_obj,
                   tss_id_value_obj, t_id_value_obj, config, gtf_fileh,
                   bed_fileh, bedgraph_filehs):
    # gather properties of locus
    locus_chrom = transcripts[0].chrom
    locus_start = transcripts[0].start
    locus_end = max(tx.end for tx in transcripts)
    logging.debug("[LOCUS] %s:%d-%d %d transcripts" %
                  (locus_chrom, locus_start, locus_end, len(transcripts)))
    locus_id_str = "L%d" % (locus_id_value_obj.next())
    # filter transcripts
    logging.debug("\tFiltering transcripts")
    transcripts = filter_transcripts(transcripts, config.min_transcript_length,
                                     config.guided)
    # build transcript graphs
    transcript_graphs = \
        create_transcript_graphs(locus_chrom, transcripts,
                                 min_trim_length=config.min_trim_length,
                                 trim_utr_fraction=config.trim_utr_fraction,
                                 trim_intron_fraction=config.trim_intron_fraction,
                                 create_bedgraph=config.create_bedgraph,
                                 bedgraph_filehs=bedgraph_filehs)
    for tg in transcript_graphs:
        logging.debug(
            "Subgraph %s:%d-%d(%s) %d nodes %d paths" %
            (locus_chrom, locus_start, locus_end, strand_int_to_str(
                tg.strand), len(tg.Gsub), len(tg.partial_paths)))
        # assemble subgraph
        assemble_gene(locus_chrom, locus_id_str, gene_id_value_obj,
                      tss_id_value_obj, t_id_value_obj, tg.Gsub, tg.strand,
                      tg.partial_paths, config, gtf_fileh, bed_fileh)
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id, transcript_id, score, frac):
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = strand_int_to_str(strand)
    attr_dict = {"locus_id": locus_id, "gene_id": gene_id, "tss_id": tss_id, "transcript_id": transcript_id}
    f = GTFFeature()
    f.seqid = chrom
    f.source = "assemblyline"
    f.feature_type = "transcript"
    f.start = tx_start
    f.end = tx_end
    # scale the fraction to the conventional integer 0-1000 GTF/BED score
    f.score = int(round(1000.0 * frac))
    f.strand = strand_str
    f.phase = "."
    f.attrs = {"score": "%.3f" % score, "frac": "%.3f" % frac}
    f.attrs.update(attr_dict)
    yield f
    for i, e in enumerate(exons):
        f = GTFFeature()
        f.seqid = chrom
        f.source = "assemblyline"
        f.feature_type = "exon"
        f.start = e.start
        f.end = e.end
        # same 0-1000 scaling as the transcript feature
        f.score = int(round(1000.0 * frac))
        f.strand = strand_str
        f.phase = "."
        f.attrs = {"exon_number": i + 1}
        f.attrs.update(attr_dict)
        yield f
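# Hedged usage sketch for get_gtf_features. The real GTFFeature class is
# defined elsewhere in the package; the bare stand-in below only gives the
# generator an object to hang attributes on, so the example can run in
# isolation (it reuses the Exon namedtuple and POS_STRAND sketched above).
class GTFFeature(object):
    pass

for feat in get_gtf_features('chr1', POS_STRAND,
                             [Exon(100, 200), Exon(300, 400)],
                             'L1', 'G1', 'TSS1', 'T1',
                             score=0.5, frac=0.5):
    print '\t'.join(map(str, [feat.seqid, feat.source, feat.feature_type,
                              feat.start, feat.end, feat.score,
                              feat.strand, feat.phase, feat.attrs]))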
def assemble_locus(
    transcripts,
    locus_id_value_obj,
    gene_id_value_obj,
    tss_id_value_obj,
    t_id_value_obj,
    config,
    gtf_fileh,
    bed_fileh,
    bedgraph_filehs,
):
    def get_bedgraph_lines(chrom, G):
        for n in sorted(G.nodes()):
            if n.start < 0:
                continue
            fields = (chrom, n.start, n.end, G.node[n][NODE_SCORE])
            yield fields

    # gather properties of locus
    locus_chrom = transcripts[0].chrom
    locus_start = transcripts[0].start
    locus_end = max(tx.end for tx in transcripts)
    logging.debug("[LOCUS] %s:%d-%d %d transcripts" % (locus_chrom, locus_start, locus_end, len(transcripts)))
    locus_id_str = "L%d" % (locus_id_value_obj.next())
    # filter transcripts
    logging.debug("\tFiltering transcripts")
    transcripts = filter_transcripts(transcripts, config.min_transcript_length, config.guided)
    # build transcript graphs
    for G, strand, strand_transcripts in create_transcript_graphs(transcripts):
        # output bedgraph
        if config.create_bedgraph:
            for fields in get_bedgraph_lines(locus_chrom, G):
                print >>bedgraph_filehs[strand], "\t".join(map(str, fields))
        # process transcript graphs
        for Gsub, strand, partial_paths in prune_transcript_graph(
            G, strand, strand_transcripts, config.min_trim_length, config.trim_utr_fraction, config.trim_intron_fraction
        ):
            logging.debug(
                "Subgraph %s:%d-%d(%s) %d nodes %d paths"
                % (locus_chrom, locus_start, locus_end, strand_int_to_str(strand), len(Gsub), len(partial_paths))
            )
            # assemble subgraph
            assemble_gene(
                locus_chrom,
                locus_id_str,
                gene_id_value_obj,
                tss_id_value_obj,
                t_id_value_obj,
                Gsub,
                strand,
                partial_paths,
                config,
                gtf_fileh,
                bed_fileh,
            )
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', type=int, dest='upstream', default=1000)
    parser.add_argument('-d', type=int, dest='downstream', default=0)
    parser.add_argument('gtf_file')
    parser.add_argument('chrom_sizes')
    args = parser.parse_args()
    upstream = args.upstream
    downstream = args.downstream
    chrom_sizes_file = args.chrom_sizes
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    chrom_sizes = {}
    with open(chrom_sizes_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom_sizes[fields[0]] = int(fields[1])
    # parse
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug(
            "[LOCUS] %s:%d-%d %d transcripts" %
            (locus_chrom, locus_start, locus_end, len(locus_transcripts)))
        tss_ids = set()
        for t in locus_transcripts:
            if t.strand == NO_STRAND:
                continue
            tss_id = t.attrs['tss_id']
            if tss_id in tss_ids:
                continue
            tss_ids.add(tss_id)
            if t.strand == POS_STRAND:
                start = t.exons[0].start - upstream
                start = max(0, start)
                end = t.exons[0].start + downstream
                end = min(t.end, end)
            else:
                start = t.exons[-1].end - downstream
                start = max(t.start, start)
                end = t.exons[-1].end + upstream
                end = min(end, chrom_sizes[locus_chrom])
            print '\t'.join(
                map(str, [
                    locus_chrom, start, end, tss_id, 0,
                    strand_int_to_str(t.strand)
                ]))

    return 0
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true", 
                        dest="verbose", default=False)
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if not os.path.exists(args.ref_gtf_file):
        parser.error("GTF file %s not found" % (args.ref_gtf_file))
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")   
    # show parameters
    logging.info("Parameters:")
    logging.info("verbose logging:       %s" % (args.verbose))
    logging.info("ref gtf file:          %s" % (args.ref_gtf_file))
    logging.info("assembly gtf file:     %s" % (args.gtf_file))
    # find CDS regions
    if not os.path.exists('tmp.srt.gtf'):
        with open('tmp.gtf', 'w') as outfileh:
            logging.info("Reading CDS regions from reference GTF")
            for f in get_cds_features(args.ref_gtf_file):
                print >>outfileh, str(f)
            logging.info("Reading transcripts from assembly GTF")
            i = 0
            for f in GTFFeature.parse(open(args.gtf_file)):
                print >>outfileh, str(f)
                i += 1
                if i % 100000 == 0:
                    logging.debug("Parsed %d transcripts" % (i))
        logging.info("Sorting GTF file")
        sort_gtf('tmp.gtf', 'tmp.srt.gtf')
    for locus_transcripts in parse_gtf(open('tmp.srt.gtf')):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                      (locus_chrom, locus_start, locus_end, 
                       len(locus_transcripts)))
        for start, end, strand, m, t, c in categorize(locus_transcripts):
            fields = [locus_chrom, str(start), str(end),
                      '%s|%s|%s' % (m, t, c), '0',
                      strand_int_to_str(strand)]
            print '\t'.join(fields)
    return 0
def get_all_transcript_orfs(t, ref_fa, min_orf_length):
    # get amino acid sequences in all reading frames
    aa_seqs = translate_transcript(t, ref_fa)
    # get ORFs
    for frame, aa_seq in enumerate(aa_seqs):
        for aa_start, aa_end, orf_seq in find_orfs(aa_seq):
            if len(orf_seq) < min_orf_length:
                continue
            orf_start, orf_end, orf_exons = \
                orf_to_genome(t, frame, aa_start, aa_end)
            orf = ORFInfo()
            orf.transcript_id = t.attrs['transcript_id']
            orf.gene_id = t.attrs['gene_id']
            orf.frame = frame
            orf.chrom = t.chrom
            orf.start = orf_start
            orf.end = orf_end
            orf.strand = strand_int_to_str(t.strand)
            orf.exons = orf_exons
            orf.seq = orf_seq
            yield orf
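# Usage sketch (comments only): get_all_transcript_orfs leans on package
# helpers (translate_transcript, find_orfs, orf_to_genome, ORFInfo) that
# are not reproduced here. A caller might emit each ORF as BED via the
# flank-less write_bed above, e.g.:
#   for orf in get_all_transcript_orfs(t, ref_fa, min_orf_length=30):
#       print '\t'.join(write_bed(orf.chrom, orf.transcript_id,
#                                 t.strand, 0, orf.exons))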
def trim_graph(G, strand, min_trim_length, trim_utr_fraction,
               trim_intron_fraction):
    # get 'chains' of contiguous non-intron nodes with edge degree of
    # one or less
    node_chain_map, chains = get_chains(G, introns=False)
    # setup dictionaries of predecessors and successors
    successor_dict = {}
    for n, nbrdict in G.adjacency_iter():
        successor_dict[n] = nbrdict.keys()
    predecessor_dict = {}
    G.reverse(copy=False)
    for n, nbrdict in G.adjacency_iter():
        predecessor_dict[n] = nbrdict.keys()
    G.reverse(copy=False)
    # setup intron data structures
    introns = {}
    intron_tree = IntervalTree()
    reverse = (strand == NEG_STRAND)
    for u, nbrdict in G.adjacency_iter():
        for v in nbrdict:
            if reverse:
                left, right = v, u
            else:
                left, right = u, v
            # skip contiguous nodes
            if left.end == right.start:
                continue
            # calculate score of the chains
            u_chain_nodes = chains[node_chain_map[u]]
            u_score = max(G.node[n][NODE_SCORE] for n in u_chain_nodes)
            v_chain_nodes = chains[node_chain_map[v]]
            v_score = max(G.node[n][NODE_SCORE] for n in v_chain_nodes)
            # store scores in intron data structures
            introns[(left.end, right.start)] = (u_score, v_score)
            intron_tree.insert_interval(
                Interval(left.end, right.start, value=(u_score, v_score)))
    # trim chains
    all_trim_nodes = set()
    for parent, nodes in chains.iteritems():
        if strand == NEG_STRAND:
            nodes.reverse()
        in_degree = len(predecessor_dict[nodes[0]])
        out_degree = len(successor_dict[nodes[-1]])
        trim_nodes = set()
        if ((in_degree == 1) and (out_degree == 1)
                and (parent.start, parent.end) in introns):
            # intron retention - a chain of nodes precisely matches an
            # intron, so we can potentially remove the entire chain
            pred_score, succ_score = introns[(parent.start, parent.end)]
            cutoff_score = trim_intron_fraction * max(pred_score, succ_score)
            trim_nodes.update(trim_intron(G, nodes, cutoff_score))
        else:
            # determine whether this node chain is intronic. intronic node
            # chains are trimmed more strictly due to intronic pre-mrna
            found_intron = False
            max_pred_score = 0.0
            max_succ_score = 0.0
            for hit in intron_tree.find(parent.start, parent.end):
                # ignore contained introns
                if (hit.start > parent.start) and (hit.end < parent.end):
                    continue
                # set intron flag and keep track of highest coverage
                # overlapping intron to make trimming conservative
                found_intron = True
                pred_score, succ_score = hit.value
                if pred_score > max_pred_score:
                    max_pred_score = pred_score
                if succ_score > max_succ_score:
                    max_succ_score = succ_score
            if (in_degree == 0) and (out_degree == 0):
                if found_intron:
                    cutoff_score = trim_intron_fraction * max(
                        max_pred_score, max_succ_score)
                    trim_nodes.update(trim_intron(G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_bidirectional(G, nodes, min_trim_length,
                                       trim_utr_fraction))
            elif in_degree == 0:
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_succ_score
                    trim_nodes.update(trim_intronic_utr(
                        G, nodes, cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes[::-1], min_trim_length,
                             trim_utr_fraction))
            elif out_degree == 0:
                if found_intron:
                    cutoff_score = trim_intron_fraction * max_pred_score
                    trim_nodes.update(
                        trim_intronic_utr(G, nodes[::-1], cutoff_score))
                trim_nodes.update(
                    trim_utr(G, nodes, min_trim_length, trim_utr_fraction))
        all_trim_nodes.update(trim_nodes)
    if len(all_trim_nodes) > 0:
        logging.debug("\t\t(%s) trimmed %d/%d nodes from graph" %
                      (strand_int_to_str(strand), len(all_trim_nodes), len(G)))
    return all_trim_nodes
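# Usage sketch (comments only): trim_graph returns the set of nodes to
# drop rather than mutating G itself; presumably the caller removes them,
# along these lines (parameter values are illustrative, not defaults):
#   trim_nodes = trim_graph(G, strand, min_trim_length=12,
#                           trim_utr_fraction=0.1,
#                           trim_intron_fraction=0.25)
#   G.remove_nodes_from(trim_nodes)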
def categorize_transcript(t, locus_trees):
    # determine whether gene overlaps known loci
    # intersect transcript with reference loci
    t_strand = strand_int_to_str(t.strand)
    locus_hits = locus_trees[t.chrom].find(t.start, t.end)
    if len(locus_hits) == 0:
        # this is a completely unannotated transcript
        category = CATEGORY_INTERGENIC
        nearest_genes, nearest_dist = get_nearest_genes(t.chrom, t.start,
                                                        t.end, locus_trees)
    else:
        # this transcript overlaps at least one known locus, so
        # categorize as sense/antisense, coding/noncoding, exon/intron
        protein_genes = {}
        ncrna_genes = {}
        antisense_genes = {}
        for locus_hit in locus_hits:
            gene_tree = locus_hit.value
            for exon in t.exons:
                for gene_hit in gene_tree.find(exon.start, exon.end):
                    g = gene_hit.value
                    if cmp_strand(g.strand, t_strand):
                        if g.is_coding:
                            protein_genes[g.gene_id] = g
                        else:
                            ncrna_genes[g.gene_id] = g
                    else:
                        antisense_genes[g.gene_id] = g
        protein_genes = protein_genes.values()
        ncrna_genes = ncrna_genes.values()
        antisense_genes = antisense_genes.values()
        nearest_dist = 0
        if len(protein_genes) > 0:
            category = CATEGORY_PROTEIN
            nearest_genes = protein_genes
        elif len(ncrna_genes) > 0:
            category = CATEGORY_NCRNA
            nearest_genes = ncrna_genes
        elif len(antisense_genes) > 0:
            category = CATEGORY_ANTISENSE
            nearest_genes = antisense_genes
        else:
            category = CATEGORY_INTRONIC
            nearest_genes = []
            for locus_hit in locus_hits:
                gene_tree = locus_hit.value
                gene_hits = gene_tree.find(locus_hit.start, locus_hit.end)
                for gene_hit in gene_hits:
                    nearest_genes.append(gene_hit.value)
    # use the 'nearest genes' list to collect gene ids, names, and
    # annotation sources
    if len(nearest_genes) == 0:
        gene_ids = "NA"
        gene_names = "NA"
        annotation_sources = "NA"
        nearest_dist = -1
    else:
        gene_ids = set()
        gene_names = set()
        annotation_sources = set()
        for g in nearest_genes:
            gene_ids.add(g.gene_id)
            gene_names.update(g.gene_names)
            annotation_sources.update(g.annotation_sources)
        gene_ids = ",".join(sorted(gene_ids))
        gene_names = ",".join(sorted(gene_names))
        annotation_sources = ",".join(sorted(annotation_sources))
    # add attributes to original transcripts
    t.attrs["category"] = category
    t.attrs["nearest_gene_ids"] = gene_ids
    t.attrs["nearest_gene_names"] = gene_names
    t.attrs["nearest_dist"] = nearest_dist
    t.attrs["annotation_sources"] = annotation_sources