예제 #1
0
파일: assemble.py 프로젝트: yniknafs/taco
def assemble_gene(sgraph, locus_id_str, config):
    """Assemble the isoforms of one splice graph and write every output.

    Splice graph node features (and, when change point detection is
    enabled, change point features) are written to
    ``config.splice_graph_gtf_fh``.  Each assembled isoform is written as
    GTF to ``config.assembly_gtf_fh`` and as one BED line to
    ``config.assembly_bed_fh``.

    Args:
        sgraph: splice graph to assemble.  Its ``chrom``, ``start``,
            ``end``, ``strand`` and ``G`` attributes are read, and it is
            modified in place when change points are applied.
        locus_id_str: locus identifier attached to every GTF feature.
        config: run configuration providing the change point settings,
            the gene/tss/transcript id iterators and the output handles.

    Returns:
        None.  All results are written to the handles held by ``config``.
    """
    genome_id_str = ('%s:%d-%d[%s]' %
                     (sgraph.chrom, sgraph.start, sgraph.end,
                      Strand.to_gtf(sgraph.strand)))
    # pass the values as logging arguments so the message is only
    # formatted when a DEBUG record is actually emitted
    logging.debug('%s locus: %s nodes: %d',
                  genome_id_str, locus_id_str, len(sgraph.G))
    # output splice graph node data; write() replaces the Python 2 only
    # "print >>fh" statement and emits the same one-feature-per-line text
    for f in sgraph.get_node_gtf():
        config.splice_graph_gtf_fh.write(str(f) + '\n')

    if config.change_point:
        # detect change points
        changepts = sgraph.detect_change_points(
            pval=config.change_point_pvalue,
            fc_cutoff=config.change_point_fold_change)
        logging.debug('%s locus %s change points: %d',
                      genome_id_str, locus_id_str, len(changepts))
        for cp in changepts:
            sgraph.apply_change_point(cp, config.change_point_trim)
            # output splice graph change points
            for f in sgraph.get_change_point_gtf(cp):
                config.splice_graph_gtf_fh.write(str(f) + '\n')
        # must recreate splice graph after finding change points
        # (explicit len() test kept so any sized container is accepted)
        if len(changepts) > 0:
            sgraph.recreate()

    # run isoform path finding algorithm, filter and group into genes
    for gene_isoforms in assemble_isoforms(sgraph, config):
        # assign gene_id and tss_id
        assign_ids(gene_isoforms, sgraph.strand, config.gene_id_iter,
                   config.tss_id_iter)
        # write output
        for isoform in gene_isoforms:
            # assign transcript id; the next() builtin works on both
            # Python 2 (calls .next()) and Python 3 (calls .__next__())
            t_id = next(config.t_id_iter)
            # get strings for each id
            t_id_str = "TU%d" % t_id
            tss_id_str = "TSS%d" % (isoform.tss_id)
            gene_id_str = "G%d" % (isoform.gene_id)
            # write to GTF
            for f in get_gtf_features(chrom=sgraph.chrom,
                                      strand=sgraph.strand,
                                      exons=isoform.path,
                                      locus_id=locus_id_str,
                                      gene_id=gene_id_str,
                                      tss_id=tss_id_str,
                                      transcript_id=t_id_str,
                                      expr=isoform.expr,
                                      rel_frac=isoform.rel_frac,
                                      abs_frac=isoform.abs_frac):
                config.assembly_gtf_fh.write(str(f) + '\n')
            # write to BED; the score column is rel_frac scaled to 0-1000
            name = "%s|%s(%.1f)" % (gene_id_str, t_id_str, isoform.expr)
            fields = write_bed(sgraph.chrom, name, sgraph.strand,
                               int(round(1000.0 * isoform.rel_frac)),
                               isoform.path)
            config.assembly_bed_fh.write('\t'.join(fields) + '\n')
예제 #2
0
def assemble_gene(sgraph, locus_id_str, config):
    """Run assembly for a single splice graph and emit GTF/BED records.

    Node features of the splice graph are written to
    ``config.splice_graph_gtf_fh``; if ``config.change_point`` is set,
    change points are detected, applied to the graph and written to the
    same handle.  Every isoform returned by ``assemble_isoforms`` is then
    written to ``config.assembly_gtf_fh`` (GTF features) and
    ``config.assembly_bed_fh`` (one tab-separated BED line).

    Args:
        sgraph: splice graph being assembled (mutated when change points
            are applied and the graph is recreated).
        locus_id_str: locus identifier stored on each GTF feature.
        config: object carrying the change point parameters, the id
            iterators (``gene_id_iter``, ``tss_id_iter``, ``t_id_iter``)
            and the open output file handles.

    Returns:
        None.
    """
    genome_id_str = (
        '%s:%d-%d[%s]' %
        (sgraph.chrom, sgraph.start, sgraph.end, Strand.to_gtf(sgraph.strand)))
    # arguments are handed to logging unformatted so interpolation is
    # skipped whenever DEBUG output is disabled
    logging.debug('%s locus: %s nodes: %d',
                  genome_id_str, locus_id_str, len(sgraph.G))
    # output splice graph node data (write() instead of the Python 2 only
    # "print >> fh" statement; the bytes written are the same)
    for f in sgraph.get_node_gtf():
        config.splice_graph_gtf_fh.write(str(f) + '\n')

    if config.change_point:
        # detect change points
        changepts = sgraph.detect_change_points(
            pval=config.change_point_pvalue,
            fc_cutoff=config.change_point_fold_change)
        logging.debug('%s locus %s change points: %d',
                      genome_id_str, locus_id_str, len(changepts))
        for cp in changepts:
            sgraph.apply_change_point(cp, config.change_point_trim)
            # output splice graph change points
            for f in sgraph.get_change_point_gtf(cp):
                config.splice_graph_gtf_fh.write(str(f) + '\n')
        # must recreate splice graph after finding change points
        if len(changepts) > 0:
            sgraph.recreate()

    # run isoform path finding algorithm, filter and group into genes
    for gene_isoforms in assemble_isoforms(sgraph, config):
        # assign gene_id and tss_id
        assign_ids(gene_isoforms, sgraph.strand, config.gene_id_iter,
                   config.tss_id_iter)
        # write output
        for isoform in gene_isoforms:
            # assign transcript id via the next() builtin, which is
            # portable across Python 2 and Python 3 iterators
            t_id = next(config.t_id_iter)
            # get strings for each id
            t_id_str = "TU%d" % t_id
            tss_id_str = "TSS%d" % (isoform.tss_id)
            gene_id_str = "G%d" % (isoform.gene_id)
            # write to GTF
            for f in get_gtf_features(chrom=sgraph.chrom,
                                      strand=sgraph.strand,
                                      exons=isoform.path,
                                      locus_id=locus_id_str,
                                      gene_id=gene_id_str,
                                      tss_id=tss_id_str,
                                      transcript_id=t_id_str,
                                      expr=isoform.expr,
                                      rel_frac=isoform.rel_frac,
                                      abs_frac=isoform.abs_frac):
                config.assembly_gtf_fh.write(str(f) + '\n')
            # write to BED (score column is rel_frac scaled to 0-1000)
            name = "%s|%s(%.1f)" % (gene_id_str, t_id_str, isoform.expr)
            fields = write_bed(sgraph.chrom, name, sgraph.strand,
                               int(round(1000.0 * isoform.rel_frac)),
                               isoform.path)
            config.assembly_bed_fh.write('\t'.join(fields) + '\n')
예제 #3
0
def test_multi_strand2():
    """Splitting the positive-strand graph of multi_strand2.gtf must yield
    sub-graphs labelled chr1:100-300[+] and chr1:400-600[+]."""
    _, locus = read_single_locus("multi_strand2.gtf")
    pos_graph = SpliceGraph.create(locus.get_transfrags(Strand.POS))
    # index every sub-graph by its "chrom:start-end[strand]" label
    by_label = {}
    for sub in pos_graph.split():
        label = "%s:%d-%d[%s]" % (sub.chrom, sub.start, sub.end,
                                  Strand.to_gtf(sub.strand))
        by_label[label] = sub
    for expected in ("chr1:100-300[+]", "chr1:400-600[+]"):
        assert expected in by_label
예제 #4
0
def test_multi_strand2():
    """Check that the positive-strand splice graph built from
    multi_strand2.gtf splits into the two expected genomic intervals."""
    _, locus = read_single_locus('multi_strand2.gtf')
    graph = SpliceGraph.create(locus.get_transfrags(Strand.POS))

    def label_of(sg):
        # "chrom:start-end[strand]" key identifying one sub-graph
        return '%s:%d-%d[%s]' % (sg.chrom, sg.start, sg.end,
                                 Strand.to_gtf(sg.strand))

    sgdict = dict((label_of(sg), sg) for sg in graph.split())
    assert 'chr1:100-300[+]' in sgdict
    assert 'chr1:400-600[+]' in sgdict
예제 #5
0
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    """Yield the GTF features describing one transcript.

    The first feature yielded is the 'transcript' record spanning
    ``exons[0].start`` to ``exons[-1].end``; it is followed by one 'exon'
    record per element of ``exons``, in order.  Every feature carries the
    locus/gene/tss/transcript ids; the transcript feature additionally
    carries formatted ``expr``, ``rel_frac`` and ``abs_frac`` attributes.

    This is a generator: nothing is evaluated until iteration starts, so
    an empty ``exons`` raises IndexError on the first ``next()`` call.
    """
    tx_start, tx_end = exons[0].start, exons[-1].end
    strand_str = Strand.to_gtf(strand)
    # score is identical for the transcript and all of its exons
    score = int(round(1000.0 * rel_frac))
    id_attrs = {
        'locus_id': locus_id,
        'gene_id': gene_id,
        'tss_id': tss_id,
        'transcript_id': transcript_id
    }

    def make_feature(kind, start, end, attrs):
        # build one GTF.Feature with the fields shared by all records
        feat = GTF.Feature()
        feat.seqid = chrom
        feat.source = 'taco'
        feat.feature = kind
        feat.start = start
        feat.end = end
        feat.score = score
        feat.strand = strand_str
        feat.phase = '.'
        feat.attrs = attrs
        return feat

    tx_attrs = {
        'expr': '%.3f' % expr,
        'rel_frac': '%.5f' % rel_frac,
        'abs_frac': '%.5f' % abs_frac
    }
    tx_attrs.update(id_attrs)
    yield make_feature('transcript', tx_start, tx_end, tx_attrs)

    for exon in exons:
        # each exon gets its own copy of the id attributes
        exon_attrs = {}
        exon_attrs.update(id_attrs)
        yield make_feature('exon', exon.start, exon.end, exon_attrs)
예제 #6
0
def assemble_isoforms(sgraph, config):
    """Find isoform paths in a splice graph and group them by gene cluster.

    Args:
        sgraph: splice graph to assemble.
        config: run configuration; the path graph, path finding,
            ``isoform_frac`` and ``max_isoforms`` settings are read.

    Returns:
        A list with one entry per cluster built by ``Cluster.build``; each
        entry is a list of ``Isoform`` objects, truncated to
        ``config.max_isoforms`` when that value is positive.  An empty
        list is returned when no path graph could be created.
    """
    # read in transfrag paths
    factory = PathGraphFactory(sgraph)
    kmer_graph, kmer_size = factory.create_optimal(
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if kmer_graph is None or len(kmer_graph) == 0:
        return []
    # smooth kmer graph
    kmer_graph.apply_smoothing()

    genome_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start, sgraph.end,
                                      Strand.to_gtf(sgraph.strand))
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f' %
                  (genome_id_str, kmer_size, len(kmer_graph),
                   kmer_graph.exprs[kmer_graph.SOURCE_ID]))
    # turn every kmer path back into a (path, expression) pair
    paths = [(reconstruct_path(kmer_path, kmer_graph, sgraph), path_expr)
             for kmer_path, path_expr in find_paths(kmer_graph,
                                                    config.path_frac,
                                                    config.max_paths)]
    logging.debug('%s isoforms: %d' % (genome_id_str, len(paths)))
    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d' %
                  (genome_id_str, len(clusters), len(filtered)))

    gene_isoforms = []
    for cluster in clusters:
        isoforms = [Isoform(path=path, expr=expr, rel_frac=rel_frac,
                            abs_frac=abs_frac)
                    for path, expr, rel_frac, abs_frac in cluster.iterpaths()]
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            isoforms = isoforms[:config.max_isoforms]
        gene_isoforms.append(isoforms)
    return gene_isoforms
예제 #7
0
파일: assemble.py 프로젝트: yniknafs/taco
def assemble_isoforms(sgraph, config):
    """Assemble isoforms for a splice graph, grouped into gene clusters.

    A path graph is created from ``sgraph`` and smoothed, paths are found
    and reconstructed, and the resulting (path, expression) pairs are
    clustered with ``Cluster.build``.

    Returns:
        list of lists of ``Isoform`` -- one inner list per cluster, limited
        to ``config.max_isoforms`` entries when that setting is positive.
        Returns ``[]`` if the path graph is ``None`` or empty.
    """
    # read in transfrag paths
    graph, k = PathGraphFactory(sgraph).create_optimal(
        kmax=config.path_graph_kmax,
        loss_threshold=config.path_graph_loss_threshold,
        stats_fh=config.path_graph_stats_fh)
    if graph is None or len(graph) == 0:
        return []
    # smooth kmer graph
    graph.apply_smoothing()

    strand_str = Strand.to_gtf(sgraph.strand)
    genome_id_str = '%s:%d-%d[%s]' % (sgraph.chrom, sgraph.start,
                                      sgraph.end, strand_str)
    logging.debug('%s finding isoforms in k=%d graph (%d kmers) '
                  'source_expr=%f' %
                  (genome_id_str, k, len(graph),
                   graph.exprs[graph.SOURCE_ID]))

    paths = []
    found = find_paths(graph, config.path_frac, config.max_paths)
    for kmer_path, expr in found:
        paths.append((reconstruct_path(kmer_path, graph, sgraph), expr))
    logging.debug('%s isoforms: %d' % (genome_id_str, len(paths)))

    # build gene clusters
    clusters, filtered = Cluster.build(paths, min_frac=config.isoform_frac)
    logging.debug('%s gene clusters: %d filtered transfrags: %d' %
                  (genome_id_str, len(clusters), len(filtered)))

    result = []
    for cluster in clusters:
        members = []
        for path, expr, rel_frac, abs_frac in cluster.iterpaths():
            iso = Isoform(path=path, expr=expr, rel_frac=rel_frac,
                          abs_frac=abs_frac)
            members.append(iso)
        # apply max isoforms limit (per cluster)
        if config.max_isoforms > 0:
            members = members[:config.max_isoforms]
        result.append(members)
    return result
예제 #8
0
def write_bed(chrom, name, strand, score, exons):
    """Build the twelve string fields of a BED line for one transcript.

    Despite the name nothing is written; the caller joins and outputs the
    returned list.

    Args:
        chrom: chromosome name, used as the first field unchanged.
        name: record name (converted with ``str``).
        strand: strand value, converted with ``Strand.to_gtf``.
        score: record score (converted with ``str``).
        exons: non-empty sequence of objects with ``start`` and ``end``;
            the first must have the smallest start and the last the
            largest end (checked with ``assert``).

    Returns:
        list of 12 strings.  Fields 7 and 8 are both the transcript start
        and field 9 is the literal '0'; block sizes and block starts are
        comma separated with a trailing comma.
    """
    assert all(exons[0].start < other.start for other in exons[1:])
    assert all(exons[-1].end > other.end for other in exons[:-1])
    tx_start = exons[0].start
    tx_end = exons[-1].end
    # block starts are relative to the transcript start
    block_starts = [exon.start - tx_start for exon in exons]
    block_sizes = [exon.end - exon.start for exon in exons]
    sizes_str = ','.join(map(str, block_sizes)) + ','
    starts_str = ','.join(map(str, block_starts)) + ','
    # make bed fields
    return [
        chrom,
        str(tx_start),
        str(tx_end),
        str(name),
        str(score),
        Strand.to_gtf(strand),
        str(tx_start),
        str(tx_start),
        '0',
        str(len(exons)),
        sizes_str,
        starts_str,
    ]
예제 #9
0
파일: assemble.py 프로젝트: tacorna/taco
def write_bed(chrom, name, strand, score, exons):
    """Return the BED fields (as a list of 12 strings) for a transcript.

    The transcript spans ``exons[0].start`` to ``exons[-1].end``.  The two
    assertions require the first exon to have the smallest start and the
    last exon the largest end.  Fields 7 and 8 both repeat the transcript
    start and field 9 is the constant '0'.  The function only builds the
    fields; writing them is left to the caller.
    """
    assert all(exons[0].start < x.start for x in exons[1:])
    assert all(exons[-1].end > x.end for x in exons[:-1])
    tx_start = exons[0].start
    tx_end = exons[-1].end

    def comma_list(values):
        # comma separated values followed by a trailing comma
        return ','.join(map(str, values)) + ','

    # per-exon (offset from transcript start, length) pairs
    blocks = [(e.start - tx_start, e.end - e.start) for e in exons]
    # make bed fields
    fields = [chrom, str(tx_start), str(tx_end), str(name), str(score),
              Strand.to_gtf(strand)]
    fields.extend([str(tx_start), str(tx_start), '0', str(len(exons))])
    fields.append(comma_list([size for _, size in blocks]))
    fields.append(comma_list([offset for offset, _ in blocks]))
    return fields
예제 #10
0
파일: assemble.py 프로젝트: tacorna/taco
def get_gtf_features(chrom, strand, exons, locus_id, gene_id, tss_id,
                     transcript_id, expr, rel_frac, abs_frac):
    """Generate GTF features for a transcript followed by its exons.

    Yields one 'transcript' feature covering ``exons[0].start`` through
    ``exons[-1].end`` and then one 'exon' feature per entry of ``exons``.
    All features share seqid, source ('taco'), score, strand, phase ('.')
    and the locus/gene/tss/transcript id attributes.  Only the transcript
    feature carries the ``expr``, ``rel_frac`` and ``abs_frac`` attributes.

    Being a generator, the body runs lazily: an empty ``exons`` raises
    IndexError when the first item is requested.
    """
    tx_start = exons[0].start
    tx_end = exons[-1].end
    strand_str = Strand.to_gtf(strand)
    shared_attrs = {'locus_id': locus_id,
                    'gene_id': gene_id,
                    'tss_id': tss_id,
                    'transcript_id': transcript_id}
    # same score on every record of this transcript
    score = int(round(1000.0 * rel_frac))

    # (feature type, start, end, extra attributes) for each record, with
    # the transcript record first
    records = [('transcript', tx_start, tx_end,
                {'expr': '%.3f' % expr,
                 'rel_frac': '%.5f' % rel_frac,
                 'abs_frac': '%.5f' % abs_frac})]
    records.extend(('exon', e.start, e.end, {}) for e in exons)

    for kind, start, end, attrs in records:
        feat = GTF.Feature()
        feat.seqid = chrom
        feat.source = 'taco'
        feat.feature = kind
        feat.start = start
        feat.end = end
        feat.score = score
        feat.strand = strand_str
        feat.phase = '.'
        feat.attrs = attrs
        feat.attrs.update(shared_attrs)
        yield feat