示例#1
0
文件: assemble.py 项目: yniknafs/taco
def assemble_locus(locus, transfrags, config):
    """Filter one locus's transfrags, rebuild the locus, and assemble it.

    `locus` supplies `chrom`, `start`, `end` and `name` for the log lines;
    the name is then rebound to the object returned by `Locus.create`.
    `transfrags` is first passed through `filter_transfrags` together with
    the minimum length / minimum expression settings found on `config`.

    Returns None. When filtering leaves no transfrags, the function returns
    right after logging the counts.
    """
    def strand_counts(built):
        # sizes of the '+', '-' and '.' transfrag sets, in that order
        return (len(built.get_transfrags(Strand.POS)),
                len(built.get_transfrags(Strand.NEG)),
                len(built.get_transfrags(Strand.NA)))

    # drop transfrags that fail the expression / length filters
    region = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    locus_id = locus.name
    total = len(transfrags)
    transfrags = filter_transfrags(
        transfrags,
        config.min_transfrag_length,
        config.min_expr,
        config.transfrags_filtered_gtf_fh)
    kept = len(transfrags)
    filter_msg = '%s locus: %s transfrags: %d filtered: %d' % (
        region, locus.name, total, kept)
    logging.debug(filter_msg)
    if kept == 0:
        return

    # build the locus from the surviving transfrags
    locus = Locus.create(
        transfrags,
        config.guided_strand,
        config.guided_ends,
        config.guided_assembly)
    # coordinates are re-read from the newly created locus
    region = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    built_msg = '%s locus: %s transfrags: %d (+: %d, -: %d, .: %d)' % (
        (region, locus_id, len(transfrags)) + strand_counts(locus))
    logging.debug(built_msg)

    # assign strands to unstranded transfrags; log only if any changed
    num_resolved = locus.impute_unknown_strands()
    if num_resolved > 0:
        resolved_msg = '%s locus: %s resolved: %d (+: %d, -: %d, .: %d)' % (
            (region, locus_id, num_resolved) + strand_counts(locus))
        logging.debug(resolved_msg)

    # bedgraph output happens after strand resolution, then splice junctions
    locus.write_bedgraph(config.bedgraph_fhs)
    locus.write_splice_bed(config.splice_bed_fh)

    # assemble every splice graph produced by the locus
    for splice_graph in locus.create_splice_graphs():
        assemble_gene(splice_graph, locus_id, config)
示例#2
0
def test_create_locus():
    """Check Locus creation from 'splice_sites.gtf' and its expression data.

    Expects a single locus spanning chr1:10-525, verifies positive-strand
    expression values over three windows, and expects TacoError for windows
    reaching past either end of the locus.
    """
    loci = read_gtf('splice_sites.gtf')
    assert len(loci) == 1
    interval, gtf_lines = loci[0]
    assert interval == ('chr1', 10, 525)
    t_dict = Transfrag.parse_gtf(gtf_lines)

    locus = Locus.create(t_dict.values())
    assert locus.chrom == 'chr1'
    assert locus.start == 10
    assert locus.end == 525

    # (start, end) window -> expected positive-strand expression values
    expected_by_window = (
        ((49, 51), [1.0, 2.0]),
        ((150, 151), [1.0]),
        ((499, 501), [3.0, 1.0]),
    )
    for (start, end), expected in expected_by_window:
        observed = locus.get_expr_data(start, end, Strand.POS)
        assert np.array_equal(observed, expected)

    # windows extending beyond the locus start / end must be rejected
    for start, end in ((5, 15), (520, 530)):
        with pytest.raises(TacoError):
            locus.get_expr_data(start, end, Strand.POS)
示例#3
0
def assemble_locus(locus_index, transfrags, config):
    """Build a Locus from `transfrags` and assemble each of its splice graphs.

    `locus_index.name` labels the log messages and is forwarded to
    `assemble_gene`. `config` provides the guided_* options for
    `Locus.create`, the output handles for the bedgraph and splice BED
    writers, and the `assemble_unstranded` switch.

    Returns None; an empty `transfrags` returns immediately without touching
    `locus_index` or `config`.
    """
    if len(transfrags) == 0:
        return

    # build the locus
    locus = Locus.create(
        transfrags,
        config.guided_strand,
        config.guided_ends,
        config.guided_assembly)
    created_msg = '%s locus: %s' % (locus, locus_index.name)
    logging.debug(created_msg)

    # assign strands to unstranded transfrags; log only if any changed
    num_resolved = locus.impute_unknown_strands()
    if num_resolved > 0:
        resolved_msg = '%s locus: %s resolved: %d' % (
            locus, locus_index.name, num_resolved)
        logging.debug(resolved_msg)

    # bedgraph output happens after strand resolution, then splice junctions
    locus.write_bedgraph(config.bedgraph_fhs)
    locus.write_splice_bed(config.splice_bed_fh)

    # assemble each splice graph; unstranded graphs only when config allows
    for splice_graph in locus.create_splice_graphs():
        unstranded = splice_graph.strand == Strand.NA
        if not unstranded or config.assemble_unstranded:
            assemble_gene(splice_graph, locus_index.name, config)
示例#4
0
def assemble_locus(locus, transfrags, config):
    """Run filtering, locus construction and gene assembly for one locus.

    The incoming `locus` is read for `chrom`, `start`, `end` and `name`
    (used in debug output); afterwards the name refers to the object built
    by `Locus.create` from the filtered transfrags. Filtering is delegated
    to `filter_transfrags` with the length / expression settings on
    `config`.

    Returns None, and stops early if no transfrags remain after filtering.
    """
    # filter transfrags by expression and transfrag length
    genome_id_str = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    locus_id = locus.name
    n_before = len(transfrags)
    transfrags = filter_transfrags(
        transfrags, config.min_transfrag_length, config.min_expr,
        config.transfrags_filtered_gtf_fh)
    n_after = len(transfrags)
    logging.debug('%s locus: %s transfrags: %d filtered: %d' %
                  (genome_id_str, locus.name, n_before, n_after))
    if n_after == 0:
        return

    # create locus
    locus = Locus.create(
        transfrags, config.guided_strand, config.guided_ends,
        config.guided_assembly)
    genome_id_str = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    label = (genome_id_str, locus_id)

    def per_strand():
        # transfrag counts for '+', '-' and '.', matching the log format
        strands = (Strand.POS, Strand.NEG, Strand.NA)
        return tuple(len(locus.get_transfrags(s)) for s in strands)

    logging.debug('%s locus: %s transfrags: %d (+: %d, -: %d, .: %d)' %
                  (label + (len(transfrags),) + per_strand()))

    # resolve unstranded transcripts
    num_resolved = locus.impute_unknown_strands()
    if num_resolved > 0:
        logging.debug('%s locus: %s resolved: %d (+: %d, -: %d, .: %d)' %
                      (label + (num_resolved,) + per_strand()))

    # write bedgraph files after strand resolved, then splice junctions
    locus.write_bedgraph(config.bedgraph_fhs)
    locus.write_splice_bed(config.splice_bed_fh)

    # hand every splice graph to the gene assembler
    for sgraph in locus.create_splice_graphs():
        assemble_gene(sgraph, locus_id, config)
示例#5
0
文件: assemble.py 项目: tacorna/taco
def assemble_locus(locus_index, transfrags, config):
    """Create a Locus for `transfrags`, write its tracks, and assemble genes.

    `locus_index` is only read for its `name`, which appears in the debug
    messages and is passed on to `assemble_gene`. `config` carries the
    guided_* arguments of `Locus.create`, the bedgraph / splice BED output
    handles, and `assemble_unstranded`.

    Returns None. Nothing is done when `transfrags` is empty.
    """
    if len(transfrags) == 0:
        return

    # create locus
    create_args = (config.guided_strand,
                   config.guided_ends,
                   config.guided_assembly)
    locus = Locus.create(transfrags, *create_args)
    logging.debug('%s locus: %s' % (locus, locus_index.name))

    # resolve unstranded transcripts
    num_resolved = locus.impute_unknown_strands()
    if num_resolved > 0:
        details = (locus, locus_index.name, num_resolved)
        logging.debug('%s locus: %s resolved: %d' % details)

    # write bedgraph files after strand resolved
    locus.write_bedgraph(config.bedgraph_fhs)
    # write splice junctions
    locus.write_splice_bed(config.splice_bed_fh)

    # assemble stranded graphs always, unstranded ones only when enabled
    for graph in locus.create_splice_graphs():
        is_unstranded = graph.strand == Strand.NA
        if not is_unstranded or config.assemble_unstranded:
            assemble_gene(graph, locus_index.name, config)
示例#6
0
def test_multi_strand1():
    """Exercise Locus and SpliceGraph on the two-strand 'multi_strand1.gtf'.

    Checks the locus interval, that SpliceGraph.create rejects transfrags of
    mixed strands, the per-strand graph attributes and expression data, and
    the node boundaries with and without guided ends/assembly.
    """
    # read gtf and test basic values
    loci = read_gtf("multi_strand1.gtf")
    assert len(loci) == 1
    interval, gtf_lines = loci[0]
    assert interval == ("chr1", 100, 1000)
    t_dict = Transfrag.parse_gtf(gtf_lines)
    assert len(t_dict) == 5
    locus = Locus.create(t_dict.values())
    assert locus.chrom == "chr1"
    assert locus.start == 100
    assert locus.end == 1000

    # a splice graph cannot be created from transfrags of multiple strands
    with pytest.raises(TacoError):
        SpliceGraph.create(t_dict.values())
    transfrags_pos = locus.get_transfrags(Strand.POS)
    transfrags_neg = locus.get_transfrags(Strand.NEG)
    sgpos = SpliceGraph.create(transfrags_pos)
    sgneg = SpliceGraph.create(transfrags_neg)

    # positive-strand graph
    assert sgpos.chrom == "chr1"
    assert sgpos.start == 100
    assert sgpos.end == 650
    assert sgpos.strand == Strand.POS
    assert sgpos.ref_start_sites == [150]
    assert sgpos.ref_stop_sites == [600]
    for start, end in ((90, 110), (650, 655)):
        with pytest.raises(TacoError):
            sgpos.get_expr_data(start, end)
    assert np.array_equal(sgpos.get_expr_data(100, 105), np.ones(5))

    # negative-strand graph
    assert sgneg.chrom == "chr1"
    assert sgneg.start == 350
    assert sgneg.end == 1000
    assert sgneg.strand == Strand.NEG
    assert sgneg.ref_start_sites == [1000]
    assert sgneg.ref_stop_sites == [350]
    for start, end in ((340, 350), (1000, 1010)):
        with pytest.raises(TacoError):
            sgneg.get_expr_data(start, end)
    # (start, end) window -> expected array constructor for 5 positions
    neg_windows = (
        ((400, 405), np.ones),
        ((945, 950), np.zeros),
        ((950, 955), np.ones),
        ((980, 985), np.zeros),
    )
    for (start, end), make_expected in neg_windows:
        assert np.array_equal(sgneg.get_expr_data(start, end),
                              make_expected(5))

    # test locus boundaries
    bpos = tuple(sgpos._find_node_boundaries())
    assert bpos == (100, 200, 300, 400, 650)
    bneg = tuple(sgneg._find_node_boundaries())
    assert bneg == (350, 400, 500, 950, 980, 1000)

    # guided ends/assembly: expected boundaries gain extra positions
    guided = dict(guided_ends=True, guided_assembly=True)
    lpos = SpliceGraph.create(transfrags_pos, **guided)
    bpos = tuple(lpos._find_node_boundaries())
    assert bpos == (100, 150, 200, 300, 400, 500, 600, 650)

    lneg = SpliceGraph.create(transfrags_neg, **guided)
    bneg = tuple(lneg._find_node_boundaries())
    assert bneg == (350, 400, 500, 750, 900, 950, 980, 1000)
示例#7
0
def test_multi_strand1():
    """Verify per-strand splice graphs built from 'multi_strand1.gtf'.

    Covers: the locus interval, the TacoError expected when a SpliceGraph is
    created from transfrags of both strands, attributes and expression data
    of the '+' and '-' graphs, and node boundaries with and without the
    guided_ends / guided_assembly options.
    """
    def rejects_window(graph, start, end):
        # the graph is expected to raise TacoError for this window
        with pytest.raises(TacoError):
            graph.get_expr_data(start, end)

    def node_bounds(graph):
        return tuple(graph._find_node_boundaries())

    # read gtf and test basic values
    loci = read_gtf('multi_strand1.gtf')
    assert len(loci) == 1
    interval, gtf_lines = loci[0]
    assert interval == ('chr1', 100, 1000)
    t_dict = Transfrag.parse_gtf(gtf_lines)
    assert len(t_dict) == 5
    locus = Locus.create(t_dict.values())
    assert locus.chrom == 'chr1'
    assert locus.start == 100
    assert locus.end == 1000

    # raise exception when creating with multiple strands
    with pytest.raises(TacoError):
        SpliceGraph.create(t_dict.values())
    pos_transfrags = locus.get_transfrags(Strand.POS)
    neg_transfrags = locus.get_transfrags(Strand.NEG)
    pos_graph = SpliceGraph.create(pos_transfrags)
    neg_graph = SpliceGraph.create(neg_transfrags)

    # '+' strand graph
    assert pos_graph.chrom == 'chr1'
    assert pos_graph.start == 100
    assert pos_graph.end == 650
    assert pos_graph.strand == Strand.POS
    assert pos_graph.ref_start_sites == [150]
    assert pos_graph.ref_stop_sites == [600]
    rejects_window(pos_graph, 90, 110)
    rejects_window(pos_graph, 650, 655)
    assert np.array_equal(pos_graph.get_expr_data(100, 105), np.ones(5))

    # '-' strand graph
    assert neg_graph.chrom == 'chr1'
    assert neg_graph.start == 350
    assert neg_graph.end == 1000
    assert neg_graph.strand == Strand.NEG
    assert neg_graph.ref_start_sites == [1000]
    assert neg_graph.ref_stop_sites == [350]
    rejects_window(neg_graph, 340, 350)
    rejects_window(neg_graph, 1000, 1010)
    assert np.array_equal(neg_graph.get_expr_data(400, 405), np.ones(5))
    assert np.array_equal(neg_graph.get_expr_data(945, 950), np.zeros(5))
    assert np.array_equal(neg_graph.get_expr_data(950, 955), np.ones(5))
    assert np.array_equal(neg_graph.get_expr_data(980, 985), np.zeros(5))

    # test locus boundaries
    assert node_bounds(pos_graph) == (100, 200, 300, 400, 650)
    assert node_bounds(neg_graph) == (350, 400, 500, 950, 980, 1000)

    # added guided ends/assembly to use boundaries from reference
    guided_pos = SpliceGraph.create(pos_transfrags,
                                    guided_ends=True,
                                    guided_assembly=True)
    assert node_bounds(guided_pos) == (100, 150, 200, 300, 400, 500, 600, 650)

    guided_neg = SpliceGraph.create(neg_transfrags,
                                    guided_ends=True,
                                    guided_assembly=True)
    assert node_bounds(guided_neg) == (350, 400, 500, 750, 900, 950, 980, 1000)