Exemplo n.º 1
0
def test_introns():
    """Check that iterintrons() yields the gaps between consecutive exons."""
    exons = [Exon(0, 10), Exon(20, 30), Exon(40, 50)]
    transfrag = Transfrag(chrom='chrTest', strand=Strand.POS, exons=exons)
    observed = list(transfrag.iterintrons())
    # three exons are expected to leave exactly two introns, in exon order
    assert len(observed) == 2
    first, second = observed
    assert first == (10, 20)
    assert second == (30, 40)
Exemplo n.º 2
0
 def make_ramp(strand, sign=1):
     """Build a synthetic expression landscape containing a ramp.

     Two groups of single-exon transfrags are created on 'chr1':

     * a "flat" group of 50 transfrags, each spanning the whole interval
       (1000, 1220) with expression 1.0;
     * a "changing" group of 20 transfrags with expression 10.0, one for
       every position in range(1100, 1120). When ``sign`` is negative each
       of them runs from the interval start to that position, otherwise
       from that position to the interval end.

     Args:
         strand: strand value passed unchanged to every ``Transfrag``.
         sign: direction of the ramp; only ``sign < 0`` is tested.

     Returns:
         A tuple ``(chrom, start, end, strand, change_expr, base_expr,
         transfrags)`` where ``base_expr`` is the summed expression of the
         flat group and ``change_expr`` the summed expression of both groups.
     """
     chrom = 'chr1'
     start = 1000
     end = 1220
     transfrags = []
     change_expr = 0.0
     base_expr = 0.0

     # "flat" part of expression landscape
     flat_expr = 1.0
     # range() instead of xrange(): identical iteration, valid on Python 2 and 3
     for i in range(50):
         transfrags.append(
             Transfrag(chrom=chrom, strand=strand,
                       _id='T1.%d' % i, sample_id='S%d' % i,
                       expr=flat_expr, is_ref=False,
                       exons=[Exon(start, end)]))
         change_expr += flat_expr
         base_expr += flat_expr

     # "changing" area: one transfrag per position, numbered from zero
     ramp_expr = 10.0
     for i, pos in enumerate(range(1100, 1120)):
         left, right = (start, pos) if sign < 0 else (pos, end)
         transfrags.append(
             Transfrag(chrom=chrom, strand=strand,
                       _id='T2.%d' % i, sample_id='S%d' % i,
                       expr=ramp_expr, is_ref=False,
                       exons=[Exon(left, right)]))
         # only the combined total grows here; base_expr stays flat-only
         change_expr += ramp_expr
     return chrom, start, end, strand, change_expr, base_expr, transfrags
Exemplo n.º 3
0
def test_splices():
    """Check that itersplices() reports every internal exon boundary."""
    exons = [Exon(0, 10), Exon(20, 30), Exon(40, 50)]
    transfrag = Transfrag(chrom='chrTest', strand=Strand.POS, exons=exons)
    observed = frozenset(transfrag.itersplices())
    # two introns are expected to contribute two splice sites each
    assert len(observed) == 4
    assert observed == frozenset((10, 20, 30, 40))
Exemplo n.º 4
0
def test_introns():
    """Verify the intron intervals produced for a three-exon transfrag."""
    three_exons = [Exon(0, 10), Exon(20, 30), Exon(40, 50)]
    t = Transfrag(chrom='chrTest', strand=Strand.POS, exons=three_exons)
    found = list(t.iterintrons())
    assert len(found) == 2
    # introns are compared positionally against the expected intervals
    intron_a, intron_b = found
    assert intron_a == (10, 20)
    assert intron_b == (30, 40)
Exemplo n.º 5
0
def read_single_locus(filename, guided_strand=False):
    """Load a GTF file that must contain exactly one locus.

    Args:
        filename: name of the GTF file, handed to ``read_gtf``.
        guided_strand: forwarded unchanged to ``Locus.create``.

    Returns:
        A pair ``(t_dict, locus)``: the dict returned by
        ``Transfrag.parse_gtf`` and the ``Locus`` built from its values.

    Raises:
        AssertionError: if ``read_gtf`` does not return exactly one locus.
    """
    all_loci = read_gtf(filename)
    assert len(all_loci) == 1
    # each entry is an (interval, gtf_lines) pair; the interval is not needed
    _, lines = all_loci[0]
    parsed = Transfrag.parse_gtf(lines)
    return parsed, Locus.create(parsed.values(), guided_strand=guided_strand)
Exemplo n.º 6
0
def parse_gtf_locus(locus, gtf_fileh):
    """Parse the GTF lines belonging to one indexed locus.

    Args:
        locus: index entry providing ``chrom``, ``start``, ``end``, ``name``,
            ``filepos`` and ``num_lines``.
        gtf_fileh: seekable, line-iterable file handle.

    Returns:
        The values of the dict returned by ``Transfrag.parse_gtf`` for the
        ``num_lines`` lines that start at ``filepos``.
    """
    # function-scope import keeps this block self-contained
    from itertools import islice

    genome_id_str = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    logging.debug('%s locus: %s features: %d',
                  genome_id_str, locus.name, locus.num_lines)
    # fast-forward to 'filepos'
    gtf_fileh.seek(locus.filepos)
    # Take at most 'num_lines' lines lazily. islice replaces a generator
    # expression that called next() itself: on Python 3.7+ a StopIteration
    # escaping such a generator becomes a RuntimeError (PEP 479), and
    # xrange() does not exist on Python 3.
    t_dict = Transfrag.parse_gtf(islice(gtf_fileh, locus.num_lines))
    return t_dict.values()
Exemplo n.º 7
0
def parse_gtf_locus(locus, gtf_fileh):
    """Read one locus worth of GTF lines and parse them into transfrags.

    Args:
        locus: index record; its ``chrom``, ``start``, ``end``, ``name``,
            ``filepos`` and ``num_lines`` attributes are read.
        gtf_fileh: file handle that supports ``seek`` and line iteration.

    Returns:
        ``values()`` of the dict produced by ``Transfrag.parse_gtf`` from the
        ``num_lines`` lines found at offset ``filepos``.
    """
    # local import so the block does not depend on the module's import list
    from itertools import islice

    genome_id_str = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    # arguments are passed lazily; the message text itself is unchanged
    logging.debug('%s locus: %s features: %d',
                  genome_id_str, locus.name, locus.num_lines)
    # fast-forward to 'filepos'
    gtf_fileh.seek(locus.filepos)
    # islice yields up to 'num_lines' lines without calling next() inside a
    # generator expression (a StopIteration there turns into RuntimeError
    # under PEP 479) and without the Python-2-only xrange()
    locus_lines = islice(gtf_fileh, locus.num_lines)
    t_dict = Transfrag.parse_gtf(locus_lines)
    return t_dict.values()
Exemplo n.º 8
0
def parse_locus(locus, fh):
    """Read the transfrags of one indexed locus from an open file.

    Args:
        locus: index entry providing ``chrom``, ``start``, ``end``, ``name``,
            ``filepos`` and ``num_lines``.
        fh: seekable, line-iterable file handle.

    Returns:
        A list with one ``Transfrag.from_bed`` result per line, in file
        order, for the ``num_lines`` lines starting at ``filepos``.

    Raises:
        StopIteration: if the file ends before ``num_lines`` lines are read.
    """
    genome_id_str = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    logging.debug('LocusIndex: %s coords: %s transfrags: %d',
                  locus.name, genome_id_str, locus.num_lines)
    # fast-forward to 'filepos'
    fh.seek(locus.filepos)
    # next(fh) and range() work on both Python 2 and 3, unlike fh.next()
    # and xrange()
    return [Transfrag.from_bed(next(fh)) for _ in range(locus.num_lines)]
Exemplo n.º 9
0
def parse_locus(locus, fh):
    """Parse ``locus.num_lines`` lines at ``locus.filepos`` into transfrags.

    Args:
        locus: index record; its ``chrom``, ``start``, ``end``, ``name``,
            ``filepos`` and ``num_lines`` attributes are read.
        fh: file handle that supports ``seek`` and line iteration.

    Returns:
        List of the objects returned by ``Transfrag.from_bed``, one per line
        and in the order the lines were read.

    Raises:
        StopIteration: if fewer than ``num_lines`` lines remain in the file.
    """
    genome_id_str = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    # arguments are passed lazily; the message text itself is unchanged
    logging.debug('LocusIndex: %s coords: %s transfrags: %d',
                  locus.name, genome_id_str, locus.num_lines)
    # fast-forward to 'filepos'
    fh.seek(locus.filepos)
    # parse 'num_lines' from file into Transfrag objects; the builtin next()
    # replaces the Python-2-only fh.next(), range() replaces xrange()
    transfrags = [Transfrag.from_bed(next(fh))
                  for _ in range(locus.num_lines)]
    return transfrags
Exemplo n.º 10
0
def main():
    """Profile the splice motifs of all junctions found in a BED file.

    Command-line arguments:
        genome_fasta_file: genome FASTA file, opened with ``FastaFile``.
        bed_file: BED file with one transfrag per line.

    For every distinct intron ``(chrom, start, end, strand)`` the first two
    and last two bases are fetched and concatenated; motifs on the negative
    strand are reverse-complemented. Transfrags whose chromosome is not in
    the FASTA file are skipped. A tab-separated table with the columns
    ``motif``, ``count`` and ``frac`` is printed to stdout, most frequent
    motif first.

    Exits through ``parser.error`` when either input file does not exist.
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('genome_fasta_file')
    parser.add_argument('bed_file')
    args = parser.parse_args()

    # check args
    if not os.path.exists(args.genome_fasta_file):
        parser.error('genome fasta file %s not found' % args.genome_fasta_file)
    if not os.path.exists(args.bed_file):
        parser.error('bed file %s not found' % args.bed_file)
    logging.info('genome fasta file: %s', args.genome_fasta_file)
    logging.info('bed file: %s', args.bed_file)

    splice_juncs = set()
    motif_counter = Counter()
    fasta_fh = FastaFile(args.genome_fasta_file)
    # try/finally guarantees the FASTA handle is released even when parsing
    # or fetching raises
    try:
        # process bed file to get junctions
        logging.info('Reading Junctions')
        with open(args.bed_file) as bed_fh:
            for line in bed_fh:
                t = Transfrag.from_bed(line)
                if t.chrom not in fasta_fh:
                    continue
                for start, end in t.iterintrons():
                    splice_juncs.add((t.chrom, start, end, t.strand))
        logging.info('Read %d Junctions', len(splice_juncs))

        logging.info('Profiling Splice Motifs')
        for chrom, start, end, strand in splice_juncs:
            # two bases at the intron start plus two bases at the intron end
            s = fasta_fh.fetch(chrom, start, start + 2)
            s += fasta_fh.fetch(chrom, end - 2, end)
            if strand == Strand.NEG:
                s = dna_reverse_complement(s)
            motif_counter[s] += 1
    finally:
        fasta_fh.close()

    # report statistics
    total = sum(motif_counter.values())
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print('\t'.join(['motif', 'count', 'frac']))
    for motif, count in motif_counter.most_common():
        print('\t'.join([motif, str(count), str(float(count) / total)]))
    logging.info('Done')
Exemplo n.º 11
0
def test_split_transfrag():
    """Check how split_transfrag() cuts transfrags at splice-graph boundaries.

    Node boundaries are taken from the splice graph built from every
    transfrag in "splice_sites.gtf"; each of the transfrags A-D is then split
    and compared with its expected node intervals.
    """
    loci = read_gtf("splice_sites.gtf")
    _, gtf_lines = loci[0]
    t_dict = Transfrag.parse_gtf(gtf_lines)
    graph = SpliceGraph.create(t_dict.values())
    boundaries = array("i", graph._find_node_boundaries())
    # check nodes, one (transfrag key, expected nodes) pair at a time
    expected_nodes = (
        ("A", ((10, 100), (200, 250), (250, 300), (400, 525))),
        ("B", ((10, 100), (250, 300), (400, 525))),
        ("C", ((150, 200), (200, 250), (250, 300), (400, 525))),
        ("D", ((375, 400), (400, 525))),
    )
    for key, expected in expected_nodes:
        nodes = tuple(split_transfrag(t_dict[key], boundaries))
        assert nodes == expected
Exemplo n.º 12
0
def test_split_transfrag():
    """Split transfrags A-D on the graph's node boundaries and check the nodes.

    The boundaries come from ``SpliceGraph._find_node_boundaries()`` for the
    graph built from all transfrags in 'splice_sites.gtf'.
    """
    first_locus = read_gtf('splice_sites.gtf')[0]
    _, gtf_lines = first_locus
    t_dict = Transfrag.parse_gtf(gtf_lines)
    sg = SpliceGraph.create(t_dict.values())
    boundaries = array('i', sg._find_node_boundaries())

    def split(key):
        # node intervals of one transfrag as a tuple, for direct comparison
        return tuple(split_transfrag(t_dict[key], boundaries))

    # check nodes
    assert split('A') == ((10, 100), (200, 250), (250, 300), (400, 525))
    assert split('B') == ((10, 100), (250, 300), (400, 525))
    assert split('C') == ((150, 200), (200, 250), (250, 300), (400, 525))
    assert split('D') == ((375, 400), (400, 525))
Exemplo n.º 13
0
def test_create_locus():
    """Check the coordinates and expression data of a locus built from GTF.

    The locus is created from every transfrag in 'splice_sites.gtf'. The test
    checks its interval, a few positive-strand expression slices, and that
    slices reaching outside the locus raise ``TacoError``.
    """
    loci = read_gtf('splice_sites.gtf')
    assert len(loci) == 1
    interval, gtf_lines = loci[0]
    assert interval == ('chr1', 10, 525)
    t_dict = Transfrag.parse_gtf(gtf_lines)

    locus = Locus.create(t_dict.values())
    assert locus.chrom == 'chr1'
    assert locus.start == 10
    assert locus.end == 525

    # (start, end) slices inside the locus and the values expected there
    expected_expr = (
        ((49, 51), [1.0, 2.0]),
        ((150, 151), [1.0]),
        ((499, 501), [3.0, 1.0]),
    )
    for (lo, hi), values in expected_expr:
        a = locus.get_expr_data(lo, hi, Strand.POS)
        assert np.array_equal(a, values)

    # slices crossing either end of the locus must be rejected
    for lo, hi in ((5, 15), (520, 530)):
        with pytest.raises(TacoError):
            locus.get_expr_data(lo, hi, Strand.POS)
Exemplo n.º 14
0
def test_multi_strand1():
    """Exercise locus and splice-graph creation on a two-strand GTF fixture.

    Reads "multi_strand1.gtf", checks the locus coordinates, and verifies
    that one splice graph cannot be created from transfrags of mixed strands.
    It then checks the per-strand splice graphs: coordinates, reference
    start/stop sites, expression data, and node boundaries with and without
    guided ends/assembly.
    """
    # read gtf and test basic values
    loci = read_gtf("multi_strand1.gtf")
    assert len(loci) == 1
    interval, gtf_lines = loci[0]
    assert interval == ("chr1", 100, 1000)
    t_dict = Transfrag.parse_gtf(gtf_lines)
    assert len(t_dict) == 5
    locus = Locus.create(t_dict.values())
    assert locus.chrom == "chr1"
    assert locus.start == 100
    assert locus.end == 1000

    # raise exception when creating with multiple strands
    with pytest.raises(TacoError):
        SpliceGraph.create(t_dict.values())
    transfrags_pos = locus.get_transfrags(Strand.POS)
    transfrags_neg = locus.get_transfrags(Strand.NEG)
    sgpos = SpliceGraph.create(transfrags_pos)
    sgneg = SpliceGraph.create(transfrags_neg)

    # positive strand graph
    assert sgpos.chrom == "chr1"
    assert sgpos.start == 100
    assert sgpos.end == 650
    assert sgpos.strand == Strand.POS
    assert sgpos.ref_start_sites == [150]
    assert sgpos.ref_stop_sites == [600]
    # intervals reaching outside the graph must be rejected
    for lo, hi in ((90, 110), (650, 655)):
        with pytest.raises(TacoError):
            sgpos.get_expr_data(lo, hi)
    assert np.array_equal(sgpos.get_expr_data(100, 105), np.ones(5))

    # negative strand graph
    assert sgneg.chrom == "chr1"
    assert sgneg.start == 350
    assert sgneg.end == 1000
    assert sgneg.strand == Strand.NEG
    assert sgneg.ref_start_sites == [1000]
    assert sgneg.ref_stop_sites == [350]
    for lo, hi in ((340, 350), (1000, 1010)):
        with pytest.raises(TacoError):
            sgneg.get_expr_data(lo, hi)
    # (start, end, factory for the expected five-element array)
    neg_expr_checks = (
        (400, 405, np.ones),
        (945, 950, np.zeros),
        (950, 955, np.ones),
        (980, 985, np.zeros),
    )
    for lo, hi, expected in neg_expr_checks:
        assert np.array_equal(sgneg.get_expr_data(lo, hi), expected(5))

    # test locus boundaries
    bpos = tuple(sgpos._find_node_boundaries())
    assert bpos == (100, 200, 300, 400, 650)
    bneg = tuple(sgneg._find_node_boundaries())
    assert bneg == (350, 400, 500, 950, 980, 1000)

    # added guided ends/assembly to use boundaries from reference
    guided = dict(guided_ends=True, guided_assembly=True)
    lpos = SpliceGraph.create(transfrags_pos, **guided)
    bpos = tuple(lpos._find_node_boundaries())
    assert bpos == (100, 150, 200, 300, 400, 500, 600, 650)

    lneg = SpliceGraph.create(transfrags_neg, **guided)
    bneg = tuple(lneg._find_node_boundaries())
    assert bneg == (350, 400, 500, 750, 900, 950, 980, 1000)
Exemplo n.º 15
0
def test_splices():
    """Verify the splice sites reported for a three-exon transfrag."""
    three_exons = [Exon(0, 10), Exon(20, 30), Exon(40, 50)]
    t = Transfrag(chrom='chrTest', strand=Strand.POS, exons=three_exons)
    sites = frozenset(t.itersplices())
    expected = frozenset((10, 20, 30, 40))
    # size is checked first, then the exact set of coordinates
    assert len(sites) == 4
    assert sites == expected
Exemplo n.º 16
0
def test_multi_strand1():
    """Check per-strand splice graphs built from 'multi_strand1.gtf'.

    Covers the locus interval, the ``TacoError`` raised when a splice graph
    is created from transfrags of both strands, the attributes and
    expression data of the positive and negative strand graphs, and their
    node boundaries with and without guided ends/assembly.
    """

    def assert_rejected(graph, intervals):
        # every interval must make get_expr_data() raise TacoError
        for lo, hi in intervals:
            with pytest.raises(TacoError):
                graph.get_expr_data(lo, hi)

    # read gtf and test basic values
    loci = read_gtf('multi_strand1.gtf')
    assert len(loci) == 1
    interval, gtf_lines = loci[0]
    assert interval == ('chr1', 100, 1000)
    t_dict = Transfrag.parse_gtf(gtf_lines)
    assert len(t_dict) == 5
    locus = Locus.create(t_dict.values())
    assert locus.chrom == 'chr1'
    assert locus.start == 100
    assert locus.end == 1000

    # raise exception when creating with multiple strands
    with pytest.raises(TacoError):
        SpliceGraph.create(t_dict.values())
    transfrags_pos = locus.get_transfrags(Strand.POS)
    transfrags_neg = locus.get_transfrags(Strand.NEG)
    sgpos = SpliceGraph.create(transfrags_pos)
    sgneg = SpliceGraph.create(transfrags_neg)

    # test the positive strand graph
    assert sgpos.chrom == 'chr1'
    assert sgpos.start == 100
    assert sgpos.end == 650
    assert sgpos.strand == Strand.POS
    assert sgpos.ref_start_sites == [150]
    assert sgpos.ref_stop_sites == [600]
    assert_rejected(sgpos, [(90, 110), (650, 655)])
    assert np.array_equal(sgpos.get_expr_data(100, 105), np.ones(5))

    # test the negative strand graph
    assert sgneg.chrom == 'chr1'
    assert sgneg.start == 350
    assert sgneg.end == 1000
    assert sgneg.strand == Strand.NEG
    assert sgneg.ref_start_sites == [1000]
    assert sgneg.ref_stop_sites == [350]
    assert_rejected(sgneg, [(340, 350), (1000, 1010)])
    ones, zeros = np.ones, np.zeros
    assert np.array_equal(sgneg.get_expr_data(400, 405), ones(5))
    assert np.array_equal(sgneg.get_expr_data(945, 950), zeros(5))
    assert np.array_equal(sgneg.get_expr_data(950, 955), ones(5))
    assert np.array_equal(sgneg.get_expr_data(980, 985), zeros(5))

    # test locus boundaries
    bpos = tuple(sgpos._find_node_boundaries())
    assert bpos == (100, 200, 300, 400, 650)
    bneg = tuple(sgneg._find_node_boundaries())
    assert bneg == (350, 400, 500, 950, 980, 1000)

    # added guided ends/assembly to use boundaries from reference
    lpos = SpliceGraph.create(
        transfrags_pos, guided_ends=True, guided_assembly=True)
    bpos = tuple(lpos._find_node_boundaries())
    assert bpos == (100, 150, 200, 300, 400, 500, 600, 650)

    lneg = SpliceGraph.create(
        transfrags_neg, guided_ends=True, guided_assembly=True)
    bneg = tuple(lneg._find_node_boundaries())
    assert bneg == (350, 400, 500, 750, 900, 950, 980, 1000)