def test_introns():
    """Three exons must produce the two introns lying between them."""
    exons = [Exon(0, 10), Exon(20, 30), Exon(40, 50)]
    t = Transfrag(chrom='chrTest', strand=Strand.POS, exons=exons)
    observed = list(t.iterintrons())
    # each intron spans from the end of one exon to the start of the next
    expected = [(10, 20), (30, 40)]
    assert len(observed) == len(expected)
    for intron, wanted in zip(observed, expected):
        assert intron == wanted
def make_ramp(strand, sign=1):
    """Build synthetic single-exon transfrags forming an expression ramp.

    Fifty transfrags with expression 1.0 cover the whole interval
    1000-1220.  Twenty more with expression 10.0 each have one end at a
    position in 1100..1119: they run from ``start`` to that position when
    ``sign`` is negative, and from that position to ``end`` otherwise.

    Returns:
        Tuple ``(chrom, start, end, strand, change_expr, base_expr,
        transfrags)`` where ``base_expr`` sums the expression of the flat
        transfrags only and ``change_expr`` sums the expression of all of
        them.
    """
    chrom = 'chr1'
    start = 1000
    end = 1220
    transfrags = []
    base_expr = 0.0
    change_expr = 0.0

    # "flat" part of expression landscape
    flat_expr = 1.0
    for n in xrange(50):
        transfrags.append(
            Transfrag(chrom=chrom, strand=strand,
                      _id='T1.%d' % n, sample_id='S%d' % n,
                      expr=flat_expr, is_ref=False,
                      exons=[Exon(start, end)]))
        change_expr += flat_expr
        base_expr += flat_expr

    # "changing" area
    ramp_expr = 10.0
    for n, pos in enumerate(range(1100, 1120)):
        if sign < 0:
            left, right = start, pos
        else:
            left, right = pos, end
        transfrags.append(
            Transfrag(chrom=chrom, strand=strand,
                      _id='T2.%d' % n, sample_id='S%d' % n,
                      expr=ramp_expr, is_ref=False,
                      exons=[Exon(left, right)]))
        change_expr += ramp_expr

    return chrom, start, end, strand, change_expr, base_expr, transfrags
def test_splices():
    """Three exons must expose the four internal exon edges as splice sites."""
    exons = [Exon(0, 10), Exon(20, 30), Exon(40, 50)]
    t = Transfrag(chrom='chrTest', strand=Strand.POS, exons=exons)
    observed = frozenset(t.itersplices())
    # the outer transfrag ends (0 and 50) are not expected among the sites
    expected = frozenset((10, 20, 30, 40))
    assert len(observed) == 4
    assert observed == expected
def read_single_locus(filename, guided_strand=False):
    """Load a GTF file that must contain exactly one locus.

    Args:
        filename: path handed to ``read_gtf``.
        guided_strand: forwarded unchanged to ``Locus.create``.

    Returns:
        Tuple ``(t_dict, locus)``: the mapping returned by
        ``Transfrag.parse_gtf`` and the ``Locus`` built from its values.

    Raises:
        AssertionError: if ``read_gtf`` does not return exactly one locus.
    """
    loci = read_gtf(filename)
    assert len(loci) == 1
    # each entry pairs an interval with its GTF lines; only the lines matter
    _, gtf_lines = loci[0]
    t_dict = Transfrag.parse_gtf(gtf_lines)
    return t_dict, Locus.create(t_dict.values(), guided_strand=guided_strand)
def parse_gtf_locus(locus, gtf_fileh):
    """Parse the transfrags of one indexed locus out of an open GTF file.

    Args:
        locus: index record providing ``chrom``, ``start``, ``end``,
            ``name``, ``filepos`` and ``num_lines``.
        gtf_fileh: open, seekable, line-iterable GTF file handle.

    Returns:
        The values of the mapping produced by ``Transfrag.parse_gtf``.
    """
    coords = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    message = '%s locus: %s features: %d' % (coords, locus.name,
                                             locus.num_lines)
    logging.debug(message)
    # jump to the first line of this locus
    gtf_fileh.seek(locus.filepos)
    # lazily pull exactly 'num_lines' lines for the parser to consume
    locus_lines = (next(gtf_fileh) for _ in xrange(locus.num_lines))
    return Transfrag.parse_gtf(locus_lines).values()
def parse_gtf_locus(locus, gtf_fileh):
    """Parse the transfrags of one indexed locus out of an open GTF file.

    Args:
        locus: index record providing ``chrom``, ``start``, ``end``,
            ``name``, ``filepos`` and ``num_lines``.
        gtf_fileh: open, seekable, line-iterable GTF file handle.

    Returns:
        The values of the mapping produced by ``Transfrag.parse_gtf``.
    """
    region = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    logging.debug(
        '%s locus: %s features: %d' % (region, locus.name, locus.num_lines))
    # move the handle to the recorded offset of this locus
    gtf_fileh.seek(locus.filepos)
    # hand the parser a lazy stream of exactly 'num_lines' lines
    line_stream = (next(gtf_fileh) for _ in xrange(locus.num_lines))
    transfrags_by_id = Transfrag.parse_gtf(line_stream)
    return transfrags_by_id.values()
def parse_locus(locus, fh):
    """Read the transfrags of one indexed locus from an open BED file.

    Args:
        locus: index record providing ``chrom``, ``start``, ``end``,
            ``name``, ``filepos`` and ``num_lines``.
        fh: open, seekable, line-iterable BED file handle.

    Returns:
        A list holding one ``Transfrag.from_bed`` result per line read.

    Raises:
        StopIteration: if the file ends before ``num_lines`` lines are read.
    """
    genome_id_str = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    logging.debug('LocusIndex: %s coords: %s transfrags: %d' %
                  (locus.name, genome_id_str, locus.num_lines))
    # fast-forward to 'filepos'
    fh.seek(locus.filepos)
    # parse 'num_lines' from file into Transfrag objects; the next() builtin
    # is used instead of the Python 2-only fh.next() method so the handle
    # only needs to follow the iterator protocol
    return [Transfrag.from_bed(next(fh)) for _ in xrange(locus.num_lines)]
def main():
    """Count the splice motifs flanking every intron found in a BED file.

    Command-line arguments:
        genome_fasta_file: genome FASTA file the intron-edge bases are
            fetched from.
        bed_file: BED file, parsed line by line with ``Transfrag.from_bed``.

    Each distinct (chrom, start, end, strand) intron contributes one motif:
    its first two bases followed by its last two, reverse-complemented for
    negative-strand introns.  A tab-delimited table (motif, count, frac) is
    printed to standard output, most common motif first.  A missing input
    file is reported through ``parser.error``.
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('genome_fasta_file')
    parser.add_argument('bed_file')
    args = parser.parse_args()
    # check args
    if not os.path.exists(args.genome_fasta_file):
        parser.error('genome fasta file %s not found' % args.genome_fasta_file)
    if not os.path.exists(args.bed_file):
        parser.error('bed file %s not found' % args.bed_file)
    logging.info('genome fasta file: %s' % args.genome_fasta_file)
    logging.info('bed file: %s' % args.bed_file)

    # process bed file to get junctions
    logging.info('Reading Junctions')
    splice_juncs = set()
    motif_counter = Counter()
    fasta_fh = FastaFile(args.genome_fasta_file)
    # try/finally so the FASTA handle is released even when parsing the
    # BED file or fetching sequence raises
    try:
        with open(args.bed_file) as bed_fh:
            for line in bed_fh:
                t = Transfrag.from_bed(line)
                # skip transfrags on references absent from the FASTA file
                if t.chrom not in fasta_fh:
                    continue
                for start, end in t.iterintrons():
                    splice_juncs.add((t.chrom, start, end, t.strand))
        logging.info('Read %d Junctions' % (len(splice_juncs)))

        logging.info('Profiling Splice Motifs')
        for chrom, start, end, strand in splice_juncs:
            # two bases at the intron start plus two bases at its end
            s = fasta_fh.fetch(chrom, start, start + 2)
            s += fasta_fh.fetch(chrom, end - 2, end)
            if strand == Strand.NEG:
                s = dna_reverse_complement(s)
            motif_counter[s] += 1
    finally:
        fasta_fh.close()

    # report statistics
    total = sum(motif_counter.values())
    # a single parenthesised argument prints identically whether 'print' is
    # the statement or the function
    print('\t'.join(['motif', 'count', 'frac']))
    for motif, count in motif_counter.most_common():
        print('\t'.join([motif, str(count), str(float(count) / total)]))
    logging.info('Done')
def test_split_transfrag():
    """Split transfrags A-D of splice_sites.gtf at the graph's boundaries."""
    loci = read_gtf("splice_sites.gtf")
    _, gtf_lines = loci[0]
    t_dict = Transfrag.parse_gtf(gtf_lines)
    sg = SpliceGraph.create(t_dict.values())
    boundaries = array("i", sg._find_node_boundaries())
    # check nodes: transfrag id -> node intervals expected after splitting
    expected_nodes = (
        ("A", ((10, 100), (200, 250), (250, 300), (400, 525))),
        ("B", ((10, 100), (250, 300), (400, 525))),
        ("C", ((150, 200), (200, 250), (250, 300), (400, 525))),
        ("D", ((375, 400), (400, 525))),
    )
    for t_id, wanted in expected_nodes:
        nodes = tuple(split_transfrag(t_dict[t_id], boundaries))
        assert nodes == wanted
def test_split_transfrag():
    """Split transfrags A-D of splice_sites.gtf at the graph's boundaries."""
    loci = read_gtf('splice_sites.gtf')
    _, gtf_lines = loci[0]
    t_dict = Transfrag.parse_gtf(gtf_lines)
    sg = SpliceGraph.create(t_dict.values())
    boundaries = array('i', sg._find_node_boundaries())
    # check nodes: transfrag id -> node intervals expected after splitting
    expected_nodes = (
        ('A', ((10, 100), (200, 250), (250, 300), (400, 525))),
        ('B', ((10, 100), (250, 300), (400, 525))),
        ('C', ((150, 200), (200, 250), (250, 300), (400, 525))),
        ('D', ((375, 400), (400, 525))),
    )
    for t_id, wanted in expected_nodes:
        nodes = tuple(split_transfrag(t_dict[t_id], boundaries))
        assert nodes == wanted
def test_create_locus():
    """Build a Locus from splice_sites.gtf and probe its expression data."""
    loci = read_gtf('splice_sites.gtf')
    assert len(loci) == 1
    interval, gtf_lines = loci[0]
    assert interval == ('chr1', 10, 525)
    t_dict = Transfrag.parse_gtf(gtf_lines)
    locus = Locus.create(t_dict.values())
    assert locus.chrom == 'chr1'
    assert locus.start == 10
    assert locus.end == 525

    # (start, end) windows inside the locus and their expected values
    expected_expr = (
        ((49, 51), [1.0, 2.0]),
        ((150, 151), [1.0]),
        ((499, 501), [3.0, 1.0]),
    )
    for (start, end), wanted in expected_expr:
        a = locus.get_expr_data(start, end, Strand.POS)
        assert np.array_equal(a, wanted)

    # windows reaching past either end of the locus must be rejected
    for start, end in ((5, 15), (520, 530)):
        with pytest.raises(TacoError):
            locus.get_expr_data(start, end, Strand.POS)
def test_multi_strand1():
    """Check per-strand splice graphs built from multi_strand1.gtf.

    Covers locus coordinates, the error raised when mixing strands in one
    graph, each strand's coordinates, reference sites and expression data,
    and node boundaries with and without guided ends/assembly.
    """
    # read gtf and test basic values
    loci = read_gtf("multi_strand1.gtf")
    assert len(loci) == 1
    interval, gtf_lines = loci[0]
    assert interval == ("chr1", 100, 1000)
    t_dict = Transfrag.parse_gtf(gtf_lines)
    assert len(t_dict) == 5
    locus = Locus.create(t_dict.values())
    assert locus.chrom == "chr1"
    assert locus.start == 100
    assert locus.end == 1000

    # raise exception when creating with multiple strands
    with pytest.raises(TacoError):
        SpliceGraph.create(t_dict.values())

    pos_transfrags = locus.get_transfrags(Strand.POS)
    neg_transfrags = locus.get_transfrags(Strand.NEG)
    sgpos = SpliceGraph.create(pos_transfrags)
    sgneg = SpliceGraph.create(neg_transfrags)

    # test positive strand graph
    assert sgpos.chrom == "chr1"
    assert sgpos.start == 100
    assert sgpos.end == 650
    assert sgpos.strand == Strand.POS
    assert sgpos.ref_start_sites == [150]
    assert sgpos.ref_stop_sites == [600]
    for start, end in ((90, 110), (650, 655)):
        with pytest.raises(TacoError):
            sgpos.get_expr_data(start, end)
    assert np.array_equal(sgpos.get_expr_data(100, 105), np.ones(5))

    # test negative strand graph
    assert sgneg.chrom == "chr1"
    assert sgneg.start == 350
    assert sgneg.end == 1000
    assert sgneg.strand == Strand.NEG
    assert sgneg.ref_start_sites == [1000]
    assert sgneg.ref_stop_sites == [350]
    for start, end in ((340, 350), (1000, 1010)):
        with pytest.raises(TacoError):
            sgneg.get_expr_data(start, end)
    assert np.array_equal(sgneg.get_expr_data(400, 405), np.ones(5))
    assert np.array_equal(sgneg.get_expr_data(945, 950), np.zeros(5))
    assert np.array_equal(sgneg.get_expr_data(950, 955), np.ones(5))
    assert np.array_equal(sgneg.get_expr_data(980, 985), np.zeros(5))

    # test locus boundaries
    pos_bounds = tuple(sgpos._find_node_boundaries())
    assert pos_bounds == (100, 200, 300, 400, 650)
    neg_bounds = tuple(sgneg._find_node_boundaries())
    assert neg_bounds == (350, 400, 500, 950, 980, 1000)

    # added guided ends/assembly to use boundaries from reference
    guided = dict(guided_ends=True, guided_assembly=True)
    lpos = SpliceGraph.create(pos_transfrags, **guided)
    pos_bounds = tuple(lpos._find_node_boundaries())
    assert pos_bounds == (100, 150, 200, 300, 400, 500, 600, 650)
    lneg = SpliceGraph.create(neg_transfrags, **guided)
    neg_bounds = tuple(lneg._find_node_boundaries())
    assert neg_bounds == (350, 400, 500, 750, 900, 950, 980, 1000)
def test_multi_strand1():
    """Check per-strand splice graphs built from multi_strand1.gtf.

    Covers locus coordinates, the error raised when mixing strands in one
    graph, each strand's coordinates, reference sites and expression data,
    and node boundaries with and without guided ends/assembly.
    """
    # read gtf and test basic values
    loci = read_gtf('multi_strand1.gtf')
    assert len(loci) == 1
    interval, gtf_lines = loci[0]
    assert interval == ('chr1', 100, 1000)
    t_dict = Transfrag.parse_gtf(gtf_lines)
    assert len(t_dict) == 5
    locus = Locus.create(t_dict.values())
    assert locus.chrom == 'chr1'
    assert locus.start == 100
    assert locus.end == 1000

    # raise exception when creating with multiple strands
    with pytest.raises(TacoError):
        SpliceGraph.create(t_dict.values())

    pos_transfrags = locus.get_transfrags(Strand.POS)
    neg_transfrags = locus.get_transfrags(Strand.NEG)
    sgpos = SpliceGraph.create(pos_transfrags)
    sgneg = SpliceGraph.create(neg_transfrags)

    # test positive strand graph
    assert sgpos.chrom == 'chr1'
    assert sgpos.start == 100
    assert sgpos.end == 650
    assert sgpos.strand == Strand.POS
    assert sgpos.ref_start_sites == [150]
    assert sgpos.ref_stop_sites == [600]
    for start, end in ((90, 110), (650, 655)):
        with pytest.raises(TacoError):
            sgpos.get_expr_data(start, end)
    assert np.array_equal(sgpos.get_expr_data(100, 105), np.ones(5))

    # test negative strand graph
    assert sgneg.chrom == 'chr1'
    assert sgneg.start == 350
    assert sgneg.end == 1000
    assert sgneg.strand == Strand.NEG
    assert sgneg.ref_start_sites == [1000]
    assert sgneg.ref_stop_sites == [350]
    for start, end in ((340, 350), (1000, 1010)):
        with pytest.raises(TacoError):
            sgneg.get_expr_data(start, end)
    assert np.array_equal(sgneg.get_expr_data(400, 405), np.ones(5))
    assert np.array_equal(sgneg.get_expr_data(945, 950), np.zeros(5))
    assert np.array_equal(sgneg.get_expr_data(950, 955), np.ones(5))
    assert np.array_equal(sgneg.get_expr_data(980, 985), np.zeros(5))

    # test locus boundaries
    pos_bounds = tuple(sgpos._find_node_boundaries())
    assert pos_bounds == (100, 200, 300, 400, 650)
    neg_bounds = tuple(sgneg._find_node_boundaries())
    assert neg_bounds == (350, 400, 500, 950, 980, 1000)

    # added guided ends/assembly to use boundaries from reference
    guided = dict(guided_ends=True, guided_assembly=True)
    lpos = SpliceGraph.create(pos_transfrags, **guided)
    pos_bounds = tuple(lpos._find_node_boundaries())
    assert pos_bounds == (100, 150, 200, 300, 400, 500, 600, 650)
    lneg = SpliceGraph.create(neg_transfrags, **guided)
    neg_bounds = tuple(lneg._find_node_boundaries())
    assert neg_bounds == (350, 400, 500, 750, 900, 950, 980, 1000)