def assemble_locus(locus, transfrags, config):
    """Filter the transfrags of one locus, build a Locus and assemble it.

    locus      -- object exposing ``chrom``, ``start``, ``end`` and ``name``;
                  used only for log labels and as the id handed to
                  ``assemble_gene``
    transfrags -- sized collection of transfrags belonging to the locus
    config     -- run configuration; the attributes read here are
                  ``min_transfrag_length``, ``min_expr``,
                  ``transfrags_filtered_gtf_fh``, ``guided_strand``,
                  ``guided_ends``, ``guided_assembly``, ``bedgraph_fhs``
                  and ``splice_bed_fh``

    Returns None. Nothing beyond the first debug message happens when no
    transfrags survive filtering.
    """
    locus_id = locus.name
    input_region = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)

    # filter transfrags by expression and transfrag length
    num_input = len(transfrags)
    transfrags = filter_transfrags(transfrags,
                                   config.min_transfrag_length,
                                   config.min_expr,
                                   config.transfrags_filtered_gtf_fh)
    num_kept = len(transfrags)
    logging.debug('%s locus: %s transfrags: %d filtered: %d' %
                  (input_region, locus_id, num_input, num_kept))
    if num_kept == 0:
        return

    # create the Locus; log labels now use the coordinates it reports
    built = Locus.create(transfrags,
                         config.guided_strand,
                         config.guided_ends,
                         config.guided_assembly)
    built_region = '%s:%d-%d' % (built.chrom, built.start, built.end)

    def strand_counts():
        # number of transfrags currently assigned to +, - and unknown strand
        return tuple(len(built.get_transfrags(strand))
                     for strand in (Strand.POS, Strand.NEG, Strand.NA))

    num_in_locus = len(transfrags)
    logging.debug('%s locus: %s transfrags: %d (+: %d, -: %d, .: %d)' %
                  ((built_region, locus_id, num_in_locus) + strand_counts()))

    # resolve unstranded transcripts
    num_resolved = built.impute_unknown_strands()
    if num_resolved > 0:
        logging.debug('%s locus: %s resolved: %d (+: %d, -: %d, .: %d)' %
                      ((built_region, locus_id, num_resolved) +
                       strand_counts()))

    # write bedgraph files after strands are resolved, then splice junctions
    built.write_bedgraph(config.bedgraph_fhs)
    built.write_splice_bed(config.splice_bed_fh)

    # assemble each splice graph produced by the locus
    for sgraph in built.create_splice_graphs():
        assemble_gene(sgraph, locus_id, config)
def test_create_locus():
    """Check Locus bounds and expression data for the splice_sites.gtf fixture."""
    loci = read_gtf('splice_sites.gtf')
    assert len(loci) == 1
    region, records = loci[0]
    assert region == ('chr1', 10, 525)

    parsed = Transfrag.parse_gtf(records)
    locus = Locus.create(parsed.values())
    assert (locus.chrom, locus.start, locus.end) == ('chr1', 10, 525)

    # queries inside the locus on the + strand: (start, end, expected values)
    in_range = [
        (49, 51, [1.0, 2.0]),
        (150, 151, [1.0]),
        (499, 501, [3.0, 1.0]),
    ]
    for start, end, expected in in_range:
        observed = locus.get_expr_data(start, end, Strand.POS)
        assert np.array_equal(observed, expected)

    # queries reaching past either end of the locus (10-525) must raise
    for start, end in [(5, 15), (520, 530)]:
        with pytest.raises(TacoError):
            locus.get_expr_data(start, end, Strand.POS)
def assemble_locus(locus_index, transfrags, config):
    """Build a Locus from transfrags and assemble each of its splice graphs.

    locus_index -- object whose ``name`` labels log messages and is passed
                   to ``assemble_gene``
    transfrags  -- sized collection of transfrags; an empty collection makes
                   the function return immediately
    config      -- run configuration; the attributes read here are
                   ``guided_strand``, ``guided_ends``, ``guided_assembly``,
                   ``bedgraph_fhs``, ``splice_bed_fh`` and
                   ``assemble_unstranded``

    Returns None.
    """
    if len(transfrags) == 0:
        return

    # create locus
    locus = Locus.create(transfrags,
                         config.guided_strand,
                         config.guided_ends,
                         config.guided_assembly)
    locus_name = locus_index.name
    logging.debug('%s locus: %s' % (locus, locus_name))

    # resolve unstranded transcripts
    num_resolved = locus.impute_unknown_strands()
    if num_resolved > 0:
        logging.debug('%s locus: %s resolved: %d' %
                      (locus, locus_name, num_resolved))

    # write bedgraph files after strands are resolved, then splice junctions
    locus.write_bedgraph(config.bedgraph_fhs)
    locus.write_splice_bed(config.splice_bed_fh)

    # assemble every splice graph, leaving out unstranded graphs unless the
    # configuration asks for them
    for sgraph in locus.create_splice_graphs():
        skip_unstranded = ((sgraph.strand == Strand.NA) and
                           not config.assemble_unstranded)
        if not skip_unstranded:
            assemble_gene(sgraph, locus_name, config)
def assemble_locus(locus, transfrags, config):
    """Filter one locus's transfrags, build a Locus and assemble its graphs.

    locus      -- object exposing ``chrom``, ``start``, ``end`` and ``name``
    transfrags -- sized collection of transfrags for the locus
    config     -- run configuration supplying the filtering thresholds
                  (``min_transfrag_length``, ``min_expr``), the guided
                  options (``guided_strand``, ``guided_ends``,
                  ``guided_assembly``) and the output handles
                  (``transfrags_filtered_gtf_fh``, ``bedgraph_fhs``,
                  ``splice_bed_fh``)

    Returns None; stops after the first debug message when filtering leaves
    no transfrags.
    """
    strands = (Strand.POS, Strand.NEG, Strand.NA)
    locus_id = locus.name
    region = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)

    # filter transfrags by expression and transfrag length
    total_before = len(transfrags)
    transfrags = filter_transfrags(transfrags,
                                   config.min_transfrag_length,
                                   config.min_expr,
                                   config.transfrags_filtered_gtf_fh)
    total_after = len(transfrags)
    logging.debug(
        '%s locus: %s transfrags: %d filtered: %d' %
        (region, locus_id, total_before, total_after))
    if total_after == 0:
        return

    # create locus and refresh the region label from its coordinates
    assembled = Locus.create(transfrags,
                             config.guided_strand,
                             config.guided_ends,
                             config.guided_assembly)
    region = '%s:%d-%d' % (assembled.chrom, assembled.start, assembled.end)
    fields = [region, locus_id, len(transfrags)]
    fields += [len(assembled.get_transfrags(s)) for s in strands]
    logging.debug('%s locus: %s transfrags: %d (+: %d, -: %d, .: %d)' %
                  tuple(fields))

    # resolve unstranded transcripts
    num_resolved = assembled.impute_unknown_strands()
    if num_resolved > 0:
        fields = [region, locus_id, num_resolved]
        fields += [len(assembled.get_transfrags(s)) for s in strands]
        logging.debug('%s locus: %s resolved: %d (+: %d, -: %d, .: %d)' %
                      tuple(fields))

    # write bedgraph files after strand resolved
    assembled.write_bedgraph(config.bedgraph_fhs)
    # write splice junctions
    assembled.write_splice_bed(config.splice_bed_fh)

    # assemble each splice graph of the locus
    for sgraph in assembled.create_splice_graphs():
        assemble_gene(sgraph, locus_id, config)
def test_multi_strand1():
    """Check per-strand splice graphs built from the multi_strand1.gtf fixture."""
    # read gtf and test basic values
    loci = read_gtf("multi_strand1.gtf")
    assert len(loci) == 1
    region, records = loci[0]
    assert region == ("chr1", 100, 1000)
    parsed = Transfrag.parse_gtf(records)
    assert len(parsed) == 5
    locus = Locus.create(parsed.values())
    assert (locus.chrom, locus.start, locus.end) == ("chr1", 100, 1000)

    # raise exception when creating with multiple strands
    with pytest.raises(TacoError):
        SpliceGraph.create(parsed.values())

    plus_frags = locus.get_transfrags(Strand.POS)
    minus_frags = locus.get_transfrags(Strand.NEG)
    sg_plus = SpliceGraph.create(plus_frags)
    sg_minus = SpliceGraph.create(minus_frags)

    # + strand graph: bounds, reference sites, expression
    assert (sg_plus.chrom, sg_plus.start, sg_plus.end) == ("chr1", 100, 650)
    assert sg_plus.strand == Strand.POS
    assert sg_plus.ref_start_sites == [150]
    assert sg_plus.ref_stop_sites == [600]
    for start, end in ((90, 110), (650, 655)):
        with pytest.raises(TacoError):
            sg_plus.get_expr_data(start, end)
    assert np.array_equal(sg_plus.get_expr_data(100, 105), np.ones(5))

    # - strand graph: bounds, reference sites, expression
    assert (sg_minus.chrom, sg_minus.start, sg_minus.end) == ("chr1", 350, 1000)
    assert sg_minus.strand == Strand.NEG
    assert sg_minus.ref_start_sites == [1000]
    assert sg_minus.ref_stop_sites == [350]
    for start, end in ((340, 350), (1000, 1010)):
        with pytest.raises(TacoError):
            sg_minus.get_expr_data(start, end)
    minus_expr = (
        (400, 405, np.ones(5)),
        (945, 950, np.zeros(5)),
        (950, 955, np.ones(5)),
        (980, 985, np.zeros(5)),
    )
    for start, end, expected in minus_expr:
        assert np.array_equal(sg_minus.get_expr_data(start, end), expected)

    # test locus boundaries
    assert tuple(sg_plus._find_node_boundaries()) == (100, 200, 300, 400, 650)
    assert tuple(sg_minus._find_node_boundaries()) == (
        350, 400, 500, 950, 980, 1000)

    # added guided ends/assembly to use boundaries from reference
    guided = dict(guided_ends=True, guided_assembly=True)
    guided_plus = SpliceGraph.create(plus_frags, **guided)
    assert tuple(guided_plus._find_node_boundaries()) == (
        100, 150, 200, 300, 400, 500, 600, 650)
    guided_minus = SpliceGraph.create(minus_frags, **guided)
    assert tuple(guided_minus._find_node_boundaries()) == (
        350, 400, 500, 750, 900, 950, 980, 1000)
def test_multi_strand1():
    """Check per-strand splice graphs built from the multi_strand1.gtf fixture."""
    # read gtf and test basic values
    loci = read_gtf('multi_strand1.gtf')
    assert len(loci) == 1
    region, records = loci[0]
    assert region == ('chr1', 100, 1000)
    parsed = Transfrag.parse_gtf(records)
    assert len(parsed) == 5
    locus = Locus.create(parsed.values())
    assert (locus.chrom, locus.start, locus.end) == ('chr1', 100, 1000)

    # raise exception when creating with multiple strands
    with pytest.raises(TacoError):
        SpliceGraph.create(parsed.values())

    plus_frags = locus.get_transfrags(Strand.POS)
    minus_frags = locus.get_transfrags(Strand.NEG)
    sg_plus = SpliceGraph.create(plus_frags)
    sg_minus = SpliceGraph.create(minus_frags)

    # + strand graph: bounds, reference sites, expression
    assert (sg_plus.chrom, sg_plus.start, sg_plus.end) == ('chr1', 100, 650)
    assert sg_plus.strand == Strand.POS
    assert sg_plus.ref_start_sites == [150]
    assert sg_plus.ref_stop_sites == [600]
    for start, end in ((90, 110), (650, 655)):
        with pytest.raises(TacoError):
            sg_plus.get_expr_data(start, end)
    assert np.array_equal(sg_plus.get_expr_data(100, 105), np.ones(5))

    # - strand graph: bounds, reference sites, expression
    assert (sg_minus.chrom, sg_minus.start, sg_minus.end) == ('chr1', 350, 1000)
    assert sg_minus.strand == Strand.NEG
    assert sg_minus.ref_start_sites == [1000]
    assert sg_minus.ref_stop_sites == [350]
    for start, end in ((340, 350), (1000, 1010)):
        with pytest.raises(TacoError):
            sg_minus.get_expr_data(start, end)
    minus_expr = (
        (400, 405, np.ones(5)),
        (945, 950, np.zeros(5)),
        (950, 955, np.ones(5)),
        (980, 985, np.zeros(5)),
    )
    for start, end, expected in minus_expr:
        assert np.array_equal(sg_minus.get_expr_data(start, end), expected)

    # test locus boundaries
    assert tuple(sg_plus._find_node_boundaries()) == (100, 200, 300, 400, 650)
    assert tuple(sg_minus._find_node_boundaries()) == (
        350, 400, 500, 950, 980, 1000)

    # added guided ends/assembly to use boundaries from reference
    guided = dict(guided_ends=True, guided_assembly=True)
    guided_plus = SpliceGraph.create(plus_frags, **guided)
    assert tuple(guided_plus._find_node_boundaries()) == (
        100, 150, 200, 300, 400, 500, 600, 650)
    guided_minus = SpliceGraph.create(minus_frags, **guided)
    assert tuple(guided_minus._find_node_boundaries()) == (
        350, 400, 500, 750, 900, 950, 980, 1000)