def open_trackfactory(parser, options):
    """Open options.file as a read/write TrackFactory and return it.

    Aborts via parser.error() (closing the factory first) when a track
    named options.name already exists in the file.
    """
    factory = TrackFactory(options.file, "r+")
    if factory.has_track(options.name):
        # refuse to clobber an existing track; release the handle before bailing
        factory.close()
        parser.error("trackfactory '%s' already has track named '%s'" % (options.file, options.name))
    return factory
def create_trackfactory(parser, options):
    """Create a new trackfactory file populated with reference sequences.

    References are read from options.refs according to options.refs_type
    ('chrom_sizes', 'sam', 'bam', or 'bowtie_index').  Aborts via
    parser.error() on missing filename, pre-existing output file, missing
    references file, or an unrecognized refs_type.
    """
    if options.file is None:
        parser.error("no filename specified")
    if os.path.exists(options.file):
        parser.error("file %s exists, cannot overwrite" % (options.file))
    # get references
    if options.refs_type in ("chrom_sizes", "sam", "bam"):
        # all three file-based sources share the same existence check
        if not os.path.exists(options.refs):
            parser.error("references file %s not found" % (options.refs))
        if options.refs_type == "chrom_sizes":
            # each line: <name> <length> [...]; close the handle explicitly
            # (the original leaked it)
            fileh = open(options.refs)
            try:
                refs = [tuple(line.strip().split(None,2)) for line in fileh]
            finally:
                fileh.close()
        elif options.refs_type == "sam":
            refs = get_refs_from_sam(options.refs)
        else:
            refs = get_refs_from_bam(options.refs)
    elif options.refs_type == "bowtie_index":
        if not check_executable("bowtie-inspect"):
            parser.error("'bowtie-inspect' executable not found")
        refs = get_refs_from_bowtie_index(options.refs)
    else:
        # previously an unrecognized refs_type fell through all branches and
        # raised NameError on 'refs'; fail with a clear message instead
        parser.error("unrecognized refs_type '%s'" % (options.refs_type))
    tf = TrackFactory(options.file, 'w', refs=refs)
    tf.close()
    logging.info("created trackfactory %s using refs from %s (%s)" % (options.file, options.refs, options.refs_type))
def list_tracks(parser, options):
    """Print a tab-separated table of (track_name, track_type) for every
    track in the trackfactory file given by options.file.

    Aborts via parser.error() if no filename was given or the file is
    missing.
    """
    if options.file is None:
        parser.error("no filename specified")
    if not os.path.exists(options.file):
        parser.error("file '%s' not found" % (options.file))
    tf = TrackFactory(options.file, "r")
    # header row plus separator row, then one row per track
    print '\t'.join(["track_name", "track_type"])
    print '\t'.join(["==========", "=========="])
    # iterating a TrackFactory yields per-track field sequences
    for t in tf:
        print '\t'.join(t)
    tf.close()
def setUp(self):
    """Build a fresh temporary TrackFactory fixture for each test."""
    self.filename = mktemp(prefix="tmp", suffix=".h5")
    self.length = 100
    self.isize_max = 10
    # two reference sequences: 'gene1' (length 100) and 'gene2' (length 10)
    self.refs = (('gene1', self.length), ('gene2', 10))
    self.tf = TrackFactory(self.filename, 'w', refs=self.refs)
def remove_track(parser, options):
    """Delete track options.name from the trackfactory at options.file.

    Aborts via parser.error() (closing the factory first) when no such
    track exists.
    """
    factory = TrackFactory(options.file, "r+")
    if not factory.has_track(options.name):
        factory.close()
        parser.error("trackfactory '%s' does not contain track '%s'" % (options.file, options.name))
    factory.delete_track(options.name)
    factory.close()
    logging.info("removed track '%s' from trackfactory '%s'" % (options.name, options.file))
def view_track(parser, options):
    """Print the contents of one track over a genomic region to stdout.

    Output format depends on the track's type (sequence, array, vector,
    or rnaseq) and, for array/vector tracks, on options.file_type
    ('bedgraph' vs. raw slice).  Aborts via parser.error() if the track
    does not exist.
    """
    tf = TrackFactory(options.file, "r")
    if not tf.has_track(options.name):
        tf.close()
        parser.error("trackfactory '%s' does not contain track '%s'" % (options.file, options.name))
    region = parse_interval(options.region)
    t = tf.get_track(options.name)
    track_type = t.get_type()
    logging.debug("opened track '%s' type '%s'" % (options.name, track_type))
    # dispatch on the track's stored type name
    if track_type == SequenceTrack.__name__:
        print t[region]
    # NOTE(review): this is 'if', not 'elif', unlike the branches below —
    # harmless since track_type is a single value, but looks unintended
    if track_type == ArrayTrack.__name__:
        if options.file_type == "bedgraph":
            t.tobedgraph(region, sys.stdout)
        else:
            print t[region]
    elif track_type == VectorTrack.__name__:
        if options.file_type == "bedgraph":
            # vector tracks support normalized, read/allele-selected output
            readnum = options.readnum
            allele = options.allele
            t.tobedgraph(region, sys.stdout, norm=True, read=readnum, allele=allele)
        else:
            print t[region]
    elif track_type == RnaseqTrack.__name__:
        # rnaseq tracks bundle a coverage track and a junction track
        cov_track = t.get_coverage_track()
        print cov_track.density(region)
        junc_track = t.get_junction_track()
        print junc_track[region]
    logging.debug("done")
    tf.close()
class TestVectorTrack(unittest.TestCase):
    """Tests for VectorTrack channel layout, interval loading, and the
    count/coverage/density query functions."""
    def setUp(self):
        # fresh temporary trackfactory per test; two refs: gene1 (100bp), gene2 (10bp)
        filename = mktemp(prefix="tmp", suffix=".h5")
        self.filename = filename
        self.length = 100
        self.isize_max = 10
        self.refs = (('gene1', self.length), ('gene2', 10))
        self.tf = TrackFactory(self.filename, 'w', refs=self.refs)
    def tearDown(self):
        # close and remove the temporary file created in setUp
        self.tf.close()
        if os.path.exists(self.filename):
            os.remove(self.filename)
    def test_get_channel_dict(self):
        """Check the (read, strand, allele) -> channel-index mapping for
        every combination of the is_pe/is_strand/is_allele flags."""
        # default
        d = get_channel_dict(is_pe=False, is_strand=False, is_allele=False)
        for v in d.values():
            self.assertEqual(v, (0,))
        # paired end
        d = get_channel_dict(is_pe=True, is_strand=False, is_allele=False)
        self.assertEqual(d[(0,NO_STRAND,"A")], (0,))
        self.assertEqual(d[(0,POS_STRAND,"G")], (0,))
        self.assertEqual(d[(1,NEG_STRAND,"C")], (1,))
        self.assertEqual(d[(1,POS_STRAND,"T")], (1,))
        # read=None means "either read" and maps to both channels
        self.assertEqual(d[(None,NEG_STRAND,"N")], (0,1))
        # strand
        d = get_channel_dict(is_pe=False, is_strand=True, is_allele=False)
        self.assertEqual(d[(0,NO_STRAND,"A")], (0,1))
        self.assertEqual(d[(1,POS_STRAND,"G")], (0,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (1,))
        # allele
        d = get_channel_dict(is_pe=False, is_strand=False, is_allele=True)
        self.assertEqual(d[(0,NO_STRAND,"A")], (0,))
        self.assertEqual(d[(1,POS_STRAND,"G")], (1,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (2,))
        self.assertEqual(d[(0,NEG_STRAND,"T")], (3,))
        # allele "N" matches all four allele channels
        self.assertEqual(d[(0,NO_STRAND,"N")], (0,1,2,3,))
        # pe/strand
        d = get_channel_dict(is_pe=True, is_strand=True, is_allele=False)
        self.assertEqual(d[(None,NO_STRAND,"A")], (0,1,2,3))
        self.assertEqual(d[(1,POS_STRAND,"G")], (1,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (2,))
        self.assertEqual(d[(1,NEG_STRAND,"T")], (3,))
        self.assertEqual(d[(None,NEG_STRAND,"N")], (2,3))
        # pe/allele
        d = get_channel_dict(is_pe=True, is_strand=False, is_allele=True)
        self.assertEqual(d[(0,NO_STRAND,"A")], (0,))
        self.assertEqual(d[(0,POS_STRAND,"G")], (2,))
        self.assertEqual(d[(1,NEG_STRAND,"C")], (5,))
        self.assertEqual(d[(1,POS_STRAND,"T")], (7,))
        self.assertEqual(d[(None,NEG_STRAND,"N")], (0,1,2,3,4,5,6,7))
        # strand/allele
        d = get_channel_dict(is_pe=False, is_strand=True, is_allele=True)
        self.assertEqual(d[(None,NO_STRAND,"A")], (0,1))
        self.assertEqual(d[(1,POS_STRAND,"G")], (2,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (5,))
        self.assertEqual(d[(1,NEG_STRAND,"T")], (7,))
        self.assertEqual(d[(None,NEG_STRAND,"N")], (1,3,5,7))
        # pe/strand/allele
        d = get_channel_dict(is_pe=True, is_strand=True, is_allele=True)
        self.assertEqual(d[(None,NO_STRAND,"A")], (0,1,2,3))
        self.assertEqual(d[(1,POS_STRAND,"G")], (5,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (10,))
        self.assertEqual(d[(1,NO_STRAND,"T")], (13,15))
        self.assertEqual(d[(None,NO_STRAND,"N")], tuple(range(0,16)))
        self.assertEqual(d[(None,NEG_STRAND,"N")], (2,3,6,7,10,11,14,15))
        self.assertEqual(d[(1,POS_STRAND,"N")], (1,5,9,13))
    def test_fromintervals(self):
        """Loading random intervals into single- and multi-channel tracks
        reproduces the independently-accumulated coverage array."""
        dtype = "i"
        intervals1, correct1 = random_intervals(1000, self.length, self.isize_max, dtype)
        # try one channel array
        t = self.tf.create_track("a", VectorTrack, dtype, pe=False, strand=False, allele=False)
        t.fromintervals(iter(intervals1))
        self.assertTrue(np.all(t["gene1"][:,0] == correct1))
        # try three channel array
        intervals2, correct2 = random_intervals(1000, self.length, self.isize_max, dtype)
        intervals3, correct3 = random_intervals(1000, self.length, self.isize_max, dtype)
        t = self.tf.create_track("b", VectorTrack, dtype)
        t.fromintervals(iter(intervals1))
        self.assertTrue(np.all(t["gene1"][:,0] == correct1))
        self.assertFalse(np.all(t["gene1"][:,0] == correct2))
        self.assertFalse(np.all(t["gene1"][:,0] == correct3))
        #t.fromintervals(iter(intervals2), channel=1)
        #t.fromintervals(iter(intervals3), channel=2)
        #self.assertTrue(np.all(t["gene1"][:,1] == correct2))
        #self.assertFalse(np.all(t["gene1"][:,1] == correct3))
        #self.assertTrue(np.all(t["gene1"][:,2] == correct3))
        #self.assertFalse(np.all(t["gene1"][:,2] == correct1))
    def test_stranded_intervals(self):
        """testing allocating coverage to both strands"""
        dtype = "i4"
        intervals1, correct1 = random_stranded_intervals(100, self.length, self.isize_max, dtype)
        total_cov = correct1.sum()
        t = self.tf.create_track("a", VectorTrack, strand=True)
        # test loading from intervals
        t.fromintervals(iter(intervals1))
        self.assertTrue(np.all(t["gene1"] == correct1))
        # test count function
        intervals2, correct2 = random_intervals(1000, self.length, self.isize_max, dtype)
        for ival in intervals2:
            ref = ival.ref
            start = ival.start
            end = ival.end
            strand = ival.strand
            val = ival.value
            # check plus strand (channel 0 of the reference array)
            # count
            mycount = t.count((ref, start, end, POS_STRAND, val))
            correctcount = correct1[start:end,0].sum()
            self.assertAlmostEqual(mycount, correctcount)
            # coverage
            mycov = t.coverage((ref, start, end, POS_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end,0] / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            # density
            mydens = t.density((ref, start, end, POS_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))
            # check minus strand (channel 1)
            # count
            mycount = t.count((ref, start, end, NEG_STRAND, val))
            correctcount = correct1[start:end,1].sum()
            self.assertAlmostEqual(mycount, correctcount)
            # coverage
            mycov = t.coverage((ref, start, end, NEG_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end,1] / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            # density
            mydens = t.density((ref, start, end, NEG_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))
            # check both strands (NO_STRAND sums over both channels)
            mycount = t.count((ref, start, end, NO_STRAND, val))
            correctcount = correct1[start:end].sum()
            self.assertAlmostEqual(mycount, correctcount)
            # cov
            mycov = t.coverage((ref, start, end, NO_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end].sum(axis=1) / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            # density
            mydens = t.density((ref, start, end, NO_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))
    def test_stranded_allele_intervals(self):
        """testing coverage with allele frequencies"""
        dtype = "f"
        # per-strand channel index tuples from the channel layout helper
        channel_dict = get_channel_dict(False, True, True)
        pos_strand_channels = channel_dict[(None,POS_STRAND,None)]
        neg_strand_channels = channel_dict[(None,NEG_STRAND,None)]
        intervals1, correct1 = \
            random_stranded_allele_intervals(100, self.length, self.isize_max, dtype)
        total_cov = correct1.sum()
        t = self.tf.create_track("a", VectorTrack, strand=True, allele=True)
        # test loading from intervals
        t.fromintervals(iter(intervals1))
        self.assertTrue(np.all(t["gene1"] == correct1))
        # test count function
        intervals2, correct2 = random_intervals(10, self.length, self.isize_max, dtype)
        for ival in intervals2:
            ref = ival.ref
            start = ival.start
            end = ival.end
            strand = ival.strand
            val = ival.value
            # check plus strand
            mycount = t.count((ref, start, end, POS_STRAND, val))
            correctcount = correct1[start:end,pos_strand_channels].sum()
            self.assertAlmostEqual(mycount, correctcount)
            mycov = t.coverage((ref, start, end, POS_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end,pos_strand_channels].sum(axis=1) / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            mydens = t.density((ref, start, end, POS_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))
            # check minus strand
            mycount = t.count((ref, start, end, NEG_STRAND, val))
            correctcount = correct1[start:end,neg_strand_channels].sum()
            self.assertAlmostEqual(mycount, correctcount)
            mycov = t.coverage((ref, start, end, NEG_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end,neg_strand_channels].sum(axis=1) / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            mydens = t.density((ref, start, end, NEG_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))
            # check both strands
            mycount = t.count((ref, start, end, NO_STRAND, val))
            correctcount = correct1[start:end].sum()
            self.assertAlmostEqual(mycount, correctcount)
            mycov = t.coverage((ref, start, end, NO_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end].sum(axis=1) / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            mydens = t.density((ref, start, end, NO_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))
def main():
    """Count reads per gene (per BED exon) across one or more tracks and
    print a tab-separated table to stdout.

    Each positional track argument has the form NAME@FILE:H5PATH.  With
    --stranded, two columns (sense/antisense) are emitted per track;
    --aliases adds extra annotation columns keyed by gene name.
    """
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--stranded", dest="stranded", action="store_true", default=False)
    parser.add_argument("--ambiguous", dest="ambiguous", action="store_true", default=False)
    parser.add_argument("--aliases", dest="alias_file", default=None)
    parser.add_argument("bed")
    parser.add_argument("track_files", nargs="+")
    options = parser.parse_args()
    # alias file: '#'-prefixed header line names the extra columns; each
    # data line maps gene name -> its alias fields
    # NOTE(review): the file handle from open() here is never closed
    alias_dict = {}
    alias_header = []
    if options.alias_file is not None:
        for line in open(options.alias_file):
            if line.startswith("#"):
                alias_header = line.strip()[1:].split('\t')
                continue
            fields = line.strip().split('\t')
            alias_dict[fields[0]] = fields[1:]
    header_fields = alias_header + ["gene_name", "gene_interval", "gene_length"]
    # open every track: argument format is NAME@FILE:H5PATH
    tracks = []
    for track_path in options.track_files:
        name, path = track_path.split("@")
        file_path, h5_path = path.split(":")
        tf = TrackFactory(file_path, "r")
        t = tf.get_track(h5_path)
        # cache the track's reference names for fast membership tests
        tracks.append((name, tf, t, set(t.get_rnames())))
        if options.stranded:
            header_fields.append("%s_sense" % name)
            header_fields.append("%s_antisense" % name)
        else:
            header_fields.append(name)
    # output header
    print '\t'.join(map(str, header_fields))
    # read genes
    if options.ambiguous:
        genes = list(BedGene.parse(open(options.bed)))
    else:
        genes = filter_strand_conflicts(options.bed)
    # get counts
    for g in genes:
        alias_fields = alias_dict.get(g.name, ["None"] * len(alias_header))
        fields = ([g.name] + alias_fields + ["%s[%s]:%d-%d" % (g.chrom, g.strand, g.tx_start, g.tx_end), sum((end-start) for start,end in g.exons)])
        # NOTE(review): '+' genes map sense to NEG_STRAND — presumably a
        # strand-flipped (dUTP-style) protocol; confirm against the
        # library-prep convention used elsewhere in this project
        sense_strand = NEG_STRAND if g.strand == "+" else POS_STRAND
        # assumes POS_STRAND/NEG_STRAND are 0/1 so int(not x) flips them
        antisense_strand = int(not sense_strand)
        rname_found = False
        for name, tf, t, rnames in tracks:
            # skip tracks that don't know this chromosome
            if g.chrom not in rnames:
                continue
            rname_found = True
            if options.stranded:
                sense_count = 0
                antisense_count = 0
                for start, end in g.exons:
                    sense_count += t.count((g.chrom, start, end, sense_strand))
                    antisense_count += t.count((g.chrom, start, end, antisense_strand))
                fields.append(sense_count)
                fields.append(antisense_count)
            else:
                count = 0
                for start, end in g.exons:
                    count += t.count((g.chrom, start, end))
                fields.append(count)
        # only print genes present in at least one track
        if rname_found:
            print '\t'.join(map(str, fields))
    for name,tf,t,rnames in tracks:
        tf.close()