Example #1
def open_trackfactory(parser, options):
    tf = TrackFactory(options.file, "r+")
    if tf.has_track(options.name):
        tf.close()
        parser.error("trackfactory '%s' already has track named '%s'" %
                     (options.file, options.name))
    return tf
Example #2
def create_trackfactory(parser, options):
    if options.file is None:
        parser.error("no filename specified")
    if os.path.exists(options.file):
        parser.error("file %s exists, cannot overwrite" % (options.file))
    # get references
    if options.refs_type == "chrom_sizes":
        if not os.path.exists(options.refs):
            parser.error("references file %s not found" % (options.refs))        
        refs = [tuple(line.strip().split(None,2)) for line in open(options.refs)]
    elif options.refs_type == "sam":
        if not os.path.exists(options.refs):
            parser.error("references file %s not found" % (options.refs))
        refs = get_refs_from_sam(options.refs)
    elif options.refs_type == "bam":
        if not os.path.exists(options.refs):
            parser.error("references file %s not found" % (options.refs))
        refs = get_refs_from_bam(options.refs)
    elif options.refs_type == "bowtie_index":
        if not check_executable("bowtie-inspect"):
            parser.error("'bowtie-inspect' executable not found")        
        refs = get_refs_from_bowtie_index(options.refs)
    tf = TrackFactory(options.file, 'w', refs=refs) 
    tf.close()
    logging.info("created trackfactory %s using refs from %s (%s)" % 
                 (options.file, options.refs, options.refs_type))
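
For reference, a sketch of what the "chrom_sizes" branch above produces: the references file presumably holds one reference name and length per line (a chrom.sizes-style file), and split(None, 2) leaves the fields as strings. The file contents and reference names below are hypothetical.

# hypothetical chrom.sizes-style input:
#   chr1    248956422
#   chr2    242193529
# resulting refs list passed to TrackFactory:
refs = [("chr1", "248956422"), ("chr2", "242193529")]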
Example #3
def list_tracks(parser, options):
    if options.file is None:
        parser.error("no filename specified")
    if not os.path.exists(options.file):
        parser.error("file '%s' not found" % (options.file))
    tf = TrackFactory(options.file, "r")
    print '\t'.join(["track_name", "track_type"])
    print '\t'.join(["==========", "=========="])
    for t in tf:
        print '\t'.join(t)
    tf.close()
Example #4
def setUp(self):
    filename = mktemp(prefix="tmp", suffix=".h5")
    self.filename = filename
    self.length = 100
    self.isize_max = 10
    self.refs = (('gene1', self.length), ('gene2', 10))
    self.tf = TrackFactory(self.filename, 'w', refs=self.refs)
Example #5
def remove_track(parser, options):
    tf = TrackFactory(options.file, "r+")
    if not tf.has_track(options.name):
        tf.close()
        parser.error("trackfactory '%s' does not contain track '%s'" %
                     (options.file, options.name))
    tf.delete_track(options.name)    
    tf.close()
    logging.info("removed track '%s' from trackfactory '%s'" %
                 (options.name, options.file))
Example #6
def view_track(parser, options):
    tf = TrackFactory(options.file, "r")
    if not tf.has_track(options.name):
        tf.close()
        parser.error("trackfactory '%s' does not contain track '%s'" %
                     (options.file, options.name))    
    region = parse_interval(options.region)
    t = tf.get_track(options.name)
    track_type = t.get_type()
    logging.debug("opened track '%s' type '%s'" % (options.name, track_type))        
    if track_type == SequenceTrack.__name__:
        print t[region]
    elif track_type == ArrayTrack.__name__:
        if options.file_type == "bedgraph":
            t.tobedgraph(region, sys.stdout)
        else:
            print t[region]
    elif track_type == VectorTrack.__name__:
        if options.file_type == "bedgraph":
            readnum = options.readnum
            allele = options.allele
            t.tobedgraph(region, sys.stdout, norm=True, 
                         read=readnum, allele=allele)
        else:
            print t[region]
    elif track_type == RnaseqTrack.__name__:
        cov_track = t.get_coverage_track()
        print cov_track.density(region)
        junc_track = t.get_junction_track()
        print junc_track[region]
    logging.debug("done")
    tf.close()
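
As a quick reference, here is a minimal read-only access pattern assembled only from calls that appear in the examples above; the file name, track name, and region string are placeholders, and the exact region syntax accepted by parse_interval is assumed.

tf = TrackFactory("tracks.h5", "r")
if tf.has_track("mytrack"):
    t = tf.get_track("mytrack")
    region = parse_interval("chr1:1000-2000")  # region format is an assumption
    print t[region]
tf.close()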
Example #7
class TestVectorTrack(unittest.TestCase):
    def setUp(self):
        filename = mktemp(prefix="tmp", suffix=".h5")
        self.filename = filename
        self.length = 100
        self.isize_max = 10
        self.refs = (('gene1', self.length), ('gene2', 10))
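        # refs are (reference name, length) pairs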
        self.tf = TrackFactory(self.filename, 'w', refs=self.refs) 

    def tearDown(self):
        self.tf.close()
        if os.path.exists(self.filename):
            os.remove(self.filename)

    def test_get_channel_dict(self):
        # default
        d = get_channel_dict(is_pe=False, is_strand=False, is_allele=False)
        for v in d.values():
            self.assertEqual(v, (0,))
        # paired end
        d = get_channel_dict(is_pe=True, is_strand=False, is_allele=False)
        self.assertEqual(d[(0,NO_STRAND,"A")], (0,))
        self.assertEqual(d[(0,POS_STRAND,"G")], (0,))
        self.assertEqual(d[(1,NEG_STRAND,"C")], (1,))
        self.assertEqual(d[(1,POS_STRAND,"T")], (1,))
        self.assertEqual(d[(None,NEG_STRAND,"N")], (0,1))
        # strand
        d = get_channel_dict(is_pe=False, is_strand=True, is_allele=False)
        self.assertEqual(d[(0,NO_STRAND,"A")], (0,1))
        self.assertEqual(d[(1,POS_STRAND,"G")], (0,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (1,))
        # allele
        d = get_channel_dict(is_pe=False, is_strand=False, is_allele=True)
        self.assertEqual(d[(0,NO_STRAND,"A")], (0,))
        self.assertEqual(d[(1,POS_STRAND,"G")], (1,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (2,))
        self.assertEqual(d[(0,NEG_STRAND,"T")], (3,))
        self.assertEqual(d[(0,NO_STRAND,"N")], (0,1,2,3,))
        # pe/strand
        d = get_channel_dict(is_pe=True, is_strand=True, is_allele=False)
        self.assertEqual(d[(None,NO_STRAND,"A")], (0,1,2,3))
        self.assertEqual(d[(1,POS_STRAND,"G")], (1,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (2,))
        self.assertEqual(d[(1,NEG_STRAND,"T")], (3,))
        self.assertEqual(d[(None,NEG_STRAND,"N")], (2,3))
        # pe/allele
        d = get_channel_dict(is_pe=True, is_strand=False, is_allele=True)
        self.assertEqual(d[(0,NO_STRAND,"A")], (0,))
        self.assertEqual(d[(0,POS_STRAND,"G")], (2,))
        self.assertEqual(d[(1,NEG_STRAND,"C")], (5,))
        self.assertEqual(d[(1,POS_STRAND,"T")], (7,))
        self.assertEqual(d[(None,NEG_STRAND,"N")], (0,1,2,3,4,5,6,7))
        # strand/allele
        d = get_channel_dict(is_pe=False, is_strand=True, is_allele=True)
        self.assertEqual(d[(None,NO_STRAND,"A")], (0,1))
        self.assertEqual(d[(1,POS_STRAND,"G")], (2,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (5,))
        self.assertEqual(d[(1,NEG_STRAND,"T")], (7,))
        self.assertEqual(d[(None,NEG_STRAND,"N")], (1,3,5,7))
        # pe/strand/allele
        d = get_channel_dict(is_pe=True, is_strand=True, is_allele=True)
        self.assertEqual(d[(None,NO_STRAND,"A")], (0,1,2,3))
        self.assertEqual(d[(1,POS_STRAND,"G")], (5,))
        self.assertEqual(d[(0,NEG_STRAND,"C")], (10,))
        self.assertEqual(d[(1,NO_STRAND,"T")], (13,15))
        self.assertEqual(d[(None,NO_STRAND,"N")], tuple(range(0,16)))
        self.assertEqual(d[(None,NEG_STRAND,"N")], (2,3,6,7,10,11,14,15))
        self.assertEqual(d[(1,POS_STRAND,"N")], (1,5,9,13))
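        # taken together, these assertions show that get_channel_dict maps
        # (read, strand, allele) keys to tuples of channel indices, and that
        # None, NO_STRAND, and "N" act as wildcards selecting all matching channels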
    
    def test_fromintervals(self):
        dtype = "i"
        intervals1, correct1 = random_intervals(1000, self.length, 
                                                self.isize_max, dtype)
        # try one channel array
        t = self.tf.create_track("a", VectorTrack, dtype, 
                                 pe=False, strand=False, allele=False)
        t.fromintervals(iter(intervals1))
        self.assertTrue(np.all(t["gene1"][:,0] == correct1))
        # try three channel array
        intervals2, correct2 = random_intervals(1000, self.length, 
                                                self.isize_max, dtype)
        intervals3, correct3 = random_intervals(1000, self.length, 
                                                self.isize_max, dtype)
        t = self.tf.create_track("b", VectorTrack, dtype)
        t.fromintervals(iter(intervals1))
        self.assertTrue(np.all(t["gene1"][:,0] == correct1))
        self.assertFalse(np.all(t["gene1"][:,0] == correct2))
        self.assertFalse(np.all(t["gene1"][:,0] == correct3))
        #t.fromintervals(iter(intervals2), channel=1)
        #t.fromintervals(iter(intervals3), channel=2)
        #self.assertTrue(np.all(t["gene1"][:,1] == correct2))
        #self.assertFalse(np.all(t["gene1"][:,1] == correct3))
        #self.assertTrue(np.all(t["gene1"][:,2] == correct3))
        #self.assertFalse(np.all(t["gene1"][:,2] == correct1))

    def test_stranded_intervals(self):
        """testing allocating coverage to both strands"""
        dtype = "i4"
        intervals1, correct1 = random_stranded_intervals(100, self.length, self.isize_max, dtype)
        total_cov = correct1.sum()
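        # correct1 holds one column per strand (column 0 for POS_STRAND,
        # column 1 for NEG_STRAND), which the per-strand checks below rely on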
        t = self.tf.create_track("a", VectorTrack, strand=True)
        # test loading from intervals
        t.fromintervals(iter(intervals1))
        self.assertTrue(np.all(t["gene1"] == correct1))
        # test count function
        intervals2, correct2 = random_intervals(1000, self.length, self.isize_max, dtype)
        for ival in intervals2:
            ref = ival.ref
            start = ival.start
            end = ival.end
            strand = ival.strand
            val = ival.value 
            # check plus strand
            # count
            mycount = t.count((ref, start, end, POS_STRAND, val))
            correctcount = correct1[start:end,0].sum()
            self.assertAlmostEqual(mycount, correctcount)
            # coverage
            mycov = t.coverage((ref, start, end, POS_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end,0] / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            # density
            mydens = t.density((ref, start, end, POS_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))            
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))            
            # check minus strand
            # count
            mycount = t.count((ref, start, end, NEG_STRAND, val))
            correctcount = correct1[start:end,1].sum()
            self.assertAlmostEqual(mycount, correctcount)
            # coverage
            mycov = t.coverage((ref, start, end, NEG_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end,1] / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            # density
            mydens = t.density((ref, start, end, NEG_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))            
            # check both strands
            mycount = t.count((ref, start, end, NO_STRAND, val))
            correctcount = correct1[start:end].sum()
            self.assertAlmostEqual(mycount, correctcount)
            # cov
            mycov = t.coverage((ref, start, end, NO_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end].sum(axis=1) / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            # density
            mydens = t.density((ref, start, end, NO_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))            

    def test_stranded_allele_intervals(self):
        """testing coverage with allele frequencies"""
        dtype = "f"
        channel_dict = get_channel_dict(False, True, True)
        pos_strand_channels = channel_dict[(None,POS_STRAND,None)]
        neg_strand_channels = channel_dict[(None,NEG_STRAND,None)]
        intervals1, correct1 = \
            random_stranded_allele_intervals(100, self.length, 
                                             self.isize_max, dtype)
        total_cov = correct1.sum()
        t = self.tf.create_track("a", VectorTrack, strand=True, allele=True)
        # test loading from intervals
        t.fromintervals(iter(intervals1))
        self.assertTrue(np.all(t["gene1"] == correct1))
        # test count function
        intervals2, correct2 = random_intervals(10, self.length, self.isize_max, dtype)
        for ival in intervals2:
            ref = ival.ref
            start = ival.start
            end = ival.end
            strand = ival.strand
            val = ival.value
            # check plus strand
            mycount = t.count((ref, start, end, POS_STRAND, val))
            correctcount = correct1[start:end,pos_strand_channels].sum()
            self.assertAlmostEqual(mycount, correctcount)
            mycov = t.coverage((ref, start, end, POS_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end,pos_strand_channels].sum(axis=1) / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            mydens = t.density((ref, start, end, POS_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))
            # check minus strand
            mycount = t.count((ref, start, end, NEG_STRAND, val))
            correctcount = correct1[start:end,neg_strand_channels].sum()
            self.assertAlmostEqual(mycount, correctcount)
            mycov = t.coverage((ref, start, end, NEG_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end,neg_strand_channels].sum(axis=1) / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            mydens = t.density((ref, start, end, NEG_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))            
            # check both strands
            mycount = t.count((ref, start, end, NO_STRAND, val))
            correctcount = correct1[start:end].sum()
            self.assertAlmostEqual(mycount, correctcount)
            mycov = t.coverage((ref, start, end, NO_STRAND, val), multiplier=1.0)
            correctcov = correct1[start:end].sum(axis=1) / float(total_cov)
            self.assertTrue(np.allclose(mycov, correctcov, atol=1e-4))
            mydens = t.density((ref, start, end, NO_STRAND, val), multiplier=1.0)
            correctdens = correctcount / float(total_cov * (end - start))
            self.assertTrue(np.allclose(mydens, correctdens, atol=1e-4))
Example #8
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--stranded", dest="stranded", action="store_true", default=False)
    parser.add_argument("--ambiguous", dest="ambiguous", action="store_true", default=False)
    parser.add_argument("--aliases", dest="alias_file", default=None)
    parser.add_argument("bed")
    parser.add_argument("track_files", nargs="+")
    options = parser.parse_args()
    
    alias_dict = {}
    alias_header = []
    if options.alias_file is not None:
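        # alias file: tab-delimited; an optional header line starts with "#",
        # and the first column is the gene name used as the lookup key below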
        for line in open(options.alias_file):
            if line.startswith("#"):
                alias_header = line.strip()[1:].split('\t')
                continue    
            fields = line.strip().split('\t')
            alias_dict[fields[0]] = fields[1:]
    
    # column order matches the per-gene fields assembled below
    header_fields = ["gene_name"] + alias_header + ["gene_interval", "gene_length"]
    tracks = []
    for track_path in options.track_files:
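        # each track argument has the form "name@file_path:h5_path",
        # matching the two splits below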
        name, path = track_path.split("@")
        file_path, h5_path = path.split(":")
        tf = TrackFactory(file_path, "r")
        t = tf.get_track(h5_path)
        tracks.append((name, tf, t, set(t.get_rnames())))
        if options.stranded:
            header_fields.append("%s_sense" % name)
            header_fields.append("%s_antisense" % name)
        else:
            header_fields.append(name)
    # output header
    print '\t'.join(map(str, header_fields))

    # read genes
    if options.ambiguous:
        genes = list(BedGene.parse(open(options.bed)))
    else:
        genes = filter_strand_conflicts(options.bed)
    # get counts
    for g in genes:
        alias_fields = alias_dict.get(g.name, ["None"] * len(alias_header))
        fields = ([g.name] + alias_fields +
                  ["%s[%s]:%d-%d" % (g.chrom, g.strand, g.tx_start, g.tx_end),
                   sum((end-start) for start,end in g.exons)])
        sense_strand = NEG_STRAND if g.strand == "+" else POS_STRAND
        antisense_strand = int(not sense_strand)
        rname_found = False
        for name, tf, t, rnames in tracks:
            if g.chrom not in rnames:
                continue
            rname_found = True        
            if options.stranded:
                sense_count = 0
                antisense_count = 0
                for start, end in g.exons:
                    sense_count += t.count((g.chrom, start, end, sense_strand))
                    antisense_count += t.count((g.chrom, start, end, antisense_strand))
                fields.append(sense_count)
                fields.append(antisense_count)
            else:
                count = 0
                for start, end in g.exons:
                    count += t.count((g.chrom, start, end))
                fields.append(count)
        if rname_found:
            print '\t'.join(map(str, fields))

    for name,tf,t,rnames in tracks:
        tf.close()
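
A hedged sketch of how this script might be invoked, inferred from the argparse setup and the "name@file_path:h5_path" parsing in main(); the script name, file names, and track names are hypothetical.

# python count_tracks.py --stranded --aliases aliases.txt genes.bed \
#     sample1@sample1.h5:rnaseq sample2@sample2.h5:rnaseq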