Пример #1
0
def test_variants(db, variant_track):
    """Check node cuts and the best variant of the first connected model.

    Every transcript read from the BED file at ``variant_track`` is added to
    ``db`` under a ``t1|`` name prefix before the collection is built.
    """
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    # Use a context manager so the BED handle is closed once loading is done.
    with open(variant_track) as bed:
        for tname, source, exons in read_bed_transcripts(bed):
            db.add_transcript("{0}{1}{2}".format("t1", "|", tname), source, exons)
    c = DbCollection(db)

    # First model of the first connected cluster.
    best_model = list(c.get_connected_models())[0][0]
    cuts = [str(e) for e in c.get_node_cuts(best_model)]
    assert ["chr1:800+900", "chr1:1400+1500"] == cuts

    weights = [{"weight": 1, "type": "length", "name": "length"}]
    best_variant = c.get_best_variant(best_model, weights)
    exon_names = [str(e) for e in best_variant]
    expected = [
        "chr1:100+200",
        "chr1:400+700",
        "chr1:800+900",
        "chr1:1000+1300",
        "chr1:1400+1500",
        "chr1:1600+1900",
        "chr1:2000+2100",
    ]
    assert expected == exon_names
Пример #2
0
def db_5t(db, two_transcripts, three_transcripts):
    """Return a ``DbCollection`` over ``db`` holding both transcript sets.

    The transcripts in ``two_transcripts`` are added first, followed by those
    in ``three_transcripts``; each item is a ``(name, source, exons)`` triple.
    """
    from pita.dbcollection import DbCollection

    for transcript_set in (two_transcripts, three_transcripts):
        for tname, src, exon_list in transcript_set:
            db.add_transcript(tname, src, exon_list)
    return DbCollection(db, [])
Пример #3
0
def test_long_exon_filter(db, t1, t2):
    """Check the sizes of the best variants after ``filter_long`` on chr1.

    Transcripts from the BED files ``t1`` and ``t2`` are loaded into ``db``
    with ``t1|`` and ``t2|`` name prefixes respectively.
    """
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    for prefix, path in (("t1", t1), ("t2", t2)):
        # Context manager closes each BED handle instead of leaking it.
        with open(path) as bed:
            for tname, source, exons in read_bed_transcripts(bed):
                db.add_transcript(
                    "{0}{1}{2}".format(prefix, "|", tname), source, exons)

    c = DbCollection(db, [], chrom="chr1")
    c.filter_long(l=500, evidence=1)

    models = list(c.get_best_variants([]))

    assert [3, 5] == sorted([len(m) for m in models])
Пример #4
0
def test_long_exon_filter(db, t1, t2):
    """Check the sizes of all connected models after ``filter_long`` on chr1.

    Transcripts from the BED files ``t1`` and ``t2`` are loaded into ``db``
    with ``t1|`` and ``t2|`` name prefixes respectively.
    """
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    for prefix, path in (("t1", t1), ("t2", t2)):
        # Context manager closes each BED handle instead of leaking it.
        with open(path) as bed:
            for tname, source, exons in read_bed_transcripts(bed):
                db.add_transcript(
                    "{0}{1}{2}".format(prefix, "|", tname), source, exons)

    c = DbCollection(db, chrom="chr1")
    c.filter_long(evidence=1)

    # Flatten the clusters into a single list of models.
    models = [m for cluster in c.get_connected_models() for m in cluster]

    assert [1, 3, 5] == sorted([len(m) for m in models])
Пример #5
0
def collection(db):
    """Return a ``DbCollection`` over ``db`` loaded with the scaffold_54 genes.

    Transcripts are read from ``tests/data/scaffold_54_genes.bed`` and stored
    under names of the form ``test:::<transcript name>``.
    """
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    bed = "tests/data/scaffold_54_genes.bed"
    # Context manager closes the BED handle instead of leaking it.
    with open(bed) as handle:
        for tname, source, exons in read_bed_transcripts(handle, "test", 0):
            db.add_transcript("{0}{1}{2}".format("test", ":::", tname), source,
                              exons)

    return DbCollection(db, [])
Пример #6
0
def get_chrom_models(conn,
                     chrom,
                     weight,
                     repeats=None,
                     prune=None,
                     keep=None,
                     filter_ev=None,
                     experimental=None):
    """Call the best transcript models for one chromosome.

    Args:
        conn: Passed to ``AnnotationDb(conn=...)``.
        chrom: Chromosome name, passed to the filters and to ``DbCollection``.
        weight: Passed to ``DbCollection`` and to ``get_best_variants``.
        repeats: Optional iterable; each item is passed to
            ``db.filter_repeats(chrom, item)``.
        prune: Optional pruning configuration. When truthy, overlapping models
            are resolved using ``prune["overlap"]["fraction"]`` and
            ``prune["overlap"]["weights"]``.
        keep: Accepted for interface compatibility; not used by this function.
        filter_ev: Optional iterable; each item is passed to
            ``db.filter_evidence(chrom, item, experimental)``.
        experimental: Passed through to ``db.filter_evidence`` (``[]`` when
            omitted).

    Returns:
        A list of ``[genename, [flat_exon, ...]]`` pairs for every model that
        was not discarded, or ``[]`` if an exception was raised (the exception
        is logged, not propagated).
    """
    if filter_ev is None:
        filter_ev = []
    if experimental is None:
        experimental = []

    logger = logging.getLogger("pita")
    logger.debug(str(weight))
    try:
        db = AnnotationDb(conn=conn)

        # Filter repeats
        if repeats:
            for x in repeats:
                db.filter_repeats(chrom, x)

        for ev in filter_ev:
            db.filter_evidence(chrom, ev, experimental)

        mc = DbCollection(db, weight, prune=prune, chrom=chrom)

        models = {}  # genename -> [genename, model]
        exons = {}   # str(exon) -> [exon, genename]
        logger.info("Calling transcripts for %s", chrom)
        for model in mc.get_best_variants(weight):
            genename = "{0}:{1}-{2}_".format(
                model[0].chrom,
                model[0].start,
                model[-1].end,
            )

            logger.info("Best model: %s with %s exons", genename, len(model))
            models[genename] = [genename, model]

            for exon in model:
                exons[str(exon)] = [exon, genename]

        discard = set()
        if prune:
            logger.debug("Prune: %s", prune)
            overlap = get_overlapping_models([x[0] for x in exons.values()])
            if len(overlap) > 1:
                logger.info("%s overlapping exons", len(overlap))

            for e1, e2 in overlap:
                gene1 = exons[str(e1)][1]
                gene2 = exons[str(e2)][1]
                # Once either gene of the pair is discarded, the pair is settled.
                if gene1 in discard or gene2 in discard:
                    continue

                m1 = models[gene1][1]
                m2 = models[gene2][1]

                # Order the two models by start coordinate. ``key=`` replaces
                # the Python 2-only ``cmp=`` argument and is equally stable.
                loc1, loc2 = sorted([m1, m2], key=lambda m: m[0].start)
                l1 = float(loc1[-1].end - loc1[0].start)
                l2 = float(loc2[-1].end - loc2[0].start)
                if loc2[-1].end > loc1[-1].end:
                    overlap_len = float(loc1[-1].end - loc2[0].start)
                else:
                    # loc2 ends within loc1, so all of loc2 overlaps.
                    overlap_len = l2

                prune_overlap = prune["overlap"]["fraction"]
                if (overlap_len / l1 < prune_overlap
                        and overlap_len / l2 < prune_overlap):
                    logger.debug(
                        "Not pruning because fraction of overlap is too small!"
                    )
                    continue

                w1 = 0.0
                w2 = 0.0
                # NOTE(review): ``d`` is only logged; ``get_weight`` is called
                # without it, so every iteration adds the same contribution.
                # Confirm whether ``d`` should be passed to ``get_weight``.
                for d in prune["overlap"]["weights"]:
                    logger.debug("Pruning overlap: %s", d)
                    tmp_w1 = -mc.get_weight(m1)
                    tmp_w2 = -mc.get_weight(m2)
                    m = max((tmp_w1, tmp_w2))
                    if m > 0:
                        w1 += tmp_w1 / m
                        w2 += tmp_w2 / m

                # Ties keep gene1.
                if w1 >= w2:
                    logger.info("Discarding %s", gene2)
                    discard.add(gene2)
                else:
                    logger.info("Discarding %s", gene1)
                    discard.add(gene1)

        logger.info("Done calling transcripts for %s", chrom)
        kept = [v for g, v in models.items() if g not in discard]
        return [[name, [e.to_flat_exon() for e in kept_model]]
                for name, kept_model in kept]

    except Exception:
        # Best-effort per chromosome: log the failure and fall through to the
        # empty result. KeyboardInterrupt/SystemExit are allowed to propagate.
        logger.exception("Error on %s", chrom)

    return []
Пример #7
0
def test_db_collection(db):
    """Iterate over the best variants of a collection built on ``db``.

    The test has no assertions; it passes when iteration completes without
    raising. Each model is printed for inspection.
    """
    from pita.dbcollection import DbCollection
    c = DbCollection(db, [])

    for model in c.get_best_variants([]):
        # Call form works on Python 2 and 3; the statement form is Python 2 only.
        print(model)
Пример #8
0
def test_db_collection(db):
    """Iterate over the best variants of a collection built on ``db``.

    The test has no assertions; it passes when iteration completes without
    raising. Each model is printed for inspection.
    """
    from pita.dbcollection import DbCollection
    c = DbCollection(db, [])

    for model in c.get_best_variants([]):
        # Call form works on Python 2 and 3; the statement form is Python 2 only.
        print(model)
Пример #9
0
def get_chrom_models(conn, chrom, weight, repeats=None, prune=None, keep=None, filter_ev=None, experimental=None):
    """Call the best transcript models for one chromosome.

    Args:
        conn: Passed to ``AnnotationDb(conn=...)``.
        chrom: Chromosome name, passed to the filters and to ``DbCollection``.
        weight: Passed to ``DbCollection`` and to ``get_best_variants``.
        repeats: Optional iterable; each item is passed to
            ``db.filter_repeats(chrom, item)``.
        prune: Optional pruning configuration. When truthy, overlapping models
            are resolved using ``prune["overlap"]["fraction"]`` and
            ``prune["overlap"]["weights"]``.
        keep: Accepted for interface compatibility; not used by this function.
        filter_ev: Optional iterable; each item is passed to
            ``db.filter_evidence(chrom, item, experimental)``.
        experimental: Passed through to ``db.filter_evidence`` (``[]`` when
            omitted).

    Returns:
        A list of ``[genename, [flat_exon, ...]]`` pairs for every model that
        was not discarded, or ``[]`` if an exception was raised (the exception
        is logged, not propagated).
    """
    if filter_ev is None:
        filter_ev = []
    if experimental is None:
        experimental = []

    logger = logging.getLogger("pita")
    logger.debug(str(weight))
    try:
        db = AnnotationDb(conn=conn)

        # Filter repeats
        if repeats:
            for x in repeats:
                db.filter_repeats(chrom, x)

        for ev in filter_ev:
            db.filter_evidence(chrom, ev, experimental)

        mc = DbCollection(db, weight, prune=prune, chrom=chrom)

        models = {}  # genename -> [genename, model]
        exons = {}   # str(exon) -> [exon, genename]
        logger.info("Calling transcripts for %s", chrom)
        for model in mc.get_best_variants(weight):
            genename = "{0}:{1}-{2}_".format(
                model[0].chrom,
                model[0].start,
                model[-1].end,
            )

            logger.info("Best model: %s with %s exons", genename, len(model))
            models[genename] = [genename, model]

            for exon in model:
                exons[str(exon)] = [exon, genename]

        discard = set()
        if prune:
            logger.debug("Prune: %s", prune)
            overlap = get_overlapping_models([x[0] for x in exons.values()])
            if len(overlap) > 1:
                logger.info("%s overlapping exons", len(overlap))

            for e1, e2 in overlap:
                gene1 = exons[str(e1)][1]
                gene2 = exons[str(e2)][1]
                # Once either gene of the pair is discarded, the pair is settled.
                if gene1 in discard or gene2 in discard:
                    continue

                m1 = models[gene1][1]
                m2 = models[gene2][1]

                # Order the two models by start coordinate. ``key=`` replaces
                # the Python 2-only ``cmp=`` argument and is equally stable.
                loc1, loc2 = sorted([m1, m2], key=lambda m: m[0].start)
                l1 = float(loc1[-1].end - loc1[0].start)
                l2 = float(loc2[-1].end - loc2[0].start)
                if loc2[-1].end > loc1[-1].end:
                    overlap_len = float(loc1[-1].end - loc2[0].start)
                else:
                    # loc2 ends within loc1, so all of loc2 overlaps.
                    overlap_len = l2

                prune_overlap = prune["overlap"]["fraction"]
                if (overlap_len / l1 < prune_overlap
                        and overlap_len / l2 < prune_overlap):
                    logger.debug("Not pruning because fraction of overlap is too small!")
                    continue

                w1 = 0.0
                w2 = 0.0
                # NOTE(review): ``d`` is only logged; ``get_weight`` is called
                # without it, so every iteration adds the same contribution.
                # Confirm whether ``d`` should be passed to ``get_weight``.
                for d in prune["overlap"]["weights"]:
                    logger.debug("Pruning overlap: %s", d)
                    tmp_w1 = -mc.get_weight(m1)
                    tmp_w2 = -mc.get_weight(m2)
                    m = max((tmp_w1, tmp_w2))
                    if m > 0:
                        w1 += tmp_w1 / m
                        w2 += tmp_w2 / m

                # Ties keep gene1.
                if w1 >= w2:
                    logger.info("Discarding %s", gene2)
                    discard.add(gene2)
                else:
                    logger.info("Discarding %s", gene1)
                    discard.add(gene1)

        logger.info("Done calling transcripts for %s", chrom)
        kept = [v for g, v in models.items() if g not in discard]
        return [[name, [e.to_flat_exon() for e in kept_model]] for name, kept_model in kept]

    except Exception:
        # Best-effort per chromosome: log the failure and fall through to the
        # empty result. KeyboardInterrupt/SystemExit are allowed to propagate.
        logger.exception("Error on %s", chrom)

    return []
Пример #10
0
def c(db):
    """Return a ``DbCollection`` built from ``db`` and an empty list."""
    from pita.dbcollection import DbCollection

    return DbCollection(db, [])
Пример #11
0
def test_db_collection(db):
    """Iterate over the connected models of a collection built on ``db``.

    The test has no assertions; it passes when iteration completes without
    raising. Each yielded item is printed for inspection.
    """
    from pita.dbcollection import DbCollection
    c = DbCollection(db)

    for model in c.get_connected_models():
        # Call form works on Python 2 and 3; the statement form is Python 2 only.
        print(model)
Пример #12
0
def test_get_weight(db, bam_file, splice_file):
    """Check ``get_weight`` on the first connected model for several types.

    Read statistics (``H3K4me3``) and splice statistics (``RNAseq``) for
    ``scaffold_1`` are loaded into ``db`` before the collection is built.
    """
    db.get_read_statistics("scaffold_1", bam_file, "H3K4me3")
    db.get_splice_statistics("scaffold_1", splice_file, "RNAseq")
    from pita.dbcollection import DbCollection

    collection = DbCollection(db)
    first_model = list(collection.get_connected_models())[0][0]

    # Each row: (second argument, third argument, expected value, tolerance).
    # A tolerance of None means the value must match exactly.
    expectations = [
        ("H3K4me3", "all", 365, None),
        (None, "length", 60100, None),
        ("H3K4me3", "rpkm", 1163.9, 0.1),
        ("H3K4me3", "weighted", 0.01821963394342762, 0.0001),
        ("H3K4me3", "total_rpkm", 4292.832, 0.1),
        ("H3K4me3", "mean_exon", 1430.944, 0.1),
        ("RNAseq", "splice", 24, None),
        ("H3K4me3", "first", 64, None),
        (None, "evidence", 1, None),
    ]
    for name, kind, expected, tolerance in expectations:
        observed = collection.get_weight(first_model, name, kind)
        if tolerance is None:
            assert expected == observed
        else:
            assert abs(expected - observed) < tolerance