Пример #1
0
def make_pep_colors(prop2color=prop2color):
    """Build the color table used for amino-acid characters.

    Every character of the residue alphabet below is colored with
    prop2color(prop, tint), where prop is the character's entry in
    seqlib.AA_PROPERTY.  The tint starts at 0 for the first residue of a
    property and increases with each further residue of that property, so
    residues sharing a property get distinct shades.

    Returns a util.Dict whose default value is color(.5, .5, .5).
    """
    residues = 'ARNDCEQGHILKMFPSTWYVU*'

    # per-property totals for the alphabet (as computed by util.hist_dict)
    group_sizes = util.hist_dict(util.mget(seqlib.AA_PROPERTY, residues))

    # how many residues of each property have been colored so far
    seen = util.Dict(default=0)

    table = util.Dict(default=color(.5, .5, .5))
    for residue in residues:
        group = seqlib.AA_PROPERTY[residue]
        fraction = seen[group] / float(group_sizes[group])
        table[residue] = prop2color(group, fraction * .5)
        seen[group] += 1

    return table
Пример #2
0
    def lookup(self, *keys, **options):
        """Returns a lookup dict based on a column 'key'
           or multiple keys

           With several keys the result is nested one level per key, i.e.
           result[key1][key2]...[keyN] is the matching row.

           extra options:
           default=None     # default value of the returned util.Dict
           uselast=False    # allow multiple rows, just use last

           Raises Exception("duplicate key ...") when two rows share the
           same keys and uselast is False.

           The returned dict has its 'insert' attribute set to False.
        """

        default = options.get("default", None)
        uselast = options.get("uselast", False)
        lookup = util.Dict(dim=len(keys), default=default)

        for row in self:
            keys2 = util.mget(row, keys)

            # walk down one nesting level per leading key; each step must
            # continue from the level reached so far, not from the top
            ptr = lookup
            for key in keys2[:-1]:
                ptr = ptr[key]

            if not uselast and keys2[-1] in ptr:
                raise Exception("duplicate key '%s'" % str(keys2[-1]))
            ptr[keys2[-1]] = row

        lookup.insert = False
        return lookup
Пример #3
0
def findNeighbors(regiondb, genes):
    """Cluster 'genes' into streaks of adjacent genes on the same strand

       Two genes are linked when one directly precedes or follows the other
       in its chromosome (regiondb.species[species][seqname]), both are
       members of 'genes', and their strands are equal.

       returns the connected components of that linkage (as computed by
       graph.connectedComponents); each component is sorted by start and
       reversed when its first member has strand -1
    """

    wanted = set(genes)
    adjacency = util.Dict(dim=2)

    for gene in genes:
        chrom = regiondb.species[gene.species][gene.seqname]
        pos = regionlib.findRegion(chrom, gene)

        # collect the immediate left and right neighbors, when they exist
        candidates = []
        if pos > 0:
            candidates.append(chrom[pos - 1])
        if pos < len(chrom) - 1:
            candidates.append(chrom[pos + 1])

        # keep only neighbors of interest that lie on the same strand
        for other in candidates:
            if other in wanted and other.strand == gene.strand:
                adjacency[gene][other] = 1
                adjacency[other][gene] = 1

    def linked(node):
        return adjacency[node].keys()

    clusters = graph.connectedComponents(genes, linked)

    # sort each cluster in order of appearance along its strand
    for cluster in clusters:
        descending = (cluster[0].strand == -1)
        cluster.sort(key=lambda region: region.start, reverse=descending)

    return clusters
Пример #4
0
def makeBlastFileLookup(blastfiles):
    """Index blast files by the pair of genome names found in their filenames.

    Each filename is matched against '<optional dir/><genome1>_<genome2>.'
    and recorded in both directions:

        lookup[genome1][genome2] == (filename, True)
        lookup[genome2][genome1] == (filename, False)

    The boolean tells whether the lookup order matches the order in the
    filename.  Returns the two-level util.Dict.
    """
    # raw string: '\.' and '\/' are regex escapes, not Python string escapes
    pattern = r"(|.*/)(?P<genome1>[^/_]+)_(?P<genome2>[^/\.]+)\.[\/]*"

    lookup = util.Dict(dim=2)

    for f in blastfiles:
        m = util.match(pattern, f)
        lookup[m["genome1"]][m["genome2"]] = (f, True)
        lookup[m["genome2"]][m["genome1"]] = (f, False)

    return lookup
Пример #5
0
def enrichItems(in_items, out_items, M=None, N=None, useq=True, extra=False):
    """Calculates enrichment of items within an in-set versus an out-set.

       M defaults to len(in_items) and N to len(in_items) + len(out_items).
       useq=True adds the columns 'qval' and 'qval_under'.
       extra=True adds 'in_size', 'out_size', 'item_ratio', 'size_ratio'
       and 'fold'.

       Returns a table with one row per distinct item, sorted by 'pval'.
    """
    # DEPRECATED
    # TODO: remove this function

    # tally[item] == [occurrences in in_items, occurrences in out_items]
    tally = util.Dict(default=[0, 0])
    for side, items in enumerate((in_items, out_items)):
        for item in items:
            tally[item][side] += 1

    # fall back to sizes derived from the inputs
    if N is None:
        N = len(in_items) + len(out_items)
    if M is None:
        M = len(in_items)

    tab = tablelib.Table(
        headers=["item", "in_count", "out_count", "pval", "pval_under"])

    # one hypergeometric test pair per item
    for item, (n_in, n_out) in tally.iteritems():
        total = n_in + n_out
        tab.add(item=item,
                in_count=n_in,
                out_count=n_out,
                pval=rhyper(n_in, total, M, N),
                pval_under=rhyper(n_in, total, M, N, 1))

    # optional q-value columns
    if useq:
        q_over = qvalues(tab.cget("pval"))
        q_under = qvalues(tab.cget("pval_under"))

        tab.add_col("qval", data=q_over)
        tab.add_col("qval_under", data=q_under)

    # optional descriptive columns
    if extra:
        tab.add_col("in_size", data=[M] * len(tab))
        tab.add_col("out_size", data=[N - M] * len(tab))

        ratios = []
        for row in tab:
            hits_in = row["in_count"]
            ratios.append(hits_in / float(hits_in + row["out_count"]))
        tab.add_col("item_ratio", data=ratios)

        tab.add_col("size_ratio", data=[M / float(N) for _row in tab])

        folds = [row["item_ratio"] / row["size_ratio"] for row in tab]
        tab.add_col("fold", data=folds)

    tab.sort(col='pval')
    return tab
Пример #6
0
def partLookup(parts1, parts2):
    """For each part in parts1, which parts in parts2 share the same items

    Returns one sorted list per part of parts1 holding the distinct values
    that item2part(parts2) assigns to its items; items unknown to parts2
    contribute -1.
    """

    part_of = util.Dict(default=-1)
    part_of.update(item2part(parts2))

    return [sorted(set(part_of[item] for item in part1))
            for part1 in parts1]
Пример #7
0
def confusionMatrix(parts1, parts2):
    """Returns a confusion matrix of two different partitions of the same
       items

       The result is a pair (matrix, leftovers):
         matrix[p1][p2] counts the items that item2part places in p1 for
         parts1 and in p2 for parts2;
         leftovers lists the items present in only one of the partitions.
    """

    part_of1 = item2part(parts1)
    part_of2 = item2part(parts2)

    universe1 = set(util.flatten(parts1, 1))
    universe2 = set(util.flatten(parts2, 1))

    # only items known to both partitions can be counted
    matrix = util.Dict(dim=2, default=0)
    for item in universe1 & universe2:
        row = part_of1[item]
        col = part_of2[item]
        matrix[row][col] += 1

    leftovers = universe1.symmetric_difference(universe2)
    return matrix, list(leftovers)
Пример #8
0
    def __init__(self,
                 regions1,
                 regions2,
                 hits,
                 hitnames=True,
                 style="line",
                 color=(0, 0, 0),
                 fill_color=None,
                 trace_color=(0, 1, 1, .2),
                 selfhits=True,
                 name=None):
        """Store the display settings and resolve the hits.

        When hitnames is true, each hit is treated as a sequence of region
        IDs and is replaced by the list of regions (from regions1 and
        regions2) whose data["ID"] matches; hits that resolve to no region
        are dropped.  When hitnames is false, hits is stored unchanged.
        """
        self.name = name
        self.regions1 = regions1
        self.regions2 = regions2
        self.style = style
        self.color = color
        self.fill_color = fill_color
        self.trace_color = trace_color
        self.selfhits = selfhits

        if not hitnames:
            self.hits = hits
            return

        self.hits = []

        # index every region of both sets by its ID
        by_id = util.Dict(default=[])
        for region in itertools.chain(self.regions1, self.regions2):
            by_id[region.data["ID"]].append(region)

        # translate each hit from IDs to regions, skipping unknown IDs
        for hit in hits:
            resolved = []
            for region_id in hit:
                if region_id in by_id:
                    resolved.extend(by_id[region_id])
            if resolved:
                self.hits.append(resolved)
Пример #9
0
def bestBidir(hits, scorefunc=bitscore):
    """Find best bidirectional hits.

    For every gene (query or subject of a hit) the highest-scoring hit
    with a score above 0 is recorded.  A hit is returned when each of its
    two genes has the other one as its recorded best partner; every such
    pair is reported once.
    """

    # top[gene] == [best partner, best score, hit that gave that score]
    top = util.Dict(default=[None, 0, None])

    for hit in hits:
        q = query(hit)
        s = subject(hit)
        score = scorefunc(hit)

        # a hit can improve the record of either of its two genes
        for gene, partner in ((q, s), (s, q)):
            if score > top[gene][1]:
                top[gene] = [partner, score, hit]

    paired = set()
    result = []
    for gene, (partner, _score, hit) in top.iteritems():
        if top[partner][0] != gene or gene in paired:
            continue
        paired.add(gene)
        paired.add(partner)
        result.append(hit)

    return result
Пример #10
0
def block_bbh_hits(block):
    """Return the bidirectional best hits (BBH) found in a block.

    Hits are read from block.data["hits"]; the first three fields of a hit
    are the two regions and the hit value.  Regions are identified by their
    data["ID"].  Each bidirectional pair contributes one hit (the one
    stored for the smaller ID).
    """

    # top[id] == (best value, partner id, hit); unidirectional best hits
    top = util.Dict(default=[-util.INF, None, None])

    for hit in block.data["hits"]:
        left, right, val = hit[:3]
        left_id = left.data["ID"]
        right_id = right.data["ID"]

        for own, other in ((left_id, right_id), (right_id, left_id)):
            if val > top[own][0]:
                top[own] = (val, other, hit)

    # keep the pairs that point at each other; 'name < partner' reports
    # every pair only once
    return [hit for name, (val, partner, hit) in top.iteritems()
            if top[partner][1] == name and name < partner]
Пример #11
0
    def __init__(self, master_file=None, seq2species=lambda x: x):
        """Create an empty lookup and optionally load a master file.

        master_file -- passed to self.read() when it is not None
        seq2species -- callable stored on the instance; the default
                       returns its argument unchanged
        """
        self.lookup = util.Dict(default=[])
        self.seq2species = seq2species

        # identity test: only the None sentinel means "nothing to read"
        if master_file is not None:
            self.read(master_file)
Пример #12
0

def visualize(mat,
              outfile,
              format="undirected",
              options="-Tjpg",
              param="overlap=\"false\";"):
    if format == "undirected":
        out = os.popen("neato " + options + " -o " + outfile, "w")
    elif format == "directed":
        out = os.popen("dot " + options + " -o " + outfile, "w")
    print out, format
    writeGraphviz(mat, out, format, param)


if __name__ == "__main__":
    # small demo graph: 1 -> 2, 1 -> 3, 2 -> 4, 3 -> 4, 4 -> 5
    mat = util.Dict(dim=2)
    for src, dst in [(1, 2), (1, 3), (2, 4), (3, 4), (4, 5)]:
        mat[src][dst] = 1

    #writeGraphviz(mat, sys.stdout, "directed")
    visualize(mat, "out2.jpg", "directed")


class GraphViz:
    """Empty placeholder class; it defines no attributes or methods."""
Пример #13
0
 def __init__(self, default=' '):
     """Create an empty two-dimensional matrix.

     default -- value stored as self.default and used as the default of
                the underlying util.Dict
     """
     self.default = default
     self.mat = util.Dict(dim=2, default=default)
Пример #14
0
    def layout_frags(self, genome_name, chrom_name, start, end, direction=1):
        """Lay out the reference region and the fragments syntenic to it.

        genome_name, chrom_name -- identify the reference chromosome
        start, end              -- region of the reference chromosome
        direction               -- strand given to the reference fragment

        A reference fragment covering [max(start, 0), min(end, chrom end)]
        is added at y=0.  For every synteny block returned by
        self.filter_blocks a fragment on the other genome is created; the
        genes placed in it are the orthologs (self.orth_lookup) of the
        reference genes in the region.  Fragments that receive no genes
        are not placed.

        Side effects: adds fragments to self.frags, calls
        self.layout_frag_contents on each, and sets self.block_lookup and
        self.refLookup.
        """

        ref_chrom = self.chroms_lookup[(genome_name, chrom_name)]

        # setup genome display order
        order = {}
        for i, genome in enumerate(self.genomes):
            order[genome] = i

        # swap the genome with order 0 and the reference genome
        j = order[self.ref_genome]
        order[self.genomes[0]] = j
        order[self.ref_genome] = 0

        # init reference fragment
        ref_frag = Frag(genome=genome_name,
                        chrom=chrom_name,
                        start=max(start, 0),
                        end=min(end, ref_chrom.end),
                        strand=direction,
                        x=max(start, 0),
                        y=0)
        self.frags.add(ref_frag)
        self.layout_frag_contents(ref_frag)

        # find all synteny blocks in this region
        # sort blocks by appearance in ref_chrom
        blocks = list(self.filter_blocks(self.blocks, ref_chrom, start, end))

        def blocksort(a):
            # a is a (block, flip) pair; with flip == 0 region1 is the
            # reference side (region2 is the "other" side below)
            if a[1] == 0:
                return a[0].region1.start
            return a[0].region2.start

        blocks.sort(key=blocksort)

        # make lookup for genes to block and block to fragment
        block_lookup = {}
        frag_lookup = {}
        for block, flip in blocks:
            if flip == 0:
                other = block.region2
            else:
                other = block.region1

            frag = Frag()
            frag.genome = other.species
            frag.chrom = other.seqname
            frag_lookup[block] = frag

            for gene2 in iter_chrom(
                    self.db.get_regions(frag.genome, frag.chrom), other.start,
                    other.end):
                block_lookup[gene2] = block

        self.block_lookup = block_lookup

        # find all genes that will be drawn
        # walk along ref_chrom and store drawn genes into fragments
        refLookup = {}
        for gene in iter_chrom(self.db.get_regions(genome_name, chrom_name),
                               start, end):
            for name2 in self.orth_lookup.get(gene.data["ID"], []):
                gene2 = self.db.get_region(name2)
                if gene2 in block_lookup:
                    frag_lookup[block_lookup[gene2]].genes.append(gene2)
                    refLookup[gene2] = gene
        self.refLookup = refLookup

        # determine fragment dimensions
        for frag in frag_lookup.itervalues():
            if len(frag.genes) == 0:
                # x is None marks a fragment that cannot be placed
                frag.x = None
                continue
            frag.genes.sort(key=lambda a: a.start)

            # set fragment start and end
            frag.start = frag.genes[0].start
            frag.end = frag.genes[-1].end

            # find fragment direction: walk the fragment's genes in order
            # and vote on whether their reference positions rise or fall
            vote = 0
            last = None

            for gene2 in frag.genes:
                pos = refLookup[gene2].start

                if last is not None and pos != last:
                    if last < pos:
                        vote += 1
                    else:
                        vote -= 1
                last = pos

            # a tie (vote == 0) counts as the opposite direction
            if vote > 0:
                frag.direction = direction
            else:
                frag.direction = -direction

            # find fragment x-coordinate: median difference between each
            # gene's offset in the fragment and its offset in the reference
            diffs = []
            for gene2 in frag.genes:
                if direction == 1:
                    offset1 = refLookup[gene2].start - ref_frag.start
                else:
                    offset1 = ref_frag.end - refLookup[gene2].end

                if frag.direction == 1:
                    offset2 = gene2.start - frag.start
                else:
                    offset2 = frag.end - gene2.end
                diffs.append(offset2 - offset1)
            frag.x = ref_frag.x - stats.median(diffs)

        # place blocks
        fragY = util.Dict(default=-self.genome_sep)
        for block, flip in blocks:
            frag = frag_lookup[block]
            otherGenome = frag.genome

            if frag.x is None:
                # fragment could not be placed
                continue

            frag.y = fragY[otherGenome] - \
                     ((order[otherGenome] - 1) *
                       self.max_genome_sep)

            # re-get all genes between those coordinates
            #frag.genes = list(iter_chrom(self.db.get_regions(frag.genome,
            #                                                 frag.chrom),
            #                             frag.start, frag.end))

            # store and layout frag
            self.frags.add(frag)
            self.layout_frag_contents(frag)

            # stagger fragments; wrap around once the genome's band is full
            fragY[otherGenome] -= self.frag_sep
            if fragY[otherGenome] < -self.max_genome_sep:
                fragY[otherGenome] = -self.genome_sep
Пример #15
0
def mergeBuh(conf, genes, parts1, parts2, blastfiles):
    """Merge by Best Unidirectional Hits

    NOTE: this function is currently disabled; the 'assert False' below
    fires on every call (unless assertions are stripped).

    conf       -- read by key: "bitspersite", "coverage", "signif",
                  "relcutoff"
    genes      -- genes[name]["length"] is used as the gene length
    parts1/2   -- partitions (lists of lists of gene names); genes that
                  appear in a hit but not in a partition are appended to
                  it as singleton parts
    blastfiles -- (filename, order) pairs; when order is true the query is
                  taken as the parts1-side gene, otherwise the subject

    Returns a list of merged parts (lists of genes).
    """

    # don't use this code without double checking it
    assert False

    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # best[(side, index)] == (best score, partner part key)
    best = util.Dict(dim=1, default=(0, None))

    # first pass: find the best-scoring partner of every part
    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # orient the hit so that gene1 belongs to side 1, gene2 to side 2
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            # the larger of the two aligned fractions
            coverage = max(alnlen1 / float(len1), alnlen2 / float(len2))

            # discard a hit that does not pass basic cutoffs
            if blast.bitscore(hit) / float(blast.alignLength(hit)) < \
                   conf["bitspersite"] or \
               coverage < conf["coverage"] or \
               blast.evalue(hit) > conf["signif"]:
                continue

            #if blast.evalue(hit) > conf["signif"]:
            #    continue

            # part keys are (side, index); unknown genes become new
            # singleton parts
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            if score > best[part1][0]:
                best[part1] = (score, part2)
            if score > best[part2][0]:
                best[part2] = (score, part1)
        util.toc()

    util.toc()

    util.tic("determine clusters")
    # NOTE(review): this local dict is named 'sets', so 'sets.UnionFind' on
    # the next lines is looked up on the dict itself rather than on a
    # module of that name -- must be fixed before re-enabling the function.
    sets = {}
    for gene in best:
        sets[gene] = sets.UnionFind([gene])

    # second pass: join parts whose hit scores at least 'relcutoff' times
    # the best score of either part
    for blastfile, order in blastfiles:
        util.tic("read hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            coverage = max(alnlen1 / float(len1), alnlen2 / float(len2))

            # discard a hit that does not pass basic cutoffs
            if blast.bitscore(hit) / float(blast.alignLength(hit)) < \
                   conf["bitspersite"] or \
               coverage < conf["coverage"] or \
               blast.evalue(hit) > conf["signif"]:
                continue

            #if blast.evalue(hit) > conf["signif"]:
            #    continue

            part1 = (0, lookup1[gene1])
            part2 = (1, lookup2[gene2])

            if score >= best[part1][0] * conf["relcutoff"]:
                sets[part1].union(sets[part2])
            if score >= best[part2][0] * conf["relcutoff"]:
                sets[part2].union(sets[part1])
        util.toc()

    # one representative (root) per merged cluster
    sets = util.unique([x.root() for x in sets.values()])

    # expand every cluster of part keys back into genes
    # NOTE(review): the loop variable 'set' shadows the builtin of that name
    parts = []
    joining = (parts1, parts2)
    for set in sets:
        parts.append([])
        for i, row in set.members():
            parts[-1].extend(joining[i][row])
    util.toc()

    return parts
Пример #16
0
def mergeAvg(conf, genes, parts1, parts2, blastfiles, outblastfiles):
    """Merge two partitions by best average hit score between parts.

    conf          -- read by key: "bitspersite", "coveragesmall",
                     "coveragebig", "signif" and, optionally, "accept"
                     (when truthy, hits are kept only if both genes are
                     in it)
    genes         -- genes[name]["length"] is used as the gene length
    parts1/2      -- partitions (lists of lists of gene names); genes that
                     appear in an accepted hit but not in a partition are
                     appended to it as singleton parts
    blastfiles    -- (filename, order) pairs; when order is true the query
                     is taken as the parts1-side gene, otherwise the
                     subject
    outblastfiles -- (filename, order) pairs of outgroup hits; when order
                     is true the query is the ingroup gene

    Every part is joined with the partner part that has the highest
    average hit score, provided that average exceeds the average outgroup
    score of both parts.

    Returns the merged partition as a list of gene lists.
    """
    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # hits[partA][partB] == [sum of scores, number of hits]
    hits = util.Dict(dim=2, default=[0, 0])

    if "accept" in conf:
        accept = conf["accept"]
    else:
        accept = False

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            # orient the hit so that gene1 is on side 1 and gene2 on side 2
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            cov1 = alnlen1 / float(len1)
            cov2 = alnlen2 / float(len2)
            coveragesmall = min(cov1, cov2)
            coveragebig = max(cov1, cov2)

            # discard a hit that does not pass basic cutoffs
            if score / float(blast.alignLength(hit)) < \
                   conf["bitspersite"] or \
               coveragesmall < conf["coveragesmall"] or \
               coveragebig < conf["coveragebig"] or \
               blast.evalue(hit) > conf["signif"]:
                continue

            if accept and \
               (gene1 not in accept or
                gene2 not in accept):
                continue

            # create a key for a partition: (side, index)
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            # both directions share one [sum, count] record
            val = hits[part1][part2]
            val[0] += score
            val[1] += 1
            hits[part2][part1] = val

        util.toc()
    util.toc()

    util.tic("read outgroup hits")
    # outbest[part] == [sum of outgroup scores, number of outgroup hits]
    outbest = util.Dict(default=[0, 0])
    for blastfile, order in outblastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                genein = blast.query(hit)
                geneout = blast.subject(hit)
            else:
                geneout = blast.query(hit)
                genein = blast.subject(hit)
            score = blast.bitscore(hit)

            # create a key for a partition: (side, index)
            # (the ingroup gene of THIS hit decides the side; the original
            # tested a leftover variable from the previous loop here)
            if genein in lookup1:
                partin = (0, lookup1[genein])
            elif genein in lookup2:
                partin = (1, lookup2[genein])
            else:
                continue

            val = outbest[partin]
            val[0] += score
            val[1] += 1

        util.toc()
    util.toc()

    assert len(parts1) == len(unionPart(parts1))
    assert len(parts2) == len(unionPart(parts2))

    util.tic("determine clusters")
    # one union-find node per part; the dict must not be called 'sets',
    # otherwise it hides the 'sets' name that provides UnionFind
    nodes = {}
    for i in xrange(len(parts1)):
        nodes[(0, i)] = sets.UnionFind([(0, i)])
    for i in xrange(len(parts2)):
        nodes[(1, i)] = sets.UnionFind([(1, i)])

    # merge top avg hits
    for part1 in hits:
        o1 = outbest[part1]
        outavg1 = float(o1[0]) / max(o1[1], 1)

        top = 0
        toppart = None

        for part2, (tot, num) in hits[part1].iteritems():
            avg = float(tot) / num
            o2 = outbest[part2]
            outavg2 = float(o2[0]) / max(o2[1], 1)

            # the partner must beat the outgroup average of both parts
            if avg > outavg1 and avg > outavg2 and avg > top:
                top = avg
                toppart = part2

        if toppart is not None:
            nodes[part1].union(nodes[toppart])

    roots = util.unique([x.root() for x in nodes.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for root in roots:
        parts.append([])
        for i, row in root:
            parts[-1].extend(joining[i][row])
    util.toc()

    assert len(parts) == len(unionPart(parts))

    return parts
Пример #17
0
def find_synteny(species1, species2, regions1, regions2, orths):
    """Find synteny blocks by walking the chromosomes of species1.

    species1, species2 -- species names used to fetch chromosomes from the
                          region db
    regions1, regions2 -- regions of the two species; they are
                          concatenated with '+' to build the RegionDb
    orths              -- rows whose first two entries are the IDs of an
                          orthologous gene pair

    Returns the list of SyntenyBlock objects that were built.

    NOTE(review): chroms2 is computed but not used in this function.
    """

    # ortholog db {gene1 -> orthologs of gene1}
    orthdb = util.Dict(default=set())
    for row in orths:
        orthdb[row[0]].add(row[1])
        orthdb[row[1]].add(row[0])


    # TODO: generalize
    # inparalogs db {gene1 -> gene1a | gene1 and gene1a both have same orthologs}
    # (built from the ortholog set of one arbitrary ortholog of each gene)
    inpardb = util.Dict(default=set())
    for gene, others in orthdb.iteritems():
        inpardb[gene] = orthdb[iter(others).next()]


    # make region db
    regiondb = regionlib.RegionDb(regions1 + regions2)

    # get chromosome sets
    chroms1 = regiondb.get_chroms(species1)
    chroms2 = regiondb.get_chroms(species2)


    blocks = []
    for chname1, chrom1 in chroms1.iteritems():
        # skip empty chromosomes
        if len(chrom1) == 0:
            continue
        
        # start a new block
        need_new_block = True
        # genes without orthologs seen since the last gene added to a block
        loss_streak = []
        for i, gene1 in enumerate(chrom1):
            names2 = orthdb[gene1.data["ID"]]

            # no orthologs, start a loss streak
            if len(names2) == 0:
                #need_new_block = True
                loss_streak.append(make_orth(regiondb, [gene1.data["ID"]], []))
                continue

            # make ortholog cluster
            names1 = inpardb[gene1.data["ID"]]            
            orth = make_orth(regiondb, names1, names2)
            
            # orthologs are not contiguous, stop block
            if not is_orth_contig(regiondb, orth):
                loss_streak = []
                need_new_block = True
                continue

            # try to add to existing block
            if not need_new_block:
                block = blocks[-1]

                # just continue if we are still in the last ortholog pair
                #   i.e. gene1 is a paralog in a tandem set
                if orth == block.orths[-1]:
                    continue

                # try to append
                direction = can_append_orth(regiondb, block.orths[-1],
                                            block.dir, orth, orthdb)
                if direction == 0:
                    # cannot extend: drop the pending losses and fall
                    # through to the "start a new block" branch below
                    loss_streak = []
                    need_new_block = True
                else:
                    # the pending losses lie inside the block, add them first
                    for loss in loss_streak:
                        block.add_orth(loss, direction)
                    loss_streak = []
                    block.add_orth(orth, direction)

            # start a new block
            if need_new_block:
                loss_streak = []
                # finalize the previous block before opening the next one
                if len(blocks) > 0:
                    blocks[-1].recalc_regions(regiondb)
                blocks.append(SyntenyBlock(* orth_regions(regiondb, orth)))
                blocks[-1].add_orth(orth)
                need_new_block = False

        # finalize the last block of this chromosome
        if len(blocks) > 0:
            blocks[-1].recalc_regions(regiondb)

    return blocks
Пример #18
0
    pep_per_prop = util.hist_dict(util.mget(seqlib.AA_PROPERTY, AA))

    prop_counts = util.Dict(default=0)
    for char in AA:
        prop = seqlib.AA_PROPERTY[char]
        tint = prop_counts[prop] / float(pep_per_prop[prop])
        pep_colors[char] = prop2color(prop, tint * .5)
        prop_counts[prop] += 1

    return pep_colors


# colors of the four DNA bases; any other character gets color(.5, .5, .5)
dna_colors = util.Dict(
    dict(A=color(1, .5, .5),
         T=color(1, 1, .5),
         C=color(.5, 1, .5),
         G=color(.5, .5, 1)),
    default=color(.5, .5, .5))

# colors of the amino-acid characters, built once at import time
pep_colors = make_pep_colors(prop2color)


def guess_seq(seq):
    """Guesses whether a sequence is 'dna' or 'pep'"""
    dna = "ACTG-N"

    chars = util.unique(seq.upper())

    for char in chars:
        if char not in dna: