def _iterBuhHits(conf, genes, blastfile, order):
    """Yield (gene1, gene2, score) for every hit in blastfile passing cutoffs.

    gene1 always belongs to the first partition side and gene2 to the second;
    `order` says whether the query (true) or the subject (false) of each hit
    is the side-one gene.

    A hit is dropped when any of these holds:
      * bitscore / alignment length < conf["bitspersite"]
      * max(per-gene aligned fraction) < conf["coverage"]
      * e-value > conf["signif"]
    """
    for hit in blast.BlastReader(blastfile):
        if order:
            gene1 = blast.query(hit)
            gene2 = blast.subject(hit)
            alnlen1 = blast.queryLength(hit)
            alnlen2 = blast.subjectLength(hit)
        else:
            gene2 = blast.query(hit)
            gene1 = blast.subject(hit)
            alnlen2 = blast.queryLength(hit)
            alnlen1 = blast.subjectLength(hit)
        score = blast.bitscore(hit)

        len1 = genes[gene1]["length"]
        len2 = genes[gene2]["length"]
        coverage = max(alnlen1 / float(len1), alnlen2 / float(len2))

        # discard a hit that does not pass basic cutoffs
        if (score / float(blast.alignLength(hit)) < conf["bitspersite"] or
                coverage < conf["coverage"] or
                blast.evalue(hit) > conf["signif"]):
            continue

        yield gene1, gene2, score


def mergeBuh(conf, genes, parts1, parts2, blastfiles):
    """Merge by Best Unidirectional Hits

    NOTE: this function is deliberately disabled and always raises
    AssertionError; the body below is kept for reference only.

    Arguments:
      conf       -- mapping read for the keys "bitspersite", "coverage",
                    "signif" and "relcutoff"
      genes      -- mapping gene name -> record with a "length" entry
      parts1     -- list of gene lists (side 0); genes seen in hits but not
                    yet in any part are appended as singleton parts
      parts2     -- list of gene lists (side 1); extended the same way
      blastfiles -- iterable of (blastfile, order) pairs; `order` is true
                    when the hit query belongs to side 0

    Returns a list of merged gene lists.
    """
    # don't use this code without double checking it
    # Raised explicitly instead of `assert False` so that the guard is not
    # stripped when Python runs with -O.
    raise AssertionError(
        "mergeBuh is disabled: don't use this code without double checking it")

    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # best[(side, index)] = (highest bitscore seen, partner partition key)
    best = util.Dict(dim=1, default=(0, None))

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for gene1, gene2, score in _iterBuhHits(conf, genes, blastfile, order):
            # create a key for a partition: (side, index); unknown genes
            # become new singleton parts
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            if score > best[part1][0]:
                best[part1] = (score, part2)
            if score > best[part2][0]:
                best[part2] = (score, part1)
        util.toc()
    util.toc()

    util.tic("determine clusters")
    clusters = {}
    for gene in best:
        clusters[gene] = UnionFind([gene])

    # second pass: join parts whose hit is within relcutoff of either
    # side's best score
    for blastfile, order in blastfiles:
        util.tic("read hits '%s'" % os.path.basename(blastfile))
        for gene1, gene2, score in _iterBuhHits(conf, genes, blastfile, order):
            part1 = (0, lookup1[gene1])
            part2 = (1, lookup2[gene2])

            if score >= best[part1][0] * conf["relcutoff"]:
                clusters[part1].union(clusters[part2])
            if score >= best[part2][0] * conf["relcutoff"]:
                clusters[part2].union(clusters[part1])
        util.toc()

    roots = util.unique([x.root() for x in clusters.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        merged = []
        for i, row in cluster.members():
            merged.extend(joining[i][row])
        parts.append(merged)
    util.toc()

    return parts
def mergeAvg(conf, genes, parts1, parts2, blastfiles, outblastfiles):
    """Merge two partitions by top average bitscore, gated by outgroup hits.

    For every pair of parts (one from each side) the bitscores of all hits
    passing the cutoffs are averaged.  Each part is then unioned with the
    partner that has the highest average, provided that average exceeds the
    average outgroup bitscore of both parts.

    Arguments:
      conf          -- mapping read for the keys "bitspersite",
                       "coveragesmall", "coveragebig" and "signif"; an
                       optional "accept" entry restricts hits to genes it
                       contains
      genes         -- mapping gene name -> record with a "length" entry
      parts1        -- list of gene lists (side 0); genes seen in hits but
                       not yet in any part are appended as singleton parts
      parts2        -- list of gene lists (side 1); extended the same way
      blastfiles    -- iterable of (blastfile, order) pairs; `order` is true
                       when the hit query belongs to side 0
      outblastfiles -- iterable of (blastfile, order) pairs of outgroup hits;
                       `order` is true when the hit query is the ingroup gene

    Returns a list of merged gene lists.
    """
    lookup1 = item2part(parts1)
    lookup2 = item2part(parts2)

    # value is [sum, total]
    hits = util.Dict(dim=2, default=[0, 0])

    if "accept" in conf:
        accept = conf["accept"]
    else:
        accept = False

    util.tic("read hits")
    for blastfile, order in blastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                gene1 = blast.query(hit)
                gene2 = blast.subject(hit)
                alnlen1 = blast.queryLength(hit)
                alnlen2 = blast.subjectLength(hit)
            else:
                gene2 = blast.query(hit)
                gene1 = blast.subject(hit)
                alnlen2 = blast.queryLength(hit)
                alnlen1 = blast.subjectLength(hit)
            score = blast.bitscore(hit)

            len1 = genes[gene1]["length"]
            len2 = genes[gene2]["length"]
            cov1 = alnlen1 / float(len1)
            cov2 = alnlen2 / float(len2)
            coveragesmall = min(cov1, cov2)
            coveragebig = max(cov1, cov2)

            # discard a hit that does not pass basic cutoffs
            if (score / float(blast.alignLength(hit)) < conf["bitspersite"] or
                    coveragesmall < conf["coveragesmall"] or
                    coveragebig < conf["coveragebig"] or
                    blast.evalue(hit) > conf["signif"]):
                continue

            if accept and (gene1 not in accept or gene2 not in accept):
                continue

            # create a key for a partition: (side, index); unknown genes
            # become new singleton parts
            if gene1 in lookup1:
                part1 = (0, lookup1[gene1])
            else:
                parts1.append([gene1])
                lookup1[gene1] = len(parts1) - 1
                part1 = (0, len(parts1) - 1)

            if gene2 in lookup2:
                part2 = (1, lookup2[gene2])
            else:
                parts2.append([gene2])
                lookup2[gene2] = len(parts2) - 1
                part2 = (1, len(parts2) - 1)

            # accumulate in place, then store the same list under the
            # reverse key so both directions see one [sum, total] record
            val = hits[part1][part2]
            val[0] += score
            val[1] += 1
            hits[part2][part1] = val
        util.toc()
    util.toc()

    util.tic("read outgroup hits")
    # outbest[(side, index)] = [sum, total] of outgroup bitscores
    outbest = util.Dict(default=[0, 0])
    for blastfile, order in outblastfiles:
        util.tic("determine best hits '%s'" % os.path.basename(blastfile))
        for hit in blast.BlastReader(blastfile):
            if order:
                genein = blast.query(hit)
            else:
                genein = blast.subject(hit)
            score = blast.bitscore(hit)

            # create a key for a partition: (side, index)
            if genein in lookup1:
                partin = (0, lookup1[genein])
            elif genein in lookup2:
                # was `gene1 in lookup2`: a stale variable from the loop above
                partin = (1, lookup2[genein])
            else:
                continue

            val = outbest[partin]
            val[0] += score
            val[1] += 1
        util.toc()
    util.toc()

    assert len(parts1) == len(unionPart(parts1))
    assert len(parts2) == len(unionPart(parts2))

    util.tic("determine clusters")
    # The dict must not be called `sets`: that name would shadow the object
    # providing UnionFind and make `sets.UnionFind` fail on a plain dict.
    # NOTE(review): assumes a `sets` module exposing UnionFind is imported at
    # file level -- confirm.
    clusters = {}
    for i in range(len(parts1)):
        clusters[(0, i)] = sets.UnionFind([(0, i)])
    for i in range(len(parts2)):
        clusters[(1, i)] = sets.UnionFind([(1, i)])

    # merge top avg hits
    for part1 in hits:
        o1 = outbest[part1]
        outavg1 = float(o1[0]) / max(o1[1], 1)

        top = 0
        toppart = None

        for part2, (tot, num) in hits[part1].items():
            avg = float(tot) / num
            o2 = outbest[part2]
            outavg2 = float(o2[0]) / max(o2[1], 1)

            # partner must beat both outgroup averages and the current best
            if avg > outavg1 and avg > outavg2 and avg > top:
                top = avg
                toppart = part2

        if toppart is not None:
            clusters[part1].union(clusters[toppart])

    roots = util.unique([x.root() for x in clusters.values()])

    # create partition of genes
    parts = []
    joining = (parts1, parts2)
    for cluster in roots:
        merged = []
        for i, row in cluster:
            merged.extend(joining[i][row])
        parts.append(merged)
    util.toc()

    assert len(parts) == len(unionPart(parts))

    return parts