def testKeepingCoverageOnIngroups(self):
    """Tests whether the --ingroupCoverageDir option works as advertised.

    Runs blast in "ingroups vs. outgroups" mode on one ENCODE region,
    asking it to keep one coverage bed file per ingroup.  Each kept file
    must have exactly the same contents as a coverage file computed here
    from the resulting alignments and the combined outgroup fragments.
    """
    encodeRegion = "ENm001"
    ingroups = ["human", "cow"]
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                    for name in ingroups]
    outgroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                     for name in outgroups]

    # Run blast in "ingroup vs outgroups" mode, requesting to keep
    # the bed files that show outgroup coverage on the ingroup.
    toilDir = os.path.join(self.tempDir, "tmp_toil")
    outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir) for _ in outgroups]
    ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir) for _ in ingroups]
    runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths,
                                       outgroups=outgroupPaths,
                                       alignmentsFile=self.tempOutputFile,
                                       outgroupFragmentPaths=outgroupFragmentPaths,
                                       ingroupCoveragePaths=ingroupCoveragePaths,
                                       toilDir=toilDir)

    # To filter out alignments from the other ingroup and
    # self-alignments we need a fasta with all the outgroup fragments
    # in it.  It does not depend on the ingroup, so build it only once.
    outgroupsCombined = getTempFile(rootDir=self.tempDir)
    for outgroupFragmentPath in outgroupFragmentPaths:
        system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))

    # The kept coverage files are positional: the i-th file belongs to
    # the i-th ingroup path.
    for ingroupPath, keptCoverageFile in zip(ingroupPaths, ingroupCoveragePaths):
        # Get the coverage from the outgroups independently and
        # check that it's the same as the file cactus_blast kept.
        independentCoverageFile = getTempFile(rootDir=self.tempDir)
        coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
        calculateCoverage(work_dir=coverageWorkDir,
                          fromGenome=outgroupsCombined,
                          sequenceFile=ingroupPath,
                          cigarFile=self.tempOutputFile,
                          outputFile=independentCoverageFile)
        # shallow=False forces a content comparison; the default only
        # compares os.stat() signatures when those happen to match.
        self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile,
                                    shallow=False),
                        "kept coverage file for %s differs from the "
                        "independently computed one" % ingroupPath)
def run_ingroup_coverage(job, cactusWorkflowArguments, project):
    """Compute, for every ingroup genome, a bed file of its coverage vs the outgroups.

    An ingroup is any genome reported by
    ``experimentWrapper.getGenomesWithSequence()`` that is not listed in
    ``experimentWrapper.getOutgroupGenomes()``.

    Args:
        job: the running job; only ``job.fileStore`` is used (to obtain a
            local temp directory and to read/write global files).
        cactusWorkflowArguments: workflow state.  Reads
            ``experimentWrapper``, ``outgroupFragmentIDs`` and
            ``alignmentsID``; sets ``totalSequenceSize`` to the summed
            on-disk size of the ingroup sequence files; appends one file
            ID per ingroup to ``ingroupCoverageIDs`` when there is at
            least one outgroup fragment file.
        project: not used by this function.

    Returns:
        The same ``cactusWorkflowArguments`` object that was passed in.
    """
    work_dir = job.fileStore.getLocalTempDir()
    exp = cactusWorkflowArguments.experimentWrapper
    outgroup_genomes = exp.getOutgroupGenomes()
    ingroups_and_ids = [(genome, exp.getSequenceID(genome))
                        for genome in exp.getGenomesWithSequence()
                        if genome not in outgroup_genomes]
    outgroups = [job.fileStore.readGlobalFile(file_id)
                 for file_id in cactusWorkflowArguments.outgroupFragmentIDs]
    sequences = [job.fileStore.readGlobalFile(sequence_id)
                 for _, sequence_id in ingroups_and_ids]
    cactusWorkflowArguments.totalSequenceSize = sum(
        os.stat(path).st_size for path in sequences)
    ingroups = [genome for genome, _ in ingroups_and_ids]
    cigar = job.fileStore.readGlobalFile(cactusWorkflowArguments.alignmentsID)
    if outgroups:
        # should we parallelize with child jobs?
        for ingroup, sequence in zip(ingroups, sequences):
            # Name the output after the genome rather than after the
            # sequence path: os.path.join() discards work_dir whenever the
            # second component is absolute (which the path handed back by
            # readGlobalFile presumably is -- confirm against the file
            # store), so the old name did not actually land in work_dir.
            coverage_path = os.path.join(work_dir, '{}.coverage'.format(ingroup))
            calculateCoverage(sequence, cigar, coverage_path,
                              fromGenome=outgroups, work_dir=work_dir)
            cactusWorkflowArguments.ingroupCoverageIDs.append(
                job.fileStore.writeGlobalFile(coverage_path))
    return cactusWorkflowArguments
def testKeepingCoverageOnIngroups(self):
    """Tests whether the --ingroupCoverageDir option works as advertised.

    Runs blast in "ingroups vs. outgroups" mode on one ENCODE region,
    asking it to keep one coverage bed file per ingroup.  Each kept file
    must have exactly the same contents as a coverage file computed here
    from the resulting alignments and the combined outgroup fragments.
    """
    encodeRegion = "ENm001"
    ingroups = ["human", "cow"]
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                    for name in ingroups]
    outgroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                     for name in outgroups]

    # Run blast in "ingroup vs outgroups" mode, requesting to keep
    # the bed files that show outgroup coverage on the ingroup.
    toilDir = os.path.join(self.tempDir, "tmp_toil")
    outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir) for _ in outgroups]
    ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir) for _ in ingroups]
    runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths,
                                       outgroups=outgroupPaths,
                                       alignmentsFile=self.tempOutputFile,
                                       outgroupFragmentPaths=outgroupFragmentPaths,
                                       ingroupCoveragePaths=ingroupCoveragePaths,
                                       toilDir=toilDir)

    # To filter out alignments from the other ingroup and
    # self-alignments we need a fasta with all the outgroup fragments
    # in it.  It does not depend on the ingroup, so build it only once.
    outgroupsCombined = getTempFile(rootDir=self.tempDir)
    for outgroupFragmentPath in outgroupFragmentPaths:
        system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))

    # The kept coverage files are positional: the i-th file belongs to
    # the i-th ingroup path.
    for ingroupPath, keptCoverageFile in zip(ingroupPaths, ingroupCoveragePaths):
        # Get the coverage from the outgroups independently and
        # check that it's the same as the file cactus_blast kept.
        independentCoverageFile = getTempFile(rootDir=self.tempDir)
        calculateCoverage(fromGenome=outgroupsCombined,
                          sequenceFile=ingroupPath,
                          cigarFile=self.tempOutputFile,
                          outputFile=independentCoverageFile)
        # shallow=False forces a content comparison; the default only
        # compares os.stat() signatures when those happen to match.
        self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile,
                                    shallow=False),
                        "kept coverage file for %s differs from the "
                        "independently computed one" % ingroupPath)
def testAddingOutgroupsImprovesResult(self):
    """Run blast on "ingroup" and "outgroup" encode regions, and ensure
    that adding an extra outgroup only adds alignments if
    possible, and doesn't lose any.

    For each region the ingroups are aligned against the first 1, 2, ...
    of a random (order-preserving) selection of outgroups.  Every run
    with more than one outgroup must reach a sensitivity of at least
    0.99 relative to the single-outgroup run.  Coverage and "re-covered
    bases" figures are printed as diagnostics only.
    """
    def humanPositions(alignments):
        """Collect the human-side (x[0], x[1]) or (x[2], x[3]) pair of
        every alignment whose first or third field mentions "human"."""
        return set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                   for x in alignments
                   if "human" in x[0] or "human" in x[2])

    encodeRegions = ["ENm00" + str(i) for i in xrange(1, 2)]
    ingroups = ["human", "macaque"]
    outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
    # Single source of truth for how many outgroups are used; the
    # alignment loop below derives its bound from the selection.
    numSelectedOutgroups = 4
    # subselect random outgroups, keeping their original relative order
    outgroups = [outgroups[i] for i in
                 sorted(random.sample(xrange(len(outgroups)), numSelectedOutgroups))]
    tmpToil = os.path.join(self.tempDir, "outgroupToil")

    for encodeRegion in encodeRegions:
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                        for name in ingroups]
        outgroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                         for name in outgroups]
        results = []
        try:
            for numOutgroups in xrange(1, len(outgroups) + 1):
                # Align w/ increasing numbers of outgroups
                subResults = getTempFile()
                # Register the file before running so that it is
                # cleaned up even if the run itself fails.
                results.append(subResults)
                subOutgroupPaths = outgroupPaths[:numOutgroups]
                print("aligning %s vs %s" % (",".join(ingroupPaths),
                                             ",".join(subOutgroupPaths)))
                runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths,
                                                   alignmentsFile=subResults,
                                                   toilDir=tmpToil)

            # Print diagnostics about coverage
            for i, subResults in enumerate(results):
                for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                    ingroupCoverage = getTempFile(rootDir=self.tempDir)
                    coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
                    calculateCoverage(work_dir=coverageWorkDir,
                                      sequenceFile=ingroupPath,
                                      cigarFile=subResults,
                                      outputFile=ingroupCoverage)
                    coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                    print("covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases))

            resultsSets = [loadResults(path) for path in results]
            for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
                # Every run with additional outgroups is compared with
                # the single-outgroup baseline (resultsSets[0]) and must
                # be (very nearly) a superset of it.
                print("Using %d addl outgroup(s):" % (i + 1))
                comparator = ResultComparator(resultsSets[0], moreOutgroupsResults)
                print(comparator)
                self.assertTrue(comparator.sensitivity >= 0.99)

            # Report how much of the already existing alignments to
            # human is covered again by the new alignments.
            for i in xrange(1, len(resultsSets)):
                prevResults = resultsSets[i - 1][0]
                curResults = resultsSets[i][0]
                prevResultsHumanPos = humanPositions(prevResults)
                newAlignments = curResults.difference(prevResults)
                newAlignmentsHumanPos = humanPositions(newAlignments)
                reCovered = newAlignmentsHumanPos.intersection(prevResultsHumanPos)
                print("addl outgroup %d:" % i)
                print("bases re-covered: %f (%d)" % (len(reCovered) / float(len(prevResultsHumanPos)), len(reCovered)))
        finally:
            # These files live outside self.tempDir, so remove them
            # here even when an assertion above has failed.
            for subResult in results:
                if os.path.exists(subResult):
                    os.remove(subResult)
def testAddingOutgroupsImprovesResult(self):
    """Run blast on "ingroup" and "outgroup" encode regions, and ensure
    that adding an extra outgroup only adds alignments if
    possible, and doesn't lose any.

    The ingroups are aligned against the first 1, 2, ... of a random
    (order-preserving) selection of MAX_NUM_OUTGROUPS outgroups.  Every
    run with more than one outgroup must reach a sensitivity of at least
    0.99 relative to the single-outgroup run.  Coverage and "re-covered
    bases" figures are printed as diagnostics only.
    """
    def humanPositions(alignments):
        """Collect the human-side (x[0], x[1]) or (x[2], x[3]) pair of
        every alignment whose first or third field mentions "human"."""
        return set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                   for x in alignments
                   if "human" in x[0] or "human" in x[2])

    encodeRegion = "ENm001"
    ingroups = ["human", "macaque"]
    outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
    MAX_NUM_OUTGROUPS = 3
    # subselect a random set of outgroups in the same order
    outgroups = [outgroups[i] for i in
                 sorted(random.sample(xrange(len(outgroups)), MAX_NUM_OUTGROUPS))]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                    for name in ingroups]
    outgroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                     for name in outgroups]
    tmpToil = os.path.join(self.tempDir, "outgroupToil")

    results = []
    try:
        for numOutgroups in xrange(1, len(outgroups) + 1):
            # Align w/ increasing numbers of outgroups
            subResults = getTempFile()
            # Register the file before running so that it is cleaned
            # up even if the run itself fails.
            results.append(subResults)
            subOutgroupPaths = outgroupPaths[:numOutgroups]
            print("aligning %s vs %s" % (",".join(ingroupPaths),
                                         ",".join(subOutgroupPaths)))
            runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths,
                                               alignmentsFile=subResults,
                                               toilDir=tmpToil)

        # Print diagnostics about coverage
        for i, subResults in enumerate(results):
            for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                ingroupCoverage = getTempFile(rootDir=self.tempDir)
                calculateCoverage(sequenceFile=ingroupPath,
                                  cigarFile=subResults,
                                  outputFile=ingroupCoverage)
                coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                print("covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases))

        resultsSets = [loadResults(path) for path in results]
        for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
            # Every run with additional outgroups is compared with the
            # single-outgroup baseline (resultsSets[0]) and must be
            # (very nearly) a superset of it.
            print("Using %d addl outgroup(s):" % (i + 1))
            comparator = ResultComparator(resultsSets[0], moreOutgroupsResults)
            print(comparator)
            self.assertTrue(comparator.sensitivity >= 0.99)

        # Report how much of the already existing alignments to human
        # is covered again by the new alignments.
        for i in xrange(1, len(resultsSets)):
            prevResults = resultsSets[i - 1][0]
            curResults = resultsSets[i][0]
            prevResultsHumanPos = humanPositions(prevResults)
            newAlignments = curResults.difference(prevResults)
            newAlignmentsHumanPos = humanPositions(newAlignments)
            reCovered = newAlignmentsHumanPos.intersection(prevResultsHumanPos)
            print("addl outgroup %d:" % i)
            print("bases re-covered: %f (%d)" % (len(reCovered) / float(len(prevResultsHumanPos)), len(reCovered)))
    finally:
        # These files live outside self.tempDir, so remove them here
        # even when an assertion above has failed.
        for subResult in results:
            if os.path.exists(subResult):
                os.remove(subResult)
def testProgressiveOutgroupsVsAllOutgroups(self):
    """Tests the difference in outgroup coverage on an ingroup when
    running in "ingroups vs. outgroups" mode and "set against set" mode.

    Two things are asserted: the trimming ("ingroup vs outgroups") run
    covers at least 95% of the ingroup bases the set-vs-set run covers,
    and the last outgroup contributes at most 10% as much coverage in
    the trimming run as it does in the set-vs-set run.
    """
    encodeRegion = "ENm001"
    ingroup = "human"
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPath = os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa")
    outgroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                     for name in outgroups]

    def coveredBases(cigarFile):
        """Return the number of ingroup bases covered by cigarFile."""
        coverageFile = getTempFile(rootDir=self.tempDir)
        calculateCoverage(sequenceFile=ingroupPath, cigarFile=cigarFile,
                          outputFile=coverageFile)
        # "total + 0" makes awk print 0 for an empty coverage file; a
        # bare "print total" prints an empty string there, which int()
        # cannot parse.
        return int(popenCatch("cat %s | awk '{ total += $3 - $2} END { print total + 0 }'" % coverageFile))

    def alignmentsMentioning(name, cigarFile):
        """Copy the lines of cigarFile containing name to a new temp
        file and return its path."""
        filtered = getTempFile(rootDir=self.tempDir)
        # Filter in Python instead of shelling out to grep: grep exits
        # with status 1 when nothing matches, and an empty result is a
        # legitimate outcome here rather than an error.
        with open(cigarFile) as source, open(filtered, "w") as sink:
            sink.writelines(line for line in source if name in line)
        return filtered

    # Run in "set against set" mode, aligning the entire ingroup
    # vs each outgroup
    runCactusBlast([ingroupPath], alignmentsFile=self.tempOutputFile,
                   toilDir=os.path.join(self.tempDir, "setVsSetToil"),
                   chunkSize=500000, overlapSize=10000,
                   targetSequenceFiles=outgroupPaths)
    # Run in "ingroup vs outgroups" mode, aligning the ingroup vs
    # the outgroups in order, trimming away sequence that's
    # already been aligned.
    runCactusBlastIngroupsAndOutgroups([ingroupPath], outgroupPaths,
                                       alignmentsFile=self.tempOutputFile2,
                                       toilDir=os.path.join(self.tempDir, "outgroupToil"))

    # Get the coverage on the ingroup, in bases, from each run.
    coverageSetVsSet = coveredBases(self.tempOutputFile)
    coverageIngroupVsOutgroups = coveredBases(self.tempOutputFile2)

    print("total coverage on human (set vs set mode, %d outgroups): %d" % (len(outgroups), coverageSetVsSet))
    print("total coverage on human (ingroup vs outgroup mode, %d outgroups): %d" % (len(outgroups), coverageIngroupVsOutgroups))

    # Make sure we're getting a reasonable fraction of the
    # alignments when using the trimming strategy.
    self.assertTrue(float(coverageIngroupVsOutgroups) / coverageSetVsSet >= 0.95)

    # Get the coverage on the ingroup, in bases, from just the
    # last outgroup. Obviously this should be much higher in set
    # vs set mode than in ingroup vs outgroup mode.
    lastOutgroup = outgroups[-1]
    coverageFromLastOutgroupSetVsSet = coveredBases(
        alignmentsMentioning(lastOutgroup, self.tempOutputFile))
    coverageFromLastOutgroupInVsOut = coveredBases(
        alignmentsMentioning(lastOutgroup, self.tempOutputFile2))

    print("total coverage on human from last outgroup in set (%s) (set vs set mode): %d" % (lastOutgroup, coverageFromLastOutgroupSetVsSet))
    print("total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d" % (lastOutgroup, coverageFromLastOutgroupInVsOut))

    self.assertTrue(float(coverageFromLastOutgroupInVsOut) / coverageFromLastOutgroupSetVsSet <= 0.10)
def testProgressiveOutgroupsVsAllOutgroups(self):
    """Tests the difference in outgroup coverage on an ingroup when
    running in "ingroups vs. outgroups" mode and "set against set" mode.

    Two things are asserted: the trimming ("ingroup vs outgroups") run
    covers at least 95% of the ingroup bases the set-vs-set run covers,
    and the last outgroup contributes at most 10% as much coverage in
    the trimming run as it does in the set-vs-set run.
    """
    encodeRegion = "ENm001"
    ingroup = "human"
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPath = os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa")
    outgroupPaths = [os.path.join(regionPath, "%s.%s.fa" % (name, encodeRegion))
                     for name in outgroups]

    def coveredBases(cigarFile):
        """Return the number of ingroup bases covered by cigarFile."""
        coverageFile = getTempFile(rootDir=self.tempDir)
        # Each coverage computation gets its own scratch directory.
        coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
        calculateCoverage(work_dir=coverageWorkDir, sequenceFile=ingroupPath,
                          cigarFile=cigarFile, outputFile=coverageFile)
        # "total + 0" makes awk print 0 for an empty coverage file; a
        # bare "print total" prints an empty string there, which int()
        # cannot parse.
        return int(popenCatch("cat %s | awk '{ total += $3 - $2} END { print total + 0 }'" % coverageFile))

    def alignmentsMentioning(name, cigarFile):
        """Copy the lines of cigarFile containing name to a new temp
        file and return its path."""
        filtered = getTempFile(rootDir=self.tempDir)
        # Filter in Python instead of shelling out to grep: grep exits
        # with status 1 when nothing matches, and an empty result is a
        # legitimate outcome here rather than an error.
        with open(cigarFile) as source, open(filtered, "w") as sink:
            sink.writelines(line for line in source if name in line)
        return filtered

    # Run in "set against set" mode, aligning the entire ingroup
    # vs each outgroup
    runCactusBlast([ingroupPath], alignmentsFile=self.tempOutputFile,
                   toilDir=os.path.join(self.tempDir, "setVsSetToil"),
                   chunkSize=500000, overlapSize=10000,
                   targetSequenceFiles=outgroupPaths)
    # Run in "ingroup vs outgroups" mode, aligning the ingroup vs
    # the outgroups in order, trimming away sequence that's
    # already been aligned.
    runCactusBlastIngroupsAndOutgroups([ingroupPath], outgroupPaths,
                                       alignmentsFile=self.tempOutputFile2,
                                       toilDir=os.path.join(self.tempDir, "outgroupToil"))

    # Get the coverage on the ingroup, in bases, from each run.
    coverageSetVsSet = coveredBases(self.tempOutputFile)
    coverageIngroupVsOutgroups = coveredBases(self.tempOutputFile2)

    print("total coverage on human (set vs set mode, %d outgroups): %d" % (len(outgroups), coverageSetVsSet))
    print("total coverage on human (ingroup vs outgroup mode, %d outgroups): %d" % (len(outgroups), coverageIngroupVsOutgroups))

    # Make sure we're getting a reasonable fraction of the
    # alignments when using the trimming strategy.
    self.assertTrue(float(coverageIngroupVsOutgroups) / coverageSetVsSet >= 0.95)

    # Get the coverage on the ingroup, in bases, from just the
    # last outgroup. Obviously this should be much higher in set
    # vs set mode than in ingroup vs outgroup mode.
    lastOutgroup = outgroups[-1]
    coverageFromLastOutgroupSetVsSet = coveredBases(
        alignmentsMentioning(lastOutgroup, self.tempOutputFile))
    coverageFromLastOutgroupInVsOut = coveredBases(
        alignmentsMentioning(lastOutgroup, self.tempOutputFile2))

    print("total coverage on human from last outgroup in set (%s) (set vs set mode): %d" % (lastOutgroup, coverageFromLastOutgroupSetVsSet))
    print("total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d" % (lastOutgroup, coverageFromLastOutgroupInVsOut))

    self.assertTrue(float(coverageFromLastOutgroupInVsOut) / coverageFromLastOutgroupSetVsSet <= 0.10)