Пример #1
0
 def testDepth(self):
     """Trim with ``depth=2`` and check which ``seq1`` headers are emitted.

     The output must omit the ``>seq1|0`` header and contain the
     ``>seq1|6`` and ``>seq1|15`` headers.
     """
     output = StringIO()
     trimSequences(self.faPath,
                   self.bedPath,
                   output,
                   flanking=0,
                   minSize=0,
                   windowSize=1,
                   depth=2)
     trimmed = output.getvalue()
     # assertIn/assertNotIn print the header and the actual output on
     # failure instead of the opaque "False is not true".
     self.assertNotIn(">seq1|0", trimmed)
     self.assertIn(">seq1|6", trimmed)
     self.assertIn(">seq1|15", trimmed)
Пример #2
0
 def testMinSize(self):
     """Trim with ``minSize=2`` and check which ``seq1`` headers are emitted.

     The output must contain the ``>seq1|0`` and ``>seq1|6`` headers and
     omit the ``>seq1|15`` header.
     """
     output = StringIO()
     trimSequences(self.faPath,
                   self.bedPath,
                   output,
                   flanking=0,
                   minSize=2,
                   windowSize=1,
                   threshold=1)
     trimmed = output.getvalue()
     # assertIn/assertNotIn print the header and the actual output on
     # failure instead of the opaque "False is not true".
     self.assertIn(">seq1|0", trimmed)
     self.assertIn(">seq1|6", trimmed)
     self.assertNotIn(">seq1|15", trimmed)
Пример #3
0
 def testFlanking(self):
     """Trim with ``flanking=1`` and check the resulting FASTA records."""
     output = StringIO()
     trimSequences(self.faPath, self.bedPath, output, flanking=1, minSize=0, windowSize=1, threshold=1)
     trimmed = output.getvalue()
     # The two blocks 0-5, 6-11 should be merged together since
     # their flanking sequence intersects. Additionally the
     # flanking sequence shouldn't go past the beginning sequence.
     # assertIn shows the expected record and the actual output on failure.
     self.assertIn(dedent('''\
     >seq1|0
     CATGCATGCATG'''), trimmed)
     self.assertIn(dedent('''\
     >seq1|14
     TGC'''), trimmed)
Пример #4
0
 def testWithBlankLines(self):
     """Check trimming still yields the expected records when the FASTA
     file ends with blank lines.

     Three newlines are appended to ``self.faPath`` before trimming.
     """
     output = StringIO()
     with open(self.faPath, 'a') as f:
         f.write("\n\n\n")
     trimSequences(self.faPath, self.bedPath, output, flanking=0, minSize=0, windowSize=1, threshold=1)
     trimmed = output.getvalue()
     # assertIn shows the expected record and the actual output on failure.
     self.assertIn(dedent('''\
     >seq1|0
     CATGC'''), trimmed)
     self.assertIn(dedent('''\
     >seq1|6
     TGCAT'''), trimmed)
     self.assertIn(dedent('''\
     >seq1|15
     G'''), trimmed)
Пример #5
0
 def testComplement(self):
     """Trim with ``complement=True`` and check the emitted records."""
     output = StringIO()
     trimSequences(self.faPath, self.bedPath, output, flanking=0, minSize=0, windowSize=1, threshold=1,
                   complement=True)
     trimmed = output.getvalue()
     # assertIn shows the expected text and the actual output on failure.
     self.assertIn(dedent('''\
     >seq1|5
     A'''), trimmed)
     self.assertIn(dedent('''\
     >seq1|11'''), trimmed)
     self.assertIn(dedent('''\
     >seq1|16'''), trimmed)
     # make sure the sequence that isn't covered at all is included
     self.assertIn(dedent('''\
     >seq2|0'''), trimmed)
Пример #6
0
 def testSimplestParameters(self):
     """Check bed import / FASTA export with the simplest trimming settings."""
     # Test w/ no windowing, minimum size, etc to see if bed
     # import/fasta export works
     output = StringIO()
     trimSequences(self.faPath, self.bedPath, output, flanking=0, minSize=0, windowSize=1, threshold=1)
     trimmed = output.getvalue()
     # assertIn shows the expected record and the actual output on failure.
     self.assertIn(dedent('''\
     >seq1|0
     CATGC'''), trimmed)
     self.assertIn(dedent('''\
     >seq1|6
     TGCAT'''), trimmed)
     self.assertIn(dedent('''\
     >seq1|15
     G'''), trimmed)
Пример #7
0
 def testFlanking(self):
     """Trim with one base of flanking sequence and verify the records."""
     output = StringIO()
     trimSequences(self.faPath,
                   self.bedPath,
                   output,
                   flanking=1,
                   minSize=0,
                   windowSize=1,
                   threshold=1)
     fasta = output.getvalue()
     # The two blocks 0-5, 6-11 should be merged together since
     # their flanking sequence intersects. Additionally the
     # flanking sequence shouldn't go past the beginning sequence.
     # assertIn reports the missing record together with the real output,
     # which assertTrue on a boolean cannot do.
     self.assertIn(
         dedent('''\
     >seq1|0
     CATGCATGCATG'''), fasta)
     self.assertIn(
         dedent('''\
     >seq1|14
     TGC'''), fasta)
Пример #8
0
 def testComplement(self):
     """Verify the records written when trimming with ``complement=True``."""
     output = StringIO()
     trimSequences(self.faPath,
                   self.bedPath,
                   output,
                   flanking=0,
                   minSize=0,
                   windowSize=1,
                   threshold=1,
                   complement=True)
     fasta = output.getvalue()
     # assertIn reports the missing text together with the real output,
     # which assertTrue on a boolean cannot do.
     self.assertIn(
         dedent('''\
     >seq1|5
     A'''), fasta)
     self.assertIn(dedent('''\
     >seq1|11'''), fasta)
     self.assertIn(dedent('''\
     >seq1|16'''), fasta)
     # make sure the sequence that isn't covered at all is included
     self.assertIn(dedent('''\
     >seq2|0'''), fasta)
Пример #9
0
 def testSimplestParameters(self):
     """Verify bed import and FASTA export using the plainest settings."""
     # Test w/ no windowing, minimum size, etc to see if bed
     # import/fasta export works
     output = StringIO()
     trimSequences(self.faPath,
                   self.bedPath,
                   output,
                   flanking=0,
                   minSize=0,
                   windowSize=1,
                   threshold=1)
     fasta = output.getvalue()
     # assertIn reports the missing record together with the real output,
     # which assertTrue on a boolean cannot do.
     self.assertIn(
         dedent('''\
     >seq1|0
     CATGC'''), fasta)
     self.assertIn(
         dedent('''\
     >seq1|6
     TGCAT'''), fasta)
     self.assertIn(
         dedent('''\
     >seq1|15
     G'''), fasta)
Пример #10
0
 def testWithBlankLines(self):
     """Verify trailing blank lines in the FASTA file do not change the output.

     Appends three newlines to ``self.faPath`` and then expects the same
     three ``seq1`` records as the simplest-parameters case.
     """
     output = StringIO()
     with open(self.faPath, 'a') as f:
         f.write("\n\n\n")
     trimSequences(self.faPath,
                   self.bedPath,
                   output,
                   flanking=0,
                   minSize=0,
                   windowSize=1,
                   threshold=1)
     fasta = output.getvalue()
     # assertIn reports the missing record together with the real output,
     # which assertTrue on a boolean cannot do.
     self.assertIn(
         dedent('''\
     >seq1|0
     CATGC'''), fasta)
     self.assertIn(
         dedent('''\
     >seq1|6
     TGCAT'''), fasta)
     self.assertIn(
         dedent('''\
     >seq1|15
     G'''), fasta)
Пример #11
0
    def run(self, fileStore):
        """Fold the latest outgroup alignment results into the running totals.

        Steps, in order:

        1. Trim the first remaining outgroup using its coverage by the most
           recent results, up-convert the alignment coordinates, and record
           the trimmed outgroup as a new fragment.
        2. Log the coverage of this outgroup on every (trimmed) ingroup.
        3. Append the converted results to the accumulated results file.
        4. Compute and log cumulative coverage on every untrimmed ingroup.
        5. If more outgroups remain, trim the ingroups and add a
           ``BlastFirstOutgroup`` child for the next outgroup.

        :param fileStore: job file store; used here for ``readGlobalFile``,
            ``writeGlobalFile``, ``getLocalTempFile`` and ``logToMaster``.
        :return: the child job's ``.rv()`` when further outgroups remain,
            otherwise the tuple ``(outgroupResultsID, outgroupFragmentIDs,
            ingroupCoverageIDs)``.
        """
        # Trim outgroup, convert outgroup coordinates, and add to
        # outgroup fragments dir

        outgroupSequenceFiles = [
            fileStore.readGlobalFile(fileID)
            for fileID in self.outgroupSequenceIDs
        ]
        mostRecentResultsFile = fileStore.readGlobalFile(
            self.mostRecentResultsID)
        trimmedOutgroup = fileStore.getLocalTempFile()
        outgroupCoverage = fileStore.getLocalTempFile()
        calculateCoverage(outgroupSequenceFiles[0], mostRecentResultsFile,
                          outgroupCoverage)
        # The windowSize and threshold are fixed at 1: anything more
        # and we will run into problems with alignments that aren't
        # covered in a matching trimmed sequence.
        trimSequences(outgroupSequenceFiles[0],
                      outgroupCoverage,
                      trimmedOutgroup,
                      flanking=self.blastOptions.trimOutgroupFlanking,
                      windowSize=1,
                      threshold=1)
        outgroupConvertedResultsFile = fileStore.getLocalTempFile()
        with open(outgroupConvertedResultsFile, 'w') as f:
            upconvertCoords(cigarPath=mostRecentResultsFile,
                            fastaPath=trimmedOutgroup,
                            contigNum=1,
                            outputFile=f)

        self.outgroupFragmentIDs.append(
            fileStore.writeGlobalFile(trimmedOutgroup))
        sequenceFiles = [
            fileStore.readGlobalFile(path) for path in self.sequenceIDs
        ]
        untrimmedSequenceFiles = [
            fileStore.readGlobalFile(path)
            for path in self.untrimmedSequenceIDs
        ]

        # The outgroup files are not touched inside the loop below, so
        # measure them once instead of once per ingroup.
        trimmedOutgroupLength = sequenceLength(trimmedOutgroup)
        untrimmedOutgroupLength = sequenceLength(outgroupSequenceFiles[0])

        # Report coverage of the latest outgroup on the trimmed ingroups.
        for trimmedIngroupSequence, ingroupSequence, ingroupName in zip(
                sequenceFiles, untrimmedSequenceFiles, self.ingroupNames):
            tmpIngroupCoverage = fileStore.getLocalTempFile()
            calculateCoverage(trimmedIngroupSequence, mostRecentResultsFile,
                              tmpIngroupCoverage)
            fileStore.logToMaster(
                "Coverage on %s from outgroup #%d, %s: %s%% (current ingroup length %d, untrimmed length %d). Outgroup trimmed to %d bp from %d"
                % (ingroupName, self.outgroupNumber,
                   self.outgroupNames[self.outgroupNumber - 1],
                   percentCoverage(trimmedIngroupSequence, tmpIngroupCoverage),
                   sequenceLength(trimmedIngroupSequence),
                   sequenceLength(ingroupSequence),
                   trimmedOutgroupLength,
                   untrimmedOutgroupLength))

        # Convert the alignments' ingroup coordinates.
        ingroupConvertedResultsFile = fileStore.getLocalTempFile()
        if self.sequenceIDs == self.untrimmedSequenceIDs:
            # No need to convert ingroup coordinates on first run.
            shutil.copy(outgroupConvertedResultsFile,
                        ingroupConvertedResultsFile)
        else:
            cactus_call(parameters=[
                "cactus_blast_convertCoordinates", "--onlyContig1",
                outgroupConvertedResultsFile, ingroupConvertedResultsFile, "1"
            ])
        # Append the latest results to the accumulated outgroup coverage file
        if self.outgroupResultsID:
            outgroupResultsFile = fileStore.readGlobalFile(
                self.outgroupResultsID, mutable=True)
        else:
            outgroupResultsFile = fileStore.getLocalTempFile()
        # Stream the append in chunks rather than reading the whole results
        # file into memory at once.
        with open(ingroupConvertedResultsFile) as results, \
                open(outgroupResultsFile, 'a') as output:
            shutil.copyfileobj(results, output)

        self.outgroupResultsID = fileStore.writeGlobalFile(outgroupResultsFile)

        # Report coverage of all outgroup alignments so far on the ingroups.
        ingroupCoverageFiles = []
        self.ingroupCoverageIDs = []
        for ingroupSequence, ingroupName in zip(untrimmedSequenceFiles,
                                                self.ingroupNames):
            ingroupCoverageFile = fileStore.getLocalTempFile()
            calculateCoverage(
                sequenceFile=ingroupSequence,
                cigarFile=outgroupResultsFile,
                outputFile=ingroupCoverageFile,
                depthById=self.blastOptions.trimOutgroupDepth > 1)
            ingroupCoverageFiles.append(ingroupCoverageFile)
            self.ingroupCoverageIDs.append(
                fileStore.writeGlobalFile(ingroupCoverageFile))
            fileStore.logToMaster(
                "Cumulative coverage of %d outgroups on ingroup %s: %s" %
                (self.outgroupNumber, ingroupName,
                 percentCoverage(ingroupSequence, ingroupCoverageFile)))

        if len(self.outgroupSequenceIDs) > 1:
            # Trim ingroup seqs and recurse on the next outgroup.
            trimmedSeqs = []
            # Use the accumulated results so far to trim away the
            # aligned parts of the ingroups.
            for i, sequenceFile in enumerate(untrimmedSequenceFiles):
                outgroupCoverageFile = ingroupCoverageFiles[i]
                if self.blastOptions.keepParalogs:
                    # Scratch files are only needed on this branch.
                    # NOTE(review): selfCoverageFile is a fresh temp file that
                    # nothing visible here writes to before subtractBed --
                    # confirm that subtracting it is intended.
                    selfCoverageFile = fileStore.getLocalTempFile()
                    coverageFile = fileStore.getLocalTempFile()
                    subtractBed(outgroupCoverageFile, selfCoverageFile,
                                coverageFile)
                else:
                    coverageFile = outgroupCoverageFile

                trimmed = fileStore.getLocalTempFile()
                trimSequences(sequenceFile,
                              coverageFile,
                              trimmed,
                              complement=True,
                              flanking=self.blastOptions.trimFlanking,
                              minSize=self.blastOptions.trimMinSize,
                              threshold=self.blastOptions.trimThreshold,
                              windowSize=self.blastOptions.trimWindowSize,
                              depth=self.blastOptions.trimOutgroupDepth)
                trimmedSeqs.append(trimmed)
            trimmedSeqIDs = [
                fileStore.writeGlobalFile(path, cleanup=True)
                for path in trimmedSeqs
            ]
            return self.addChild(
                BlastFirstOutgroup(
                    ingroupNames=self.ingroupNames,
                    untrimmedSequenceIDs=self.untrimmedSequenceIDs,
                    sequenceIDs=trimmedSeqIDs,
                    outgroupNames=self.outgroupNames,
                    outgroupSequenceIDs=self.outgroupSequenceIDs[1:],
                    outgroupFragmentIDs=self.outgroupFragmentIDs,
                    outgroupResultsID=self.outgroupResultsID,
                    blastOptions=self.blastOptions,
                    outgroupNumber=self.outgroupNumber + 1,
                    ingroupCoverageIDs=self.ingroupCoverageIDs)).rv()
        else:
            # Finally, put the ingroups and outgroups results together
            return (self.outgroupResultsID, self.outgroupFragmentIDs,
                    self.ingroupCoverageIDs)
Пример #12
0
 def testMinSize(self):
     """Verify which ``seq1`` headers survive trimming with ``minSize=2``.

     ``>seq1|0`` and ``>seq1|6`` must be present; ``>seq1|15`` must not.
     """
     output = StringIO()
     trimSequences(self.faPath, self.bedPath, output, flanking=0, minSize=2, windowSize=1, threshold=1)
     fasta = output.getvalue()
     # assertIn/assertNotIn report the header together with the real
     # output, which assertTrue on a boolean cannot do.
     self.assertIn(">seq1|0", fasta)
     self.assertIn(">seq1|6", fasta)
     self.assertNotIn(">seq1|15", fasta)
Пример #13
0
 def testDepth(self):
     """Verify which ``seq1`` headers survive trimming with ``depth=2``.

     ``>seq1|0`` must be absent; ``>seq1|6`` and ``>seq1|15`` must be present.
     """
     output = StringIO()
     trimSequences(self.faPath, self.bedPath, output, flanking=0, minSize=0, windowSize=1, depth=2)
     fasta = output.getvalue()
     # assertIn/assertNotIn report the header together with the real
     # output, which assertTrue on a boolean cannot do.
     self.assertNotIn(">seq1|0", fasta)
     self.assertIn(">seq1|6", fasta)
     self.assertIn(">seq1|15", fasta)
Пример #14
0
    def run(self, fileStore):
        """Process the most recent outgroup results and recurse if needed.

        Trims the first remaining outgroup using its coverage by the latest
        results, up-converts the alignment coordinates, appends them to the
        accumulated results file, logs per-outgroup and cumulative coverage
        on the ingroups, and, when more outgroups remain, trims the
        ingroups and adds a ``BlastFirstOutgroup`` child job.

        :param fileStore: job file store; used here for ``readGlobalFile``,
            ``writeGlobalFile``, ``getLocalTempFile`` and ``logToMaster``.
        :return: the child job's ``.rv()`` when further outgroups remain,
            otherwise the tuple ``(outgroupResultsID, outgroupFragmentIDs,
            ingroupCoverageIDs)``.
        """
        # Trim outgroup, convert outgroup coordinates, and add to
        # outgroup fragments dir

        outgroupSequenceFiles = [fileStore.readGlobalFile(fileID) for fileID in self.outgroupSequenceIDs]
        mostRecentResultsFile = fileStore.readGlobalFile(self.mostRecentResultsID)
        trimmedOutgroup = fileStore.getLocalTempFile()
        outgroupCoverage = fileStore.getLocalTempFile()
        calculateCoverage(outgroupSequenceFiles[0],
                          mostRecentResultsFile, outgroupCoverage)
        # The windowSize and threshold are fixed at 1: anything more
        # and we will run into problems with alignments that aren't
        # covered in a matching trimmed sequence.
        trimSequences(outgroupSequenceFiles[0], outgroupCoverage,
                      trimmedOutgroup, flanking=self.blastOptions.trimOutgroupFlanking,
                      windowSize=1, threshold=1)
        outgroupConvertedResultsFile = fileStore.getLocalTempFile()
        with open(outgroupConvertedResultsFile, 'w') as f:
            upconvertCoords(cigarPath=mostRecentResultsFile,
                            fastaPath=trimmedOutgroup,
                            contigNum=1,
                            outputFile=f)

        self.outgroupFragmentIDs.append(fileStore.writeGlobalFile(trimmedOutgroup))
        sequenceFiles = [fileStore.readGlobalFile(path) for path in self.sequenceIDs]
        untrimmedSequenceFiles = [fileStore.readGlobalFile(path) for path in self.untrimmedSequenceIDs]

        # Neither outgroup file changes inside the loop below, so their
        # lengths are computed once rather than once per ingroup.
        trimmedOutgroupLength = sequenceLength(trimmedOutgroup)
        untrimmedOutgroupLength = sequenceLength(outgroupSequenceFiles[0])
        outgroupName = self.outgroupNames[self.outgroupNumber - 1]

        # Report coverage of the latest outgroup on the trimmed ingroups.
        for trimmedIngroupSequence, ingroupSequence, ingroupName in zip(
                sequenceFiles, untrimmedSequenceFiles, self.ingroupNames):
            tmpIngroupCoverage = fileStore.getLocalTempFile()
            calculateCoverage(trimmedIngroupSequence, mostRecentResultsFile,
                              tmpIngroupCoverage)
            fileStore.logToMaster(
                "Coverage on %s from outgroup #%d, %s: %s%% (current ingroup length %d, untrimmed length %d). Outgroup trimmed to %d bp from %d"
                % (ingroupName,
                   self.outgroupNumber,
                   outgroupName,
                   percentCoverage(trimmedIngroupSequence, tmpIngroupCoverage),
                   sequenceLength(trimmedIngroupSequence),
                   sequenceLength(ingroupSequence),
                   trimmedOutgroupLength,
                   untrimmedOutgroupLength))

        # Convert the alignments' ingroup coordinates.
        ingroupConvertedResultsFile = fileStore.getLocalTempFile()
        if self.sequenceIDs == self.untrimmedSequenceIDs:
            # No need to convert ingroup coordinates on first run.
            shutil.copy(outgroupConvertedResultsFile,
                        ingroupConvertedResultsFile)
        else:
            cactus_call(parameters=["cactus_blast_convertCoordinates",
                                    "--onlyContig1",
                                    outgroupConvertedResultsFile,
                                    ingroupConvertedResultsFile,
                                    "1"])
        # Append the latest results to the accumulated outgroup coverage file
        if self.outgroupResultsID:
            outgroupResultsFile = fileStore.readGlobalFile(self.outgroupResultsID, mutable=True)
        else:
            outgroupResultsFile = fileStore.getLocalTempFile()
        # Copy in chunks so the whole results file is never held in memory.
        with open(ingroupConvertedResultsFile) as results, \
                open(outgroupResultsFile, 'a') as output:
            shutil.copyfileobj(results, output)

        self.outgroupResultsID = fileStore.writeGlobalFile(outgroupResultsFile)

        # Report coverage of all outgroup alignments so far on the ingroups.
        ingroupCoverageFiles = []
        self.ingroupCoverageIDs = []
        for ingroupSequence, ingroupName in zip(untrimmedSequenceFiles, self.ingroupNames):
            ingroupCoverageFile = fileStore.getLocalTempFile()
            calculateCoverage(sequenceFile=ingroupSequence, cigarFile=outgroupResultsFile,
                              outputFile=ingroupCoverageFile, depthById=self.blastOptions.trimOutgroupDepth > 1)
            ingroupCoverageFiles.append(ingroupCoverageFile)
            self.ingroupCoverageIDs.append(fileStore.writeGlobalFile(ingroupCoverageFile))
            fileStore.logToMaster(
                "Cumulative coverage of %d outgroups on ingroup %s: %s"
                % (self.outgroupNumber, ingroupName,
                   percentCoverage(ingroupSequence, ingroupCoverageFile)))

        if len(self.outgroupSequenceIDs) > 1:
            # Trim ingroup seqs and recurse on the next outgroup.
            trimmedSeqs = []
            # Use the accumulated results so far to trim away the
            # aligned parts of the ingroups.
            for i, sequenceFile in enumerate(untrimmedSequenceFiles):
                outgroupCoverageFile = ingroupCoverageFiles[i]
                if self.blastOptions.keepParalogs:
                    # Only this branch needs scratch files, so create them here.
                    # NOTE(review): selfCoverageFile is a fresh temp file that
                    # nothing visible here writes to before subtractBed --
                    # confirm that subtracting it is intended.
                    selfCoverageFile = fileStore.getLocalTempFile()
                    coverageFile = fileStore.getLocalTempFile()
                    subtractBed(outgroupCoverageFile, selfCoverageFile, coverageFile)
                else:
                    coverageFile = outgroupCoverageFile

                trimmed = fileStore.getLocalTempFile()
                trimSequences(sequenceFile, coverageFile, trimmed,
                              complement=True, flanking=self.blastOptions.trimFlanking,
                              minSize=self.blastOptions.trimMinSize,
                              threshold=self.blastOptions.trimThreshold,
                              windowSize=self.blastOptions.trimWindowSize,
                              depth=self.blastOptions.trimOutgroupDepth)
                trimmedSeqs.append(trimmed)
            trimmedSeqIDs = [fileStore.writeGlobalFile(path, cleanup=True) for path in trimmedSeqs]
            return self.addChild(BlastFirstOutgroup(
                ingroupNames=self.ingroupNames,
                untrimmedSequenceIDs=self.untrimmedSequenceIDs,
                sequenceIDs=trimmedSeqIDs,
                outgroupNames=self.outgroupNames,
                outgroupSequenceIDs=self.outgroupSequenceIDs[1:],
                outgroupFragmentIDs=self.outgroupFragmentIDs,
                outgroupResultsID=self.outgroupResultsID,
                blastOptions=self.blastOptions,
                outgroupNumber=self.outgroupNumber + 1,
                ingroupCoverageIDs=self.ingroupCoverageIDs)).rv()
        else:
            # Finally, put the ingroups and outgroups results together
            return (self.outgroupResultsID, self.outgroupFragmentIDs, self.ingroupCoverageIDs)