Пример #1
0
 def validateTestSetResult(self, tsr):
     """Check that a test set result matches the reference counts.

     tsr must provide getFinalPositiveCount() and getFinalNegativeCount().
     The positive count is compared with self.referencePositiveCount first,
     then the negative count with self.referenceNegativeCount.

     Raises pcssErrors.PcssGlobalException on the first count that differs.
     """
     positiveCount = tsr.getFinalPositiveCount()
     if positiveCount != self.referencePositiveCount:
         message = "Error: test set did not have same number of positives (%s) as the reference (%s)" % (
             positiveCount, self.referencePositiveCount)
         raise pcssErrors.PcssGlobalException(message)

     negativeCount = tsr.getFinalNegativeCount()
     if negativeCount != self.referenceNegativeCount:
         message = "Error: test set did not have same number of negatives (%s) as the reference (%s)" % (
             negativeCount, self.referenceNegativeCount)
         raise pcssErrors.PcssGlobalException(message)
Пример #2
0
    def readAnnotationFile(self, annotationFile):
        """Load attribute values from a tab-delimited annotation file.

        The first line is handed to validateColumnLine() as the column header.
        Every later line is turned into a protein with getProteinFromLine();
        unless that protein reports errors, each column-sorted input attribute
        receives its value from the line through setValueFromFile(), together
        with the protein and the line's integer "peptide_start" value.
        setPeptideLength() is called once all lines have been processed.

        Raises pcssErrors.PcssGlobalException if annotationFile does not exist
        or if self.proteins is still empty after reading.
        """
        if not os.path.exists(annotationFile):
            raise pcssErrors.PcssGlobalException(
                "Error: annotation file reader did not find expected annotation file\n%s"
                % annotationFile)

        fileLines = pcssTools.PcssFileReader(annotationFile).getLines()
        inputAttributes = self.pcssRunner.pfa.getColumnSortedInputAttributes()

        for lineNumber, line in enumerate(fileLines):
            if lineNumber == 0:
                # Header row: validated only, never parsed as a protein.
                self.validateColumnLine(annotationFile, line)
                continue

            pcssProtein = self.getProteinFromLine(line)
            if pcssProtein.hasErrors():
                # Proteins flagged with errors get no attribute values.
                continue

            cols = line.split('\t')
            for attribute in inputAttributes:
                value = self.getValueForAttributeName(attribute.name, cols)
                peptideStart = int(
                    self.getValueForAttributeName("peptide_start", cols))
                attribute.setValueFromFile(value, pcssProtein, peptideStart)

        self.setPeptideLength()
        if len(self.proteins) == 0:
            raise pcssErrors.PcssGlobalException(
                "Did not read any proteins from annotation file")
Пример #3
0
    def validateCounts(self):
        """Check that the training / test set counts are mutually consistent.

        Raises pcssErrors.PcssGlobalException when:
          * the positive test set count is below 1,
          * the test set has more positives than negatives,
          * training + test positives differ from the total positive count, or
          * training + test negatives differ from the total negative count.
        """
        if (self.testSetPositiveCount < 1):
            # Interpolate the count; the message used to show a literal "%s".
            raise pcssErrors.PcssGlobalException("Positive test set count is %s (should be greater than 0)"
                                                 % self.testSetPositiveCount)
        if (self.testSetPositiveCount > self.testSetNegativeCount):
            raise pcssErrors.PcssGlobalException("Test set should have more negatives than positives")

        if (self.testSetPositiveCount + self.trainingSetPositiveCount != self.totalPositiveCount):
            raise pcssErrors.PcssGlobalException("Positive Training Set (%s) and Positive Test Set (%s) do not add up to total positives (%s)"
                                                 % (self.trainingSetPositiveCount, self.testSetPositiveCount, self.totalPositiveCount))
        if (self.testSetNegativeCount + self.trainingSetNegativeCount != self.totalNegativeCount):
            raise pcssErrors.PcssGlobalException("Negative Training Set (%s) and Negative Test Set (%s) do not add up to total negatives (%s)"
                                                 % (self.trainingSetNegativeCount, self.testSetNegativeCount, self.totalNegativeCount))
Пример #4
0
 def readResultFile(self):
     """Read the classifier output file and pair each peptide with its score.

     self.pstList is reset to an empty list, then filled with one
     PeptideScoreTuple(peptide, score) per peptide, where the score is the
     float on the line at the same position in the result file.

     Raises pcssErrors.PcssGlobalException if the result file does not exist
     or if its line count differs from the number of peptides.
     """
     resultFile = self.getClassifyOutputFile()
     self.pstList = []
     if not os.path.exists(resultFile):
         raise pcssErrors.PcssGlobalException("Classify SVM could not read result file %s; \n"
                                              "check to make sure svm_classify completed as suggested" % resultFile)

     scoreLines = pcssTools.PcssFileReader(resultFile).getLines()
     lineCount = len(scoreLines)
     peptideCount = len(self.peptides)
     if lineCount != peptideCount:
         raise pcssErrors.PcssGlobalException("Result file has a different number of results (%s) than I have peptides (%s)" % 
                                              (lineCount, peptideCount))

     # Lines and peptides are matched purely by position.
     for peptide, scoreLine in zip(self.peptides, scoreLines):
         self.pstList.append(self.PeptideScoreTuple(peptide, float(scoreLine)))
Пример #5
0
    def validatePeptideSequences(self):
        """Verify each peptide's sequence against this protein's subsequence.

        For every peptide in self.peptides, the result of
        getSubsequence(startPosition, endPosition + 1) must equal the
        peptide's own sequence.

        Raises pcssErrors.PcssGlobalException on the first mismatch.
        """
        for peptide in self.peptides.values():
            expectedSequence = self.getSubsequence(peptide.startPosition, peptide.endPosition + 1)
            if peptide.sequence == expectedSequence:
                continue
            details = (self.modbaseSequenceId, expectedSequence,
                       peptide.sequence, peptide.startPosition)
            raise pcssErrors.PcssGlobalException(
                "Protein %s subsequence %s doesn't match peptide sequence %s starting at position %s" % details)
Пример #6
0
    def seqBatchErrorExists(self, subDirName):
        """Raise if a seq batch directory contains an error output file.

        Two files are looked for inside subDirName, named by the internal
        config keys "pcss_error_output_file" and "internal_error_output_file"
        (checked in that order). If one exists, it is loaded with
        pcssErrors.ErrorInfo and its msg is reported in the exception.

        Raises pcssErrors.PcssGlobalException for the pcss error file and
        InternalException for the internal error file. Returns None when
        neither file is present.
        """
        internalConfig = self.pcssRunner.internalConfig

        pcssErrorFile = os.path.join(
            subDirName, internalConfig["pcss_error_output_file"])
        if os.path.exists(pcssErrorFile):
            errorInfo = pcssErrors.ErrorInfo(pcssErrorFile)
            raise pcssErrors.PcssGlobalException(
                "Got pcss seq batch error %s\nin directory %s" %
                (errorInfo.msg, subDirName))

        internalErrorFile = os.path.join(
            subDirName, internalConfig["internal_error_output_file"])
        if os.path.exists(internalErrorFile):
            errorInfo = pcssErrors.ErrorInfo(internalErrorFile)
            # NOTE(review): InternalException is referenced without a module
            # prefix -- confirm it is imported into this module's namespace.
            raise InternalException(
                "Got internal seq batch error %s\nin directory %s" %
                (errorInfo.msg, subDirName))
Пример #7
0
 def getLength(self):
     """Return target_end minus target_beg for this model, as an int.

     Both values are read with getAttributeValue() and converted with int().

     Raises pcssErrors.PcssGlobalException if target_end is not strictly
     greater than target_beg.
     """
     endPosition = int(self.getAttributeValue("target_end"))
     beginPosition = int(self.getAttributeValue("target_beg"))
     if endPosition > beginPosition:
         return endPosition - beginPosition
     raise pcssErrors.PcssGlobalException("Error in model table: model %s has target_end position %s "
                                          "before target_start position %s" % (self.getId(), endPosition, beginPosition))
Пример #8
0
    def trainAndApplyModel(self):
        """Train the training SVM, then classify the test SVM's peptides.

        The model is trained before the test set size is checked.

        Raises pcssErrors.PcssGlobalException if the test SVM holds more than
        one peptide.
        """
        self.trainingSvm.trainModel()

        testPeptideCount = len(self.testSvm.peptides)
        if testPeptideCount > 1:
            raise pcssErrors.PcssGlobalException("Error: Leave One Out Benchmarker has test set greater than size 1 (%s total)" % testPeptideCount)

        self.testSvm.classifySvm()
Пример #9
0
 def finalizeFeature(self, peptide, feature, referencePeptideLength):
     """Advance self.featureNumber to account for a peptide shorter than the reference.

     self.featureNumber is increased by
     (referencePeptideLength - peptide length) * feature.getFeatureLength().

     peptide -- object providing getPeptideLength() and startPosition
     feature -- object providing getFeatureLength()
     referencePeptideLength -- numeric reference length to compare against

     Raises pcssErrors.PcssGlobalException if the peptide is longer than the
     reference length.
     """
     peptideLength = peptide.getPeptideLength()
     if (peptideLength > referencePeptideLength):
         # referencePeptideLength is a number, not a callable: calling it here
         # used to raise TypeError in place of the intended exception.
         raise pcssErrors.PcssGlobalException("Peptide %s has length of %s which is greater than reference %s" % (peptide.startPosition,
                                                                                                                  peptideLength,
                                                                                                                  referencePeptideLength))
     lengthDifference = referencePeptideLength - peptideLength
     multiplier = feature.getFeatureLength()
     self.featureNumber += (lengthDifference * multiplier)
Пример #10
0
 def runSubprocess(self, args, checkStdError=True):
     """Run args with subprocess.Popen (no shell) and return communicate()'s result.

     Only stderr is piped, so the returned tuple is (None, stderrOutput).

     args -- argument list passed straight to subprocess.Popen
     checkStdError -- when true (the default), raise if anything was written
                      to stderr

     Raises pcssErrors.PcssGlobalException if checkStdError is true and the
     process produced stderr output.
     """
     process = subprocess.Popen(args, shell=False, stderr=subprocess.PIPE)
     processOutput = process.communicate()
     stdError = processOutput[1]
     # Use truthiness rather than `!= ""`: on Python 3 the captured stderr is
     # bytes, and b"" != "" is always true, which would raise for every
     # command even when it wrote nothing.
     if (checkStdError and stdError):
         raise pcssErrors.PcssGlobalException(
             "Got subprocess error.\nRan method args %s\nGot stderr %s" %
             (args, stdError))
     return processOutput
Пример #11
0
    def validatePeptideTrainingStatus(self, status):
        """Return status lower-cased if it is the positive or negative keyword.

        The lower-cased status is compared with getPositiveKeyword() and then
        getNegativeKeyword().

        Raises pcssErrors.PcssGlobalException if it matches neither.
        """
        status = status.lower()
        if status == self.getPositiveKeyword():
            return status
        if status == self.getNegativeKeyword():
            return status
        raise pcssErrors.PcssGlobalException(
            "Peptide status %s not valid status (needs to be %s, %s)" %
            (status, self.getPositiveKeyword(), self.getNegativeKeyword()))
Пример #12
0
 def readBenchmarkFile(self, fileName):
     """Read a benchmark file and append its score tuples to self._results.

     The first and last lines must pass checkBoundaryLines() with "0" and "1"
     respectively. Each processed line in between is split on whitespace into
     fpr, tpr and score (all floats); the score is passed to validateScore()
     and a ScoreTuple(fpr, tpr, score) is appended to self._results.

     Raises pcssErrors.PcssGlobalException if either boundary line check fails.
     """
     reader = pcssTools.PcssFileReader(fileName)
     lines = reader.getLines()
     firstLine = lines[0]
     lastLine = lines[-1]
     # The file name is now interpolated; both messages used to show a raw "%s".
     if (not self.checkBoundaryLines(firstLine, "0")):
         raise pcssErrors.PcssGlobalException("Expected benchmark file %s to have first line of 0\t0" % fileName)
     if (not self.checkBoundaryLines(lastLine, "1")):
         raise pcssErrors.PcssGlobalException("Expected benchmark file %s to have last line of 1\t1" % fileName)

     # NOTE(review): this slice leaves out the second-to-last line as well as
     # the two boundary lines -- confirm that is intended; lines[1:-1] would
     # keep it.
     for line in lines[1:len(lines) - 2]:
         cols = line.split()
         fpr = float(cols[0])
         tpr = float(cols[1])
         score = float(cols[2])
         self.validateScore(score)
         st = self.ScoreTuple(fpr, tpr, score)
         self._results.append(st)
Пример #13
0
 def createModelStyle(self, runName, values, pdh):
     """Build the model style object named by values['style'].

     Returns NewModelStyle(pdh) or OldModelStyle(pdh) according to the style
     name.

     Raises pcssErrors.PcssGlobalException (mentioning runName) for any other
     style name.
     """
     styleName = values['style']
     if styleName == 'NewModelStyle':
         return NewModelStyle(pdh)
     if styleName == 'OldModelStyle':
         return OldModelStyle(pdh)
     raise pcssErrors.PcssGlobalException(
         "Model run info file has run %s with invalid model style %s; "
         "please change to either 'NewModelStyle' or 'OldModelStyle'" %
         (runName, styleName))
Пример #14
0
 def sleepUntilDone(self, fileName, predicate):
     """Sleep until predicate(fileName) becomes false.

     Useful for avoiding race conditions in file manipulation. The predicate
     is polled once per second; a notice is printed before each sleep.

     Raises pcssErrors.PcssGlobalException once more than 10 one-second
     sleeps have elapsed with the predicate still true.
     """
     sleepTime = 0
     while (predicate(fileName)):
         # Parenthesized form prints the same text on Python 2 and also
         # parses on Python 3.
         print("sleep 1 second")
         time.sleep(1)
         sleepTime += 1
         if (sleepTime > 10):
             raise pcssErrors.PcssGlobalException("Timeout on file %s" %
                                                  fileName)
Пример #15
0
 def getSample(self, peptides, count):
     """Return int(count) peptides taken from the given pool.

     When the internal config value "make_random_test_set" is truthy the
     peptides are drawn with random.sample(); otherwise the first int(count)
     peptides are returned. A notice naming the mode is printed either way.

     Raises pcssErrors.PcssGlobalException if count exceeds len(peptides).
     """
     if (count > len(peptides)):
         raise pcssErrors.PcssGlobalException("getSample(): tried to sample %s peptides but there are only %s peptides in the pool" 
                                              % (count, len(peptides)))
     sampleSize = int(count)
     makeRandomSample = self.pcssRunner.internalConfig["make_random_test_set"]
     # print(...) with a single argument behaves the same on Python 2 and 3.
     if (makeRandomSample):
         print("RANDOM SAMPLE")
         return random.sample(peptides, sampleSize)
     # TODO: make internal config interpolation and test
     print("NON RANDOM SAMPLE")
     return peptides[0:sampleSize]
Пример #16
0
 def handleConfigError(self, results):
     """Report configuration validation failures and raise.

     Builds one message from every entry yielded by
     flatten_errors(self.pcssConfig, results): entries with a key are
     reported as a failed key, entries without one as a missing section.
     The message is printed and then raised.

     Always raises pcssErrors.PcssGlobalException.
     """
     msgParts = ["CONFIGURATION ERROR\n"]
     for (section_list, key, _) in flatten_errors(self.pcssConfig, results):
         sections = ', '.join(section_list)
         if key is not None:
             msgParts.append('The "%s" key in the section "%s" failed validation\n' % (
                 key, sections))
         else:
             msgParts.append('The following section was missing:%s ' % sections)
     msg = "".join(msgParts)
     # print(...) with a single argument behaves the same on Python 2 and 3.
     print(msg)
     raise pcssErrors.PcssGlobalException(msg)
Пример #17
0
 def trainModel(self):
     """Run the configured SVM training command on the training set file.

     The command name comes from the internal config key 'svm_train_command';
     the gamma and c values come from the pcss config keys
     "svm_training_gamma" and "svm_training_c". The command is launched via
     self.runner.pdh.runSubprocess().

     Raises pcssErrors.PcssGlobalException if the training set file does not
     exist.
     """
     runner = self.runner
     svmCommandName = runner.internalConfig['svm_train_command']
     trainingSetFileName = runner.pdh.getSvmTrainingSetFile()
     if not os.path.exists(trainingSetFileName):
         raise pcssErrors.PcssGlobalException("Did not find training set input file in expected location -- searched for\n%s" % trainingSetFileName)

     modelFileName = runner.pdh.getSvmNewModelFile()
     gammaFlag = runner.pcssConfig["svm_training_gamma"]
     cFlag = runner.pcssConfig["svm_training_c"]

     # Each flag and its value are separate list elements.
     commandArgs = [svmCommandName,
                    "-g", gammaFlag,
                    "-c", cFlag,
                    trainingSetFileName,
                    modelFileName]
     runner.pdh.runSubprocess(commandArgs)
Пример #18
0
 def validateColumnLine(self, annotationFile, line):
     """Validate the header line of an annotation file against the input attributes.

     The line must start with the niceName of the first column-sorted input
     attribute. After splitting on tabs, every attribute niceName must be
     among the columns and every column must be an attribute niceName.

     Raises pcssErrors.PcssGlobalException on the first violation found.
     """
     inputAttributes = self.pcssRunner.pfa.getColumnSortedInputAttributes()
     expectedFirstName = inputAttributes[0].niceName
     if not line.startswith(expectedFirstName):
         raise pcssErrors.PcssGlobalException(
             "Error: read annotation file %s\n. Expected first row to be column header "
             "(starting with %s) but didn't find it; instead got\n%s" %
             (annotationFile, expectedFirstName, line))

     columnNames = line.split('\t')
     attributeNames = [attribute.niceName for attribute in inputAttributes]

     # Every expected attribute must be present as a column ...
     for attributeName in attributeNames:
         if attributeName not in columnNames:
             raise pcssErrors.PcssGlobalException(
                 "Error: read annotation file %s\n. Expected input attribute %s but did not find it"
                 % (annotationFile, attributeName))

     # ... and no column may be unknown to the attributes file.
     for columnName in columnNames:
         if columnName not in attributeNames:
             raise pcssErrors.PcssGlobalException(
                 "Error: read annotation file %s\n. Read column header %s that wasn't specified in attributes file"
                 % (annotationFile, columnName))
Пример #19
0
    def classifySvm(self):
        """Run the configured SVM classify command on this object's input file.

        The command name comes from the internal config key
        'svm_classify_command'. It is launched through
        self.pcssRunner.pdh.runSubprocess() with the input file, the model
        file and the score output file, in that order.

        Raises pcssErrors.PcssGlobalException if the input file does not exist.
        """
        runner = self.pcssRunner
        svmCommandName = runner.internalConfig['svm_classify_command']

        classificationFileName = self.getSvmInputFile()
        if not os.path.exists(classificationFileName):
            raise pcssErrors.PcssGlobalException("Did not find test set file in expected location -- searched for\n%s" % classificationFileName)

        scoreFileName = self.getClassifyOutputFile()
        modelFile = self.getSvmModelFile()

        commandArgs = [svmCommandName,
                       classificationFileName,
                       modelFile,
                       scoreFileName]
        runner.pdh.runSubprocess(commandArgs)
Пример #20
0
 def getSingleResidueFeatureList(self, residueCode, featureNumber, seqList):
     """Return the "number:flag" feature string for a single residue.

     One entry is produced per residue in self.residueOrder, numbered
     featureNumber, featureNumber + 1, ... ; the flag is 1 where the entry
     equals residueCode and 0 elsewhere. Entries are joined with spaces.

     seqList is only used, joined together, in the error message.

     Raises pcssErrors.PcssGlobalException if residueCode does not occur in
     self.residueOrder.
     """
     flags = [1 if candidate == residueCode else 0
              for candidate in self.residueOrder]
     if 1 not in flags:
         raise pcssErrors.PcssGlobalException("Residue %s in sequence %s is not one of the 20 standard amino acids" 
                                              % (residueCode, "".join(seqList)))
     return " ".join("%s:%s" % (featureNumber + offset, flag)
                     for (offset, flag) in enumerate(flags))
Пример #21
0
    def getProteinValue(self, protein):
        """Return this attribute's output string for the given protein.

        The value comes from protein.getAttributeOutputString(self.name).
        When that is None, "" is returned unless self.outputOptional is
        False, in which case pcssErrors.PcssGlobalException is raised.
        """
        attributeValue = protein.getAttributeOutputString(self.name)
        if attributeValue is not None:
            return attributeValue

        # Only the exact value False marks the attribute as mandatory.
        if self.outputOptional is not False:
            return ""
        raise pcssErrors.PcssGlobalException(
            "Protein %s never set mandatory attribute %s" %
            (protein.modbaseSequenceId, self.name))
Пример #22
0
    def unzipFile(self, sourceFile):
        """Unzip sourceFile with gunzip; expects a .gz suffix.

        If the file named by sourceFile minus its ".gz" suffix does not exist
        yet, gunzip is run through runSubprocess() and sleepUntilDone() then
        waits, using the fileDoesNotExist predicate, on that result file.

        Raises pcssErrors.PcssGlobalException if sourceFile does not end with
        ".gz".
        """
        suffix = ".gz"
        if (not sourceFile.endswith(suffix)):
            #might end up having to change this later to include other filetypes, but this could wreak havoc
            #if result file isn't properly formatted
            raise pcssErrors.PcssGlobalException(
                "Attempted to unzip file %s that does not end with '.gz'" %
                sourceFile)
        # Slice the suffix off. str.rstrip(".gz") strips any trailing run of the
        # characters '.', 'g' and 'z', so "log.gz" became "lo" rather than "log".
        resultFile = sourceFile[:-len(suffix)]
        if (not os.path.exists(resultFile)):
            #should never already exist since we wouldn't be here if it did, but another process could possibly have put it here
            #there is still somewhat of a race condition though as this check could return false and another process could put
            #the unzipped file in immediately after, but chances are extremely low
            self.runSubprocess(['gunzip', sourceFile])

            self.sleepUntilDone(resultFile, predicate=self.fileDoesNotExist)
Пример #23
0
    def readProteinSequences(self, fastaFileName):
        """Attach sequences from a FASTA file to the proteins in self.proteins.

        Each record id must have the form "<first>|<second>"; the first part
        is looked up in self.proteins and, when present, that protein's
        sequence is set from the record.

        Raises pcssErrors.PcssGlobalException if any protein in self.proteins
        still has proteinSequence of None afterwards.
        """
        # The context manager closes the file even if parsing raises;
        # previously the handle leaked whenever the check below raised,
        # because close() came after it.
        with open(fastaFileName, 'r') as fh:
            for seqRecord in SeqIO.FastaIO.FastaIterator(fh):
                # Unpacking also enforces that the id has exactly two
                # '|'-separated parts; only the first one is used.
                [modbaseId, uniprotId] = seqRecord.id.split('|')
                if (modbaseId in self.proteins):
                    protein = self.proteins[modbaseId]
                    protein.setProteinSequence(seqRecord.seq)

        for protein in self.proteins.values():
            if (protein.proteinSequence is None):
                raise pcssErrors.PcssGlobalException(
                    "Protein %s has no sequence set" %
                    protein.modbaseSequenceId)
Пример #24
0
 def makePeptideFromCode(self, peptideCode, modbaseSeqId):
     """Build a PcssPeptide from a "peptideStart_peptideSequence_status" code.

     Returns None when the sequence part equals the internal config value
     "keyword_peptide_sequence_mismatch". Otherwise the status is validated
     with pcssRunner.validatePeptideCodeStatus(), the peptide is created
     spanning peptideStart .. peptideStart + len(sequence) - 1, and the
     validated status is stored on it as the string attribute "status".

     Raises pcssErrors.PcssGlobalException if the code does not consist of
     exactly three '_'-separated parts.
     """
     codeParts = peptideCode.split('_')
     if len(codeParts) != 3:
         raise pcssErrors.PcssGlobalException(
             "Peptide code %s from protein %s is not proper form of peptideStart_peptideSequence_status"
             % (peptideCode, modbaseSeqId))
     peptideStart, peptideSequence, status = codeParts

     mismatchKeyword = self.pcssRunner.internalConfig["keyword_peptide_sequence_mismatch"]
     if peptideSequence == mismatchKeyword:
         return None

     status = self.pcssRunner.validatePeptideCodeStatus(status, peptideCode)
     startPosition = int(peptideStart)
     endPosition = startPosition + len(peptideSequence) - 1
     peptide = pcssPeptide.PcssPeptide(
         peptideSequence, startPosition, endPosition, self.pcssRunner)
     peptide.addStringAttribute("status", status)
     return peptide
Пример #25
0
 def createTrainingAndTestSets(self, peptides):
     """Split peptides into a one-peptide test set and a training set.

     The peptide at index self.currentPeptidePosition becomes the test set of
     self.testSvm; all other peptides become the training set of
     self.trainingSvm. Both SVM input files are then written and
     self.currentPeptidePosition is advanced by one.

     Raises pcssErrors.PcssGlobalException if self.currentPeptidePosition is
     not smaller than len(peptides).
     """
     trainingPeptideList = []
     if (self.currentPeptidePosition >= len(peptides)):
         msg = "Error: Leave one out benchmarker internal peptide counter (%s) must be smaller than input peptide set count (%s)" % (self.currentPeptidePosition,
                                                                                                                                     len(peptides))
         raise pcssErrors.PcssGlobalException(msg)
     for (i, peptide) in enumerate(peptides):
         if (i == self.currentPeptidePosition):
             self.testSvm.setPeptides([peptide])
             # print(...) with one pre-formatted argument behaves the same on
             # Python 2 and 3.
             print("next test set peptide position %s status %s" % (peptide.startPosition, peptide.getAttributeOutputString("status")))
         else:
             trainingPeptideList.append(peptide)
     self.trainingSvm.setPeptides(trainingPeptideList)
     self.trainingSvm.writeTrainingSetFile()
     self.testSvm.writeClassificationFile()
     self.currentPeptidePosition += 1
Пример #26
0
    def initFromModelTableLine(self, line, modelTableColumns):
        """Initialize this model from one tab-delimited model table line.

        The column at position i of the line is stored with setAttribute()
        under the name modelTableColumns.getColumnName(i).

        Raises pcssErrors.PcssGlobalException if the line does not have
        exactly modelTableColumns.getColumnCount() columns.
        """
        cols = line.split('\t')
        expectedCount = modelTableColumns.getColumnCount()
        if expectedCount != len(cols):
            raise pcssErrors.PcssGlobalException("Model table column order file contains a "
                                                 "different number of columns (%s) than model table (%s)\nline: %s" % 
                                                 (expectedCount, len(cols), line))

        for position, value in enumerate(cols):
            self.setAttribute(modelTableColumns.getColumnName(position), value)
Пример #27
0
    def prepareTrainingBenchmarkRun(self):
        """Write the config file used for a training benchmark run.

        A deep copy of the runner's pcssConfig is given the full output path
        of the annotation output file as "input_annotation_file_name", its
        filename is set to the full output path of the training benchmark
        config file, and it is written with write(). The runner's own config
        is not modified.

        Raises pcssErrors.PcssGlobalException if the annotation output file
        does not exist.
        """
        runner = self.pcssRunner
        pcssCopy = copy.deepcopy(runner.pcssConfig)
        # print(...) with one pre-formatted argument behaves the same on
        # Python 2 and 3.
        print("running cluster with output file %s" % runner.pdh.getFullOutputFile(
            ""))
        inputAnnotationFileName = runner.pdh.getFullOutputFile(
            runner.internalConfig["annotation_output_file"])
        if (not os.path.exists(inputAnnotationFileName)):
            msg = "Did not find input annotation file name %s\n" % inputAnnotationFileName
            msg += "Please make sure this is file is in the run directory for this training benchmark run"
            raise pcssErrors.PcssGlobalException(msg)

        # Reuse the path checked above rather than computing it a second time.
        pcssCopy["input_annotation_file_name"] = inputAnnotationFileName
        pcssCopy.filename = runner.pdh.getFullOutputFile(
            runner.internalConfig["training_benchmark_config_file"])
        pcssCopy.write()
Пример #28
0
 def makeSvmFileLine(self):
     """Return this peptide's features as one space-joined SVM input string.

     A new pcssSvm.SvmFeatureHandler is created and stored on self.svmHandler.
     For each feature name from pcssRunner.getSvmFeatureOrder():
       * an attribute that is uninitialized, or whose output string is a
         peptide error value, goes to the handler's processEmptyFeature();
       * any other attribute contributes makeSvmFeature(handler) to the
         result;
       * in both cases the handler's finalizeFeature() is called afterwards.

     Raises pcssErrors.PcssGlobalException if the peptide lacks one of the
     listed features.
     """
     svmHandler = pcssSvm.SvmFeatureHandler()
     self.svmHandler = svmHandler
     featureStrings = []

     for featureName in self.pcssRunner.getSvmFeatureOrder():
         if not self.hasAttribute(featureName):
             raise pcssErrors.PcssGlobalException("Error: peptide tried to make svm feature for %s but does not have this feature" % featureName)

         attribute = self.getAttribute(featureName)
         # The output string is only inspected for initialized attributes.
         isEmpty = (not attribute.isInitialized()
                    or pcssTools.isPeptideErrorValue(self.getAttributeOutputString(featureName)))
         if isEmpty:
             svmHandler.processEmptyFeature(self.pcssRunner.getPeptideLength(), attribute)
         else:
             featureStrings.append(attribute.makeSvmFeature(svmHandler))
         svmHandler.finalizeFeature(self, attribute, self.pcssRunner.getPeptideLength())

     return " ".join(featureStrings)
                                  
Пример #29
0
    def getTrainingBenchmarkConfig(self):
        """Return a copy of pcssConfig adjusted for a cluster training benchmark run.

        In the deep copy, "pcss_directory" and "run_directory" are set from
        getPcssClusterBaseDirectory() and getClusterRunDirectory(), and
        "input_annotation_file_name" is set to the full cluster output path
        of the annotation output file. self.pcssConfig is left untouched.

        Raises pcssErrors.PcssGlobalException if the annotation output file
        is not found at its full (non-cluster) output path.
        """
        annotationFileName = self.internalConfig["annotation_output_file"]

        clusterConfig = copy.deepcopy(self.pcssConfig)
        clusterConfig["pcss_directory"] = self.getPcssClusterBaseDirectory()
        clusterConfig["run_directory"] = self.getClusterRunDirectory()

        # Existence is checked on the local path; the config gets the cluster path.
        inputAnnotationFileName = self.getFullOutputFile(annotationFileName)
        if not os.path.exists(inputAnnotationFileName):
            msg = "".join([
                "Did not find input annotation file name %s\n" % inputAnnotationFileName,
                "Please make sure this is file is in the run directory for this training benchmark run",
            ])
            raise pcssErrors.PcssGlobalException(msg)

        clusterConfig["input_annotation_file_name"] = self.getFullClusterOutputFile(
            annotationFileName)
        return clusterConfig
Пример #30
0
    def getPeptideValue(self, peptide):
        """Return this attribute's value for the given peptide.

        For attributes of type 'model' the value is read from
        peptide.bestModel with getAttributeValue(), or "" when there is no
        best model. For all other attributes it is the peptide's
        getAttributeOutputString(); when that is None, "" is returned unless
        self.outputOptional is False, in which case
        pcssErrors.PcssGlobalException is raised.
        """
        if self.attributeType == 'model':
            bestModel = peptide.bestModel
            if bestModel is None:
                return ""
            return bestModel.getAttributeValue(self.name)

        attributeValue = peptide.getAttributeOutputString(self.name)
        if attributeValue is not None:
            return attributeValue

        # Only the exact value False marks the attribute as mandatory.
        if self.outputOptional is not False:
            return ""
        raise pcssErrors.PcssGlobalException(
            "Peptide %s never set mandatory attribute %s" %
            (peptide.startPosition, self.name))