def ParseFeatureFile(self, FilePath, FeatureSet2, FeatureSet3, DBRatio):
    """
    Populate FeatureSet2 and FeatureSet3 with the instances listed in the
    tab-delimited feature file at FilePath.

    The first line of the file is the header: it is appended to
    self.HeaderLines, and each column from FormatBits.FirstFeature onward
    is stored in self.FeatureNames under its zero-based feature index.
    Lines starting with "#" are appended to self.HeaderLines as well.
    Blank lines, and lines whose FormatBits.TrueProteinFlag column is
    missing or not an integer, are skipped.

    Every other line becomes a Learning.FeatureVector holding one float per
    column in FormatBits.FirstFeature..FormatBits.LastFeature (0 where the
    column is absent, blank, or the literal "None").  Vectors with charge
    greater than 2 go to FeatureSet3, all others to FeatureSet2.  Within the
    chosen set the vector is appended to TrueVectors or FalseVectors
    according to its true-protein flag, and always to AllVectors.

    Arguments:
      FilePath    - path of the feature file to read
      FeatureSet2 - feature set receiving instances with charge <= 2
      FeatureSet3 - feature set receiving instances with charge > 2
      DBRatio     - handed to GetPriorProbabilityFalse() on both sets

    Raises whatever int() / list indexing raise when the charge column of
    an otherwise valid line is missing or malformed.
    """
    # "with" guarantees the handle is released even if a line below raises.
    with open(FilePath, "rb") as File:
        # Parse the header line specially:
        HeaderLine = File.readline()
        self.HeaderLines.append(HeaderLine)
        Bits = HeaderLine.strip().split("\t")
        for BitIndex in range(FormatBits.FirstFeature, len(Bits)):
            self.FeatureNames[BitIndex - FormatBits.FirstFeature] = Bits[BitIndex]
        # Iterate over non-header lines.  LineNumber counts lines AFTER the
        # header, starting at 1 (comment and blank lines are counted too).
        for (LineNumber, FileLine) in enumerate(File, 1):
            if FileLine[0] == "#":
                self.HeaderLines.append(FileLine)
                continue # skip comment line
            if not FileLine.strip():
                continue # skip blank line
            Bits = FileLine.replace("\r", "").replace("\n", "").split("\t")
            # If there are TOO MANY bits, then discard the extras:
            Bits = Bits[:FormatBits.LastFeature + 1]
            try:
                TrueFlag = int(Bits[FormatBits.TrueProteinFlag])
            except (IndexError, ValueError):
                continue # skip; not a valid instance line
            Charge = int(Bits[FormatBits.Charge])
            # The value is not needed here, but the lookup is kept so that a
            # row too short to carry this column still raises IndexError now
            # instead of later, when FileBits is indexed by this column.
            Bits[FormatBits.SisterAnnotationFlag]
            if Charge > 2:
                FeatureSet = FeatureSet3
            else:
                FeatureSet = FeatureSet2
            Vector = Learning.FeatureVector()
            # Bound up front so the error report below can always name it.
            FeatureIndex = None
            try:
                for FeatureBitIndex in range(FormatBits.FirstFeature, FormatBits.LastFeature + 1):
                    FeatureIndex = FeatureBitIndex - FormatBits.FirstFeature
                    if FeatureBitIndex < len(Bits) and Bits[FeatureBitIndex].strip() and Bits[FeatureBitIndex] != "None":
                        Vector.Features.append(float(Bits[FeatureBitIndex]))
                    else:
                        Vector.Features.append(0)
                Vector.FileBits = Bits
                Vector.TrueFlag = TrueFlag
                if TrueFlag:
                    FeatureSet.TrueVectors.append(Vector)
                else:
                    FeatureSet.FalseVectors.append(Vector)
                FeatureSet.AllVectors.append(Vector)
            except Exception:
                # Best effort: report the bad line and move on to the next
                # one.  KeyboardInterrupt / SystemExit are NOT swallowed.
                traceback.print_exc()
                print("** Error on line %s column %s of feature file" % (LineNumber, FeatureIndex))
                print(Bits)
    # Initialize counts:
    for FeatureSet in (FeatureSet2, FeatureSet3):
        FeatureSet.SetCounts()
        FeatureSet.GetPriorProbabilityFalse(DBRatio)
    print("CHARGE 1,2: Read in %s true and %s false vectors" % (FeatureSet2.TrueCount, FeatureSet2.FalseCount))
    print("CHARGE 3+: Read in %s true and %s false vectors" % (FeatureSet3.TrueCount, FeatureSet3.FalseCount))
def TrainFacultative(self):
    """
    Train paired models for CONSTITUTIVE ("always") and FACULTATIVE
    ("sometimes") PTMs.

    Steps, in order:
      1. Train self.Model on self.TrainingSetAll and score that same set;
         these scores serve only as an oracle for step 2.
      2. Collect the vectors whose FormatBits.SisterAnnotationFlag column is
         non-empty, sort them by score, and build a new feature set from
         copies of the lowest-scoring chunk (labelled false) and the
         highest-scoring chunk (labelled true).  A chunk is a quarter of
         the list, capped at 1000 vectors.
      3. Write that set to "FacultativeTrainingSet.txt" in the current
         working directory.
      4. Train, test and report self.FacModel on it.
      5. Split self.TrainingSetAll into self.FeatureSetF (sister-annotation
         column set) and self.FeatureSetC (all others); score FeatureSetF
         with self.FacModel, then train, test and report self.ConModel on
         FeatureSetC.
      6. If self.WriteModelFilePath is set, save the two models next to it
         with the extensions ".con" and ".fac".
      7. Write the header lines and every vector, with its score and
         p-value filled in, to self.OutputFeaturePath.
    """
    # Train a model on all PTMs, to get initial scores for all PTMs.
    # Its output is used only to provide an ORACLE for the facultative model:
    print("TRAIN model on all features:")
    self.Model.Train(self.TrainingSetAll)
    print("SCORE all features:")
    self.Model.Test(self.TrainingSetAll)
    ##############################################################
    print("Generate SUB-MODEL of only facultative features:")
    # Sort facultative instances by score.  Sorting on the score alone (a
    # stable sort) keeps tied vectors in file order and never falls back to
    # comparing the vector objects themselves.
    SortedVectors = [Vector for Vector in self.TrainingSetAll.AllVectors
                     if Vector.FileBits[FormatBits.SisterAnnotationFlag]]
    SortedVectors.sort(key=lambda Vector: Vector.Score)
    FacFeatureSet = Learning.FeatureSetClass()
    ChunkSize = min(len(SortedVectors) // 4, 1000)
    print("Sorted list of %s facultative features, chunk size is %s" % (len(SortedVectors), ChunkSize))
    LowScoring = SortedVectors[:ChunkSize]
    # Slice from an explicit start index: [-ChunkSize:] would return the
    # WHOLE list when ChunkSize is 0 (fewer than four facultative vectors).
    HighScoring = SortedVectors[len(SortedVectors) - ChunkSize:]
    for (ChunkVectors, ChunkTrueFlag) in ((LowScoring, 0), (HighScoring, 1)):
        for Vector in ChunkVectors:
            # Copy, so relabelling does not disturb the original vector.
            NewVector = Learning.FeatureVector()
            NewVector.FileBits = Vector.FileBits[:]
            NewVector.Features = Vector.Features[:]
            NewVector.TrueFlag = ChunkTrueFlag
            FacFeatureSet.AllVectors.append(NewVector)
            if ChunkTrueFlag:
                FacFeatureSet.TrueVectors.append(NewVector)
            else:
                FacFeatureSet.FalseVectors.append(NewVector)
    FacFeatureSet.SetCounts()
    FacFeatureSet.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
    ##############################################################
    # Write out the FACULTATIVE feature set:
    with open("FacultativeTrainingSet.txt", "wb") as FacTrainingFile:
        for HeaderLine in self.HeaderLines:
            FacTrainingFile.write(HeaderLine)
        for Vector in FacFeatureSet.AllVectors:
            Bits = Vector.FileBits[:]
            # Overwrite the flag column with the oracle-derived label.
            if Vector.TrueFlag:
                Bits[FormatBits.TrueProteinFlag] = "1"
            else:
                Bits[FormatBits.TrueProteinFlag] = "0"
            FacTrainingFile.write("\t".join(Bits) + "\n")
    ##############################################################
    # Train the sub-model:
    self.FacModel = self.GetModelObject(self.FeaturesF)
    self.FacModel.Train(FacFeatureSet)
    self.FacModel.Test(FacFeatureSet)
    self.FacModel.ReportAccuracy(FacFeatureSet) # invokes ComputeOddsTrue
    ##############################################################
    # Apply the trained fac-model to *all* facultative features, and
    # train an overall model on all *constitutive* features:
    self.FeatureSetC = Learning.FeatureSetClass()
    self.FeatureSetF = Learning.FeatureSetClass()
    for Vector in self.TrainingSetAll.AllVectors:
        if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
            FeatureSet = self.FeatureSetF
        else:
            FeatureSet = self.FeatureSetC
        FeatureSet.AllVectors.append(Vector)
        if Vector.TrueFlag:
            FeatureSet.TrueVectors.append(Vector)
        else:
            FeatureSet.FalseVectors.append(Vector)
    self.FeatureSetC.SetCounts()
    self.FeatureSetF.SetCounts()
    self.FeatureSetC.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
    self.FeatureSetF.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
    # Score facultative-feature, using facultative-model:
    self.FacModel.Test(self.FeatureSetF)
    # Train constitutive-ONLY model, and score constitutive features:
    self.ConModel = self.GetModelObject(self.FeaturesC)
    self.ConModel.Train(self.FeatureSetC)
    self.ConModel.Test(self.FeatureSetC)
    self.ConModel.ReportAccuracy(self.FeatureSetC) # to invoke ComputeOddsTrue
    ##############################################################
    # Save our models:
    if self.WriteModelFilePath:
        (Stub, Extension) = os.path.splitext(self.WriteModelFilePath)
        ConModelPath = "%s.con" % Stub
        FacModelPath = "%s.fac" % Stub
        self.ConModel.SaveModel(ConModelPath)
        self.FacModel.SaveModel(FacModelPath)
    ##############################################################
    # Write out the scored features.  "with" makes sure the output is
    # flushed and closed before this method returns.
    with open(self.OutputFeaturePath, "wb") as OutputFile:
        for Line in self.HeaderLines:
            OutputFile.write(Line)
        for Vector in self.TrainingSetAll.AllVectors:
            # The p-value comes from whichever model scored this vector.
            if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
                PValue = self.FacModel.GetPValue(Vector.Score)
            else:
                PValue = self.ConModel.GetPValue(Vector.Score)
            # Pad short rows so the score / p-value columns exist.
            while len(Vector.FileBits) <= FormatBits.ModelPValue:
                Vector.FileBits.append("")
            Vector.FileBits[FormatBits.ModelScore] = str(Vector.Score)
            Vector.FileBits[FormatBits.ModelPValue] = str(PValue)
            OutputFile.write("\t".join(Vector.FileBits) + "\n")