def setPhosPhoSites(self, listOfPsites):
    """
    Set one or more sites on your sequence which can be phosphorylated.

    Positions index from 1 (like all of bioinformatics) and not from 0
    (like all of computer science), i.e. in "KKKYKKK" the Y is at
    position 4. Internally positions are stored zero-based in
    self.phosphosites, but callers never need to worry about that.

    All data validation for the phosphosite list is done in this function:

      * positions outside the sequence are skipped with a warning
      * positions which are not S, T or Y are skipped with a warning
      * positions already registered are silently ignored

    INPUT:
    listOfPsites | A single integer position, or an iterable of values
                   which can each be converted with int()

    Raises ValueError/TypeError if an entry cannot be converted to an
    integer.
    """

    # if we passed a single value not in a list then convert to a list of
    # length 1
    if isinstance(listOfPsites, int):
        listOfPsites = [listOfPsites]

    # evaluate proposed phosphosites
    for site in listOfPsites:

        # check we can convert to an integer!
        site = int(site)

        # python indexes from 0 but humans from 1
        idx = site - 1

        # if we're outside our sequence skip this site entirely - carrying
        # on would either raise an IndexError or (for site 0) silently wrap
        # around to the last residue
        if idx >= len(self.seq) or idx < 0:
            warning_message("Proposed phosphosite (" + str(site) + ") is outside sequence range. Skipping...")
            continue

        # grab the residue letter from the sequence
        res = self.seq[idx]
        status_message("Setting " + res + str(site))

        if res not in ("S", "T", "Y"):
            # we skip it if it seems like an unphosphorylatable residue
            warning_message(
                'Position ' + str(site) + ' in sequence is a non phosphorylatable residue [' + str(res) + ']')
        elif idx not in self.phosphosites:
            # don't add the same residue twice, but no need to warn about
            # it when we see a duplicate
            self.phosphosites.append(idx)
def parseSeqFile(self, filename, silent=False):
    """
    Stateless parsing of a sequence file into a single, unbroken string.

    Blank lines are ignored, at most one header line (starting with ">")
    is tolerated, and every other line is passed through the object's
    per-line validator before being appended to the sequence. The
    assembled sequence is then passed through the final validation step.

    INPUT:
    filename | Name of a file to parse (string)
    silent   | If False (default) the parsed sequence is reported via
               status_message; if True nothing is reported

    OUTPUT:
    The value returned by the final validation of the assembled sequence

    Raises SequenceFileParserException if a second header line is found.
    """

    # slurp the whole file before we start looking at it
    with open(filename) as handle:
        raw_lines = list(handle)

    seen_header = False
    fragments = []

    for raw in raw_lines:
        text = raw.strip()

        # nothing on this line
        if not text:
            continue

        if text.startswith(">"):
            # a second header means more than one sequence in the file
            if seen_header:
                raise SequenceFileParserException(
                    "\n\nERROR: During parsing of sequence file found a second header section. Sequence files must be a single file")
            seen_header = True
            continue

        # sequence line - validate (raises if something is bad) and keep it
        fragments.append(self.__validSeq(text))

    seq = self.__final_validation("".join(fragments))

    if not silent:
        status_message(
            "Parsed sequence [" + str(len(seq)) + " residues]:\n" + seq)

    return seq
def parseSeqFile(self, filename, silent=False):
    """
    Stateless parsing of a sequence file into a single, unbroken string.

    Blank lines are ignored, at most one header line (starting with ">")
    is tolerated, and every other line is passed through the object's
    per-line validator before being appended to the sequence.

    INPUT:
    filename | Name of a file to parse (string)
    silent   | If False (default) the parsed sequence is reported via
               status_message; if True nothing is reported

    OUTPUT:
    The concatenation of all validated sequence lines (empty string if
    the file holds no sequence lines)

    Raises SequenceFileParserException if a second header line is found.
    """

    # read file to end before processing anything
    with open(filename) as source:
        every_line = list(source)

    header_found = False
    pieces = []

    for entry in every_line:
        stripped = entry.strip()

        # skip lines with no content
        if not stripped:
            continue

        if stripped.startswith(">"):
            # only one header (i.e. one sequence) is allowed per file
            if header_found:
                raise SequenceFileParserException(
                    "\n\nERROR: During parsing of sequence file found a second header section. Sequence files must be a single file")
            header_found = True
            continue

        # validate the line (raises exception if something is bad) and
        # stash it for joining at the end
        pieces.append(self.__validSeq(stripped))

    seq = "".join(pieces)

    if not silent:
        status_message(
            "Parsed sequence [" + str(len(seq)) + " residues]:\n" + seq)

    return seq
def validateSequence(self, seq):
    """
    Validate a protein sequence and return it with whitespace removed.

    Every character must either be a key of data.aminoacids.ONE_TO_THREE
    or whitespace. Whitespace is dropped (with a single status message the
    first time it is seen); anything else aborts validation.

    If more than 15% of the retained residues are proline a warning is
    issued. A sequence with no residues at all (empty or whitespace only)
    is returned as an empty string without the proline check, rather than
    failing with a ZeroDivisionError.

    INPUT:
    seq | Iterable of single-character strings (normally a string)

    OUTPUT:
    The validated sequence as a string

    Raises SequenceException on the first invalid character, reporting its
    1-based position in the input.
    """

    AAs = data.aminoacids.ONE_TO_THREE.keys()
    kept = []
    messageWarned = False

    # for each residue in your protein sequence (positions reported from 1)
    for pos, residue in enumerate(seq, 1):
        if residue in AAs:
            kept.append(residue)
        elif residue.isspace():
            # only warn once...
            if not messageWarned:
                status_message("Removing whitespace from sequence")
                messageWarned = True
        else:
            # unexpected residue/character - bail
            raise SequenceException(
                "Invalid amino acid [" + str(residue) + "] found at position " + str(pos))

    processed = "".join(kept)

    # determine proline content and warn if over 15% - guarded so that an
    # empty sequence does not trigger a division by zero
    if processed:
        prolineContent = float(processed.count("P")) / float(len(processed))
        if prolineContent > 0.15:
            warning_message(
                "This sequence has a proline content of greater than 15%.\nThis may render some analyses [notably kappa and phase diagram predictions] incorrect")

    return processed
def __set_numeric(self, keyword, value):
    """
    Store a numeric keyfile value in the KEYWORDS dictionary.

    An empty string means the keyword was not set, in which case the
    entry from DEFAULT_VALS is used. Any other value must be convertible
    with float(); it is stored exactly as passed in (not as the converted
    float). A value which cannot be converted is NOT replaced by the
    default - we want to know when things are going wrong.

    Raises KeyFileException if float(value) raises ValueError.
    """

    if value == "":
        fallback = DEFAULT_VALS[keyword]
        status_message("Setting " + keyword + " to default [" + str(fallback) + "]")
        self.KEYWORDS[keyword] = fallback
        return

    try:
        # conversion is only a validity check, the result is discarded
        float(value)
        status_message("Setting " + keyword + " to keyfile defined [" + str(value) + "]")
        self.KEYWORDS[keyword] = value
    except ValueError:
        raise KeyFileException(
            "\n\nERROR: Invalid value for " + keyword + " - unable to convert [" + value + "] into a number\n")
def swapRandChargeRes(self, frozen=None):
    """
    Randomly select two residues of different charge type and swap them.

    Residues are grouped by the sign of their entry in self.chargePattern
    (positive, negative, neutral). Indices listed in `frozen` are never
    selected. If fewer than two charge types remain available no swap can
    change kappa, so a status message is issued and self is returned.

    INPUT:
    frozen | Optional iterable of indices which must not be swapped
             (default: nothing is frozen)

    OUTPUT:
    The return value of self.swapRes(i, j) for the two chosen indices, or
    self if no useful swap is possible.
    """

    # build a fresh set on every call - avoids a shared mutable default and
    # lets callers pass any iterable of indices
    frozen = set() if frozen is None else set(frozen)

    # get a random number generator
    # NOTE(review): assuming rng is the stdlib random module, Random()
    # already seeds itself; the explicit wall-clock seed is kept as-is -
    # confirm before removing
    rand = rng.Random()
    rand.seed(time.time())

    # determine the indices from which we can swap, keyed by charge type
    # (1 = positive, 2 = negative, 3 = neutral), excluding frozen residues
    pattern = self.chargePattern
    pools = {
        1: set(np.where(pattern > 0)[0]) - frozen,
        2: set(np.where(pattern < 0)[0]) - frozen,
        3: set(np.where(pattern == 0)[0]) - frozen,
    }
    available = [kind for kind in (1, 2, 3) if pools[kind]]

    if len(available) < 2:
        status_message(
            'swap will not change kappa, only one charge type in sequence')
        return self

    if len(available) == 2:
        # exactly two types left - no choice to make
        chargeType = available
    else:
        chargeType = rand.sample([1, 2, 3], 2)

    # choose from a sorted list: sampling directly from a set is rejected
    # by random.sample on recent Python versions
    first = rand.choice(sorted(pools[chargeType[0]]))
    second = rand.choice(sorted(pools[chargeType[1]]))

    return self.swapRes(first, second)
def print_progress(count, total):
    """
    Report progress through status_message on every 50th item.

    Nothing is reported unless count is an exact multiple of 50.
    """
    if (count % 50) != 0:
        return

    status_message("Done " + str(count) + " of " + str(total))
def parse_keyfile(self, filename):
    """
    Parse a keyfile into self.KEYWORDS and validate every expected value.

    PHASE 1 reads the file line by line. Blank lines and lines starting
    with "#" are skipped, anything after an inline "#" is discarded, and
    the remainder must be "<KEYWORD> <value>" separated by any amount of
    whitespace (spaces or tabs). Keywords not already present in
    self.KEYWORDS are ignored with a warning.

    PHASE 2 walks KEYWORD_LIST and validates each stored value:

      * SEQFILE must name an existing file; it is parsed and the result
        stored under the SEQUENCE keyword
      * OUTDIR must be given; it is created if missing, and a warning is
        issued if it exists and is not empty
      * FREEZE_FILE is optional but must name an existing file if given
      * WL_TYPE falls back to its default if empty and must otherwise be
        one of WL_TYPES
      * the numeric keywords are handled by __set_numeric

    INPUT:
    filename | Name of the keyfile to parse (string)

    Raises KeyFileException for any malformed or invalid entry. An OSError
    raised while creating the output directory is reported on screen and
    re-raised.
    """

    status_message("Parsing keyfile...")
    status_message("---------------------------------------")

    # create a sequence file parsing object
    SeqFileParser = SequenceFileParser()

    # read file to end
    with open(filename) as filehandle:
        content = filehandle.readlines()

    # [PHASE 1 START]
    # PARSE THE KEYFILE
    for line in content:
        line = line.strip()

        # if empty line
        if len(line) == 0:
            continue

        # comments in the keyfile
        if line[0] == "#":
            continue

        # if there is an inline comment kill everything after the comment
        # character, then remove any trailing whitespace left behind
        line = line.split("#")[0].strip()

        # split on any run of whitespace so that tabs or repeated spaces
        # between the keyword and its value are accepted
        line_list = line.split()
        key = line_list[0]

        # check if the first token matches one of the predefined KEYWORDS
        if key in self.KEYWORDS:

            # a keyword followed by exactly one value is what we expect
            if len(line_list) == 2:
                self.KEYWORDS[key] = line_list[1]

            # no value, or more than one whitespace separated string after
            # the keyword - we basically fail at this
            else:
                raise KeyFileException("Error: Found keyword " + str(key) + " but unable to parse associated value")
        else:
            warning_message(
                "Found unexpected keyword [" + str(key) + "] - ignorning...")

    # Now add default for the sequence, which will hopefully be set in the
    # next section by parsing the sequence file
    self.KEYWORDS["SEQUENCE"] = ""
    # [PHASE 1 END]

    status_message("---------------------------------------")
    status_message("Keyfile parsed!\n")
    status_message("Validating keyfile contents")
    status_message("---------------------------------------")

    # keywords whose values only need to pass the numeric check
    numeric_keywords = ("BIN_MIN", "BIN_MAX", "NUMBER_OF_BINS",
                        "FLATCHECK_FREQ", "CONVERGENCE",
                        "FLATNESS_CRITERION")

    # Having parsed the keyfile we now validate it so we don't have to
    # worry about validation later on
    # [PHASE 2 START]
    for keyword in KEYWORD_LIST:

        # extract the value associated with each keyword in turn
        value = self.KEYWORDS[keyword]

        ##
        # SEQUENCE FILE VALIDATION AND PARSING
        ##
        if keyword == "SEQFILE":
            if value == "":
                raise KeyFileException(
                    "ERROR: No sequence file provided in keyfile (expecting keyword [SEQFILE])")

            if not os.path.isfile(value):
                raise KeyFileException(
                    "Expected " + str(value) + " to be file")

            # if its a file lets try and extract a sequence from it!
            self.KEYWORDS["SEQUENCE"] = SeqFileParser.parseSeqFile(value)

            # if we get here we *should* now have a sequence...
            if self.KEYWORDS["SEQUENCE"] == "" or self.KEYWORDS["SEQUENCE"] is None:
                raise KeyFileException(
                    "ERROR: No sequence was parsed from the sequence file...")

        ##
        # OUTPUT DIRECTORY VALIDATION
        ##
        elif keyword == "OUTDIR":
            if value == "":
                raise KeyFileException(
                    "ERROR: No output directory provided in keyfile (expecting keyword [OUTDIR]")

            # creates the output directory if it doesn't already exist
            if not os.path.exists(value):
                status_message(
                    "Creating output directory " + str(value))
                try:
                    os.makedirs(value)
                except OSError:
                    # single-argument print(...) behaves identically as a
                    # Python 2 statement and a Python 3 function call
                    print("----------------------------")
                    print("")
                    print("ERROR Creating output directory - do you have permission to create the directory [" + str(value) + "]")
                    print("")
                    print("----------------------------")
                    # bare raise keeps the original traceback
                    raise

            # if it does exist and is not empty raise a quick warning (an
            # empty directory which already exists is just fine)
            elif len(os.listdir(value)) > 0:
                warning_message(
                    "Output directory exists already and is not empty [RISK OF OVERWRITING!]")

        ##
        # FREEZE FILE VALIDATION
        ##
        elif keyword == "FREEZE_FILE":
            # no freeze file, no problem
            if value != "":
                if not os.path.isfile(value):
                    raise KeyFileException(
                        "Expected " + str(value) + " to be file")
                status_message("Using freeze file")
                self.KEYWORDS[keyword] = value

        ##
        # WL TYPE
        ##
        elif keyword == "WL_TYPE":
            if value == "":
                self.KEYWORDS[keyword] = DEFAULT_VALS[keyword]
                status_message(
                    "Setting WL type to default [" + str(DEFAULT_VALS[keyword]) + "]")
            elif value in WL_TYPES:
                self.KEYWORDS[keyword] = value
                status_message(
                    "Setting WL type to keyfile defined [" + str(value) + "]")
            else:
                raise KeyFileException(
                    "Unexpected WL algorithm type selected " + str(value) + " ")

        ##
        # SET NUMERIC VALUES
        ##
        elif keyword in numeric_keywords:
            self.__set_numeric(keyword, value)

        else:
            raise KeyFileException("SHOULD NOT BE GETTING HERE...")