示例#1
0
    def get_phosphosequence(self):
        """
        Build the phosphomimetic version of the sequence.

        Every position listed in self.phosphosites (compared against the
        0-based position of each residue in self.seq) is written out as 'E'
        instead of the residue actually present; all other residues are
        copied across unchanged.

        A warning is issued when no phosphosites are defined, in which case
        the returned sequence matches the unmodified one.

        Raises SequenceException if a listed position holds anything other
        than S, Y or T.

        """

        if len(self.phosphosites) == 0:
            warning_message(
                "No phosphosites defined - phosphosequence will be equivalent to the unphosphorylated sequence")

        # collect the output residue by residue and stitch it together at the
        # end
        residues = []

        for position, residue in enumerate(self.seq):

            # anything which is not a phosphosite is carried over verbatim
            if position not in self.phosphosites:
                residues.append(residue)
                continue

            # extra level of checking - only S/Y/T may be swapped for E
            if residue not in ["S", "Y", "T"]:
                raise SequenceException(
                    "In get_phosphosequence - trying to replace non-phsophrylatable residue with GLU")

            residues.append("E")

        return "".join(residues)
示例#2
0
    def setPhosPhoSites(self, listOfPsites):
        """
        Set one or more sites on your sequence which can be phosphorylated. Note that
        this indexes from 1 (like all of bioinformatics) and not from 0 (like all of
        computer science).

        i.e. "KKKYKKK" the Y here is at position 4

        Internally positions are stored 0-indexed in self.phosphosites, but this
        is not something callers should have to worry about.

        listOfPsites may be a single integer or an iterable of values which can
        be converted with int(). All validation of the proposed sites is done in
        this function:

          * positions outside the sequence are skipped with a warning
          * positions holding a residue other than S/T/Y are skipped with a
            warning
          * positions which are already registered are silently ignored

        A value which int() cannot convert propagates the resulting exception.

        """

        # if we passed a single value not in a list then convert to a list of
        # length 1
        if isinstance(listOfPsites, int):
            listOfPsites = [listOfPsites]

        # evaluate proposed phosphosites
        for site in listOfPsites:

            # check we can convert to an integer!
            site = int(site)

            # python indexes from 0 but humans from 1
            idx = site - 1

            # if we're outside our sequence skip this site - without the
            # 'continue' a negative idx would silently wrap around to the end
            # of the sequence and a too-large idx would raise IndexError
            if idx >= len(self.seq) or idx < 0:
                warning_message("Proposed phosphosite (" + str(idx + 1) +
                                ") is outside sequence range. Skipping...")
                continue

            # grab the residue letter from the sequence
            res = self.seq[idx]

            status_message("Setting " + res + str(idx + 1))

            if res not in ["S", "T", "Y"]:
                # we skip it if it seems like an unphosphorylatable residue
                warning_message(
                    'Position ' +
                    str(site) +
                    ' in sequence is a non phosphorylatable residue [' +
                    str(res) +
                    ']')
            elif idx not in self.phosphosites:
                # don't add the same residue twice, but no need to warn about
                # duplicates
                self.phosphosites.append(idx)
0
    def __validSeq(self, sequence):
        """
        Internal function which checks a [region of] a sequence character by
        character and returns the cleaned-up sequence.

        Characters found in ONE_TO_THREE are kept. A '*' is kept as well (it
        can be a stop codon; the original note says it is to be validated
        later - no such check happens inside this function). Spaces are
        dropped silently, digits are dropped with a warning, and any other
        character raises SequenceFileParserException.

        """

        kept = []

        for char in sequence:

            # recognised residues go straight into the output
            if char in ONE_TO_THREE.keys():
                kept.append(char)
                continue

            # spaces are skipped without comment
            if char == " ":
                continue

            # '*' is carried through untouched for now
            if char == "*":
                kept.append(char)
                continue

            # numbers are stripped out (useful for copy/pasted FASTA formats)
            if char in "1234567890":
                warning_message(
                    "Found '" + char + "' in sequence, stripping out and ignoring...")
                continue

            # anything else is not something we know how to handle
            raise SequenceFileParserException(
                "\n\nERROR: Invalid sequence file, found [" +
                char +
                "] in sequence region\n\n" +
                sequence +
                "\n\n")

        return "".join(kept)
示例#4
0
    def kappa(self):
        """
        Return the kappa value, as defined in REF 1

        Kappa is calculated as delta() divided by deltaMax(). When deltaMax()
        is zero a warning is issued and -1 is returned instead.
        """

        # nothing to normalise against - flag it and bail out early
        if self.deltaMax() == 0:
            warning_message(
                "The sequence has no charged residues - kappa is not a valid/relevant parameter")
            return -1

        ratio = self.delta() / self.deltaMax()

        # The heuristics behind deltaMax are good BUT may under estimate it in
        # some cases, which pushes the ratio slightly above 1. A small
        # overshoot (below 1.1) is clamped to 1.0, because the sequence with
        # the highest delta is probably a sequence-isomer of the one we have.
        # A larger overshoot is returned unchanged as it may be indicative of
        # a bug which should be addressed.
        if 1.0 < ratio < 1.1:
            return 1.0

        return ratio
示例#5
0
    def validateSequence(self, seq):
        """
        Validate a protein sequence and return it with whitespace removed.

        Every character of seq must either be a key of
        data.aminoacids.ONE_TO_THREE or be whitespace. Whitespace is dropped
        (a single status message is emitted the first time this happens); any
        other character raises SequenceException reporting the offending
        character and its 1-based position in the input.

        If more than 15% of the cleaned sequence is proline a warning is
        issued. When nothing is left after cleaning (empty or whitespace-only
        input) the empty string is returned and the proline check is skipped,
        since the proline fraction is undefined for a zero-length sequence.
        """

        AAs = data.aminoacids.ONE_TO_THREE.keys()
        residues = []
        messageWarned = False

        # for each residue in your protein sequence (positions count from 1
        # for the error message)
        for pos, i in enumerate(seq, 1):

            if i in AAs:
                residues.append(i)

            # if we find whitespace drop it
            elif i.isspace():
                if not messageWarned:
                    # only warn once...
                    status_message("Removing whitespace from sequence")
                    messageWarned = True

            # if unexpected residue/character bail
            else:
                raise SequenceException(
                    "Invalid amino acid [" + str(i) + "] found at position " + str(pos))

        processed = "".join(residues)

        # determine proline content and warn if over 15% - guarded so an empty
        # sequence does not trigger a ZeroDivisionError
        if processed:
            prolineContent = float(processed.count("P")) / float(len(processed))
            if prolineContent > 0.15:
                warning_message(
                    "This sequence has a proline content of greater than 15%.\nThis may render some analyses [notably kappa and phase diagram predictions] incorrect")

        return processed
示例#6
0
    def __validSeq(self, sequence):
        """
        Internal function which checks a [region of] a sequence character by
        character and returns the cleaned-up sequence.

        Characters found in ONE_TO_THREE are kept. Spaces are dropped
        silently, digits are dropped with a warning, and any other character
        raises SequenceFileParserException.

        """

        kept = []

        for char in sequence:

            # recognised residues go straight into the output
            if char in ONE_TO_THREE.keys():
                kept.append(char)
                continue

            # spaces are skipped without comment
            if char == " ":
                continue

            # numbers are stripped out (useful for copy/pasted FASTA formats)
            if char in "1234567890":
                warning_message(
                    "Found '" + char + "' in sequence, stripping out and ignoring...")
                continue

            # anything else is not something we know how to handle
            raise SequenceFileParserException(
                "\n\nERROR: Invalid sequence file, found [" +
                char +
                "] in sequence region\n\n" +
                sequence +
                "\n\n")

        return "".join(kept)
示例#7
0
    def parse_keyfile(self, filename):
        """
        Parse the keyfile at filename into self.KEYWORDS and validate the result.

        Phase 1 reads the file line by line. Blank lines and lines starting
        with '#' are skipped, and anything after an inline '#' is discarded.
        Each remaining line must have the form 'KEYWORD value', separated by
        any amount of whitespace. Keywords which are not already keys of
        self.KEYWORDS are ignored with a warning; a known keyword which is not
        followed by exactly one value raises KeyFileException. Afterwards
        self.KEYWORDS["SEQUENCE"] is reset to the empty string.

        Phase 2 walks KEYWORD_LIST and validates the stored value of each
        keyword:

          SEQFILE      required; must be an existing file. It is parsed with
                       SequenceFileParser.parseSeqFile and the result stored
                       under "SEQUENCE" (an empty/None result is an error).
          OUTDIR       required; created if it does not exist, otherwise a
                       warning is issued if it is not empty.
          FREEZE_FILE  optional; if given it must be an existing file.
          WL_TYPE      if empty the value from DEFAULT_VALS is used, otherwise
                       it must be one of WL_TYPES.
          BIN_MIN, BIN_MAX, NUMBER_OF_BINS, FLATCHECK_FREQ, CONVERGENCE,
          FLATNESS_CRITERION
                       handed to self.__set_numeric.

        Any other entry in KEYWORD_LIST raises KeyFileException. An OSError
        raised while creating the output directory is reported and re-raised.
        """

        status_message("Parsing keyfile...")
        status_message("---------------------------------------")

        SeqFileParser = SequenceFileParser()  # create a sequence file parsing object

        # read file to end
        with open(filename) as filehandle:
            content = filehandle.readlines()

        # [PHASE 1 START]
        # PARSE THE KEYFILE
        for line in content:
            line = line.strip()

            # skip empty lines and whole-line comments
            if len(line) == 0 or line[0] == "#":
                continue

            # if there is an inline comment kill everything after the comment
            # character, then remove the whitespace left in front of it
            line = line.split("#", 1)[0].strip()

            # split on runs of whitespace, so tabs or repeated spaces between
            # the keyword and its value are accepted
            line_list = line.split()
            key = line_list[0]

            # check if the first token matches one of the predefined KEYWORDS
            if key not in self.KEYWORDS:
                warning_message(
                    "Found unexpected keyword [" + str(key) + "] - ignorning...")
                continue

            # we expect exactly one string after the keyword - anything else
            # (no value, or several values) is a failure
            if len(line_list) != 2:
                raise KeyFileException("Error: Found keyword " +
                                       str(key) +
                                       " but unable to parse associated value")

            self.KEYWORDS[key] = line_list[1]

        # Now add default for the sequence, which will hopefully be set in the
        # next section by parsing the sequencefile
        self.KEYWORDS["SEQUENCE"] = ""
        # [PHASE 1 END]

        status_message("---------------------------------------")
        status_message("Keyfile parsed!\n")
        status_message("Validating keyfile contents")
        status_message("---------------------------------------")

        # Having parsed the keyfile we now validate the keyfile so
        # we don't have to worry about validation later on

        # VALIDATE the parsed values
        # [PHASE 2 START]
        #

        for keyword in KEYWORD_LIST:
            # extract the value associated with each keyword in turn
            value = self.KEYWORDS[keyword]

            ##
            # SEQUENCE FILE VALIDATION AND PARSING
            ##
            if keyword == "SEQFILE":
                if value == "":
                    raise KeyFileException(
                        "ERROR: No sequence file provided in keyfile (expecting keyword [SEQFILE])")

                if not os.path.isfile(value):
                    raise KeyFileException(
                        "Expected " + str(value) + " to be file")

                # if its a file lets try and extract a sequence from it!
                self.KEYWORDS["SEQUENCE"] = SeqFileParser.parseSeqFile(value)

                # if we get here we *should* now have a sequence...
                if self.KEYWORDS["SEQUENCE"] == "" or self.KEYWORDS[
                        "SEQUENCE"] is None:
                    raise KeyFileException(
                        "ERROR: No sequence was parsed from the sequence file...")

            ##
            # OUTPUT DIRECTORY VALIDATION
            ##
            elif keyword == "OUTDIR":
                if value == "":
                    raise KeyFileException(
                        "ERROR: No output directory provided in keyfile (expecting keyword [OUTDIR]")

                # creates the output directory if it doesn't already exist
                if not os.path.exists(value):
                    status_message(
                        "Creating output directory " + str(value))
                    try:
                        os.makedirs(value)
                    except OSError:
                        # parenthesised single-argument print is valid under
                        # both Python 2 and Python 3
                        print("----------------------------")
                        print("")
                        print("ERROR Creating output directory - do you have permission to create the directory [" + str(value) + "]")
                        print("")
                        print("----------------------------")
                        # bare raise re-raises the active exception unchanged
                        raise
                # if it does exist raise a quick warning when it is not empty
                elif len(os.listdir(value)) > 0:
                    warning_message(
                        "Output directory exists already and is not empty [RISK OF OVERWRITING!]")

            ##
            # FREEZE FILE VALIDATION
            ##
            elif keyword == "FREEZE_FILE":
                # no freeze file, no problem
                if value != "":
                    if not os.path.isfile(value):
                        raise KeyFileException(
                            "Expected " + str(value) + " to be file")
                    status_message("Using freeze file")
                    self.KEYWORDS[keyword] = value

            ##
            # WL TYPE
            ##
            elif keyword == "WL_TYPE":
                if value == "":
                    # nothing given in the keyfile - fall back to the default.
                    # (Must not be overwritten with the empty value
                    # afterwards.)
                    self.KEYWORDS[keyword] = DEFAULT_VALS[keyword]
                    status_message(
                        "Setting WL type to default [" + str(DEFAULT_VALS[keyword]) + "]")
                elif value in WL_TYPES:
                    self.KEYWORDS[keyword] = value
                    status_message(
                        "Setting WL type to keyfile defined [" + str(value) + "]")
                else:
                    raise KeyFileException(
                        "Unexpected WL algorithm type selected " + str(value) + " ")

            ##
            # SET NUMERIC VALUES
            ##
            elif keyword in ("BIN_MIN",
                             "BIN_MAX",
                             "NUMBER_OF_BINS",
                             "FLATCHECK_FREQ",
                             "CONVERGENCE",
                             "FLATNESS_CRITERION"):
                self.__set_numeric(keyword, value)

            else:
                raise KeyFileException("SHOULD NOT BE GETTING HERE...")
示例#8
0
    def parse_keyfile(self, filename):
        """
        Parse the keyfile at filename into self.KEYWORDS and validate the result.

        Phase 1 reads the file line by line. Blank lines and lines starting
        with '#' are skipped, and anything after an inline '#' is discarded.
        Each remaining line must have the form 'KEYWORD value', separated by
        any amount of whitespace. Keywords which are not already keys of
        self.KEYWORDS are ignored with a warning; a known keyword which is not
        followed by exactly one value raises KeyFileException. Afterwards
        self.KEYWORDS["SEQUENCE"] is reset to the empty string.

        Phase 2 walks KEYWORD_LIST and validates the stored value of each
        keyword:

          SEQFILE      required; must be an existing file. It is parsed with
                       SequenceFileParser.parseSeqFile and the result stored
                       under "SEQUENCE" (an empty/None result is an error).
          OUTDIR       required; created if it does not exist, otherwise a
                       warning is issued if it is not empty.
          FREEZE_FILE  optional; if given it must be an existing file.
          WL_TYPE      if empty the value from DEFAULT_VALS is used, otherwise
                       it must be one of WL_TYPES.
          BIN_MIN, BIN_MAX, NUMBER_OF_BINS, FLATCHECK_FREQ, CONVERGENCE,
          FLATNESS_CRITERION
                       handed to self.__set_numeric.

        Any other entry in KEYWORD_LIST raises KeyFileException. An OSError
        raised while creating the output directory is reported and re-raised.
        """

        status_message("Parsing keyfile...")
        status_message("---------------------------------------")

        SeqFileParser = SequenceFileParser()  # create a sequence file parsing object

        # read file to end
        with open(filename) as filehandle:
            content = filehandle.readlines()

        # [PHASE 1 START]
        # PARSE THE KEYFILE
        for line in content:
            line = line.strip()

            # skip empty lines and whole-line comments
            if len(line) == 0 or line[0] == "#":
                continue

            # if there is an inline comment kill everything after the comment
            # character, then remove the whitespace left in front of it
            line = line.split("#", 1)[0].strip()

            # split on runs of whitespace, so tabs or repeated spaces between
            # the keyword and its value are accepted
            line_list = line.split()
            key = line_list[0]

            # check if the first token matches one of the predefined KEYWORDS
            if key not in self.KEYWORDS:
                warning_message(
                    "Found unexpected keyword [" + str(key) + "] - ignorning...")
                continue

            # we expect exactly one string after the keyword - anything else
            # (no value, or several values) is a failure
            if len(line_list) != 2:
                raise KeyFileException("Error: Found keyword " +
                                       str(key) +
                                       " but unable to parse associated value")

            self.KEYWORDS[key] = line_list[1]

        # Now add default for the sequence, which will hopefully be set in the
        # next section by parsing the sequencefile
        self.KEYWORDS["SEQUENCE"] = ""
        # [PHASE 1 END]

        status_message("---------------------------------------")
        status_message("Keyfile parsed!\n")
        status_message("Validating keyfile contents")
        status_message("---------------------------------------")

        # Having parsed the keyfile we now validate the keyfile so
        # we don't have to worry about validation later on

        # VALIDATE the parsed values
        # [PHASE 2 START]
        #

        for keyword in KEYWORD_LIST:
            # extract the value associated with each keyword in turn
            value = self.KEYWORDS[keyword]

            ##
            # SEQUENCE FILE VALIDATION AND PARSING
            ##
            if keyword == "SEQFILE":
                if value == "":
                    raise KeyFileException(
                        "ERROR: No sequence file provided in keyfile (expecting keyword [SEQFILE])")

                if not os.path.isfile(value):
                    raise KeyFileException(
                        "Expected " + str(value) + " to be file")

                # if its a file lets try and extract a sequence from it!
                self.KEYWORDS["SEQUENCE"] = SeqFileParser.parseSeqFile(value)

                # if we get here we *should* now have a sequence...
                if self.KEYWORDS["SEQUENCE"] == "" or self.KEYWORDS[
                        "SEQUENCE"] is None:
                    raise KeyFileException(
                        "ERROR: No sequence was parsed from the sequence file...")

            ##
            # OUTPUT DIRECTORY VALIDATION
            ##
            elif keyword == "OUTDIR":
                if value == "":
                    raise KeyFileException(
                        "ERROR: No output directory provided in keyfile (expecting keyword [OUTDIR]")

                # creates the output directory if it doesn't already exist
                if not os.path.exists(value):
                    status_message(
                        "Creating output directory " + str(value))
                    try:
                        os.makedirs(value)
                    except OSError:
                        # parenthesised single-argument print is valid under
                        # both Python 2 and Python 3
                        print("----------------------------")
                        print("")
                        print("ERROR Creating output directory - do you have permission to create the directory [" + str(value) + "]")
                        print("")
                        print("----------------------------")
                        # bare raise re-raises the active exception unchanged
                        raise
                # if it does exist raise a quick warning when it is not empty
                elif len(os.listdir(value)) > 0:
                    warning_message(
                        "Output directory exists already and is not empty [RISK OF OVERWRITING!]")

            ##
            # FREEZE FILE VALIDATION
            ##
            elif keyword == "FREEZE_FILE":
                # no freeze file, no problem
                if value != "":
                    if not os.path.isfile(value):
                        raise KeyFileException(
                            "Expected " + str(value) + " to be file")
                    status_message("Using freeze file")
                    self.KEYWORDS[keyword] = value

            ##
            # WL TYPE
            ##
            elif keyword == "WL_TYPE":
                if value == "":
                    # nothing given in the keyfile - fall back to the default.
                    # (Must not be overwritten with the empty value
                    # afterwards.)
                    self.KEYWORDS[keyword] = DEFAULT_VALS[keyword]
                    status_message(
                        "Setting WL type to default [" + str(DEFAULT_VALS[keyword]) + "]")
                elif value in WL_TYPES:
                    self.KEYWORDS[keyword] = value
                    status_message(
                        "Setting WL type to keyfile defined [" + str(value) + "]")
                else:
                    raise KeyFileException(
                        "Unexpected WL algorithm type selected " + str(value) + " ")

            ##
            # SET NUMERIC VALUES
            ##
            elif keyword in ("BIN_MIN",
                             "BIN_MAX",
                             "NUMBER_OF_BINS",
                             "FLATCHECK_FREQ",
                             "CONVERGENCE",
                             "FLATNESS_CRITERION"):
                self.__set_numeric(keyword, value)

            else:
                raise KeyFileException("SHOULD NOT BE GETTING HERE...")