Exemplo n.º 1
0
 def __init__(self, prot_sequence, monoisotopic=False):
     """Wrap the protein sequence in a Seq and reset the cached results.

     prot_sequence - the protein sequence as a string. A string that is
                     entirely lower case is upper-cased first; any other
                     string is used exactly as given.
     monoisotopic - stored unchanged on the instance.
     """
     # Note that only an all-lowercase string is normalised here;
     # mixed-case input is passed through untouched.
     letters = prot_sequence.upper() if prot_sequence.islower() else prot_sequence
     self.sequence = Seq(letters, IUPAC.protein)
     self.length = len(self.sequence)
     self.monoisotopic = monoisotopic
     # Filled in later; None means "not computed yet".
     self.amino_acids_content = None
     self.amino_acids_percent = None
Exemplo n.º 2
0
 def __init__(self, prot_sequence, monoisotopic=False):
     """Set up the analysis object for one protein sequence.

     prot_sequence - the protein sequence as a string; it is upper-cased
                     only when every cased character in it is lower case.
     monoisotopic - kept as given in self.monoisotopic.
     """
     normalised = prot_sequence
     if normalised.islower():
         # A mixed-case sequence does not get here and stays as it is.
         normalised = normalised.upper()
     self.sequence = Seq(normalised, IUPAC.protein)
     self.length = len(self.sequence)
     self.amino_acids_content = self.amino_acids_percent = None
     self.monoisotopic = monoisotopic
Exemplo n.º 3
0
def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Iterate over the FASTA records in handle, yielding SeqRecord objects.

    Arguments:
     - handle - input file
     - alphabet - optional alphabet, used for every Seq that is created
     - title2ids - optional function which is given the title line of a
       record (without the beginning >) and returns the id, name and
       description (in that order) for the record as a tuple of strings.

    When title2ids is not given, the entire title line is used as the
    description, and its first word as both the id and the name.

    By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
    with no custom handling of the title lines:

    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle):
    ...         print(record.id)
    ...
    alpha
    beta
    gamma
    alpha
    delta

    However, you can supply a title2ids function to alter this:

    >>> def take_upper(title):
    ...     return title.split(None, 1)[0].upper(), "", title
    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle, title2ids=take_upper):
    ...         print(record.id)
    ...
    ALPHA
    BETA
    GAMMA
    ALPHA
    DELTA

    """
    for title, sequence in SimpleFastaParser(handle):
        if title2ids:
            record_id, record_name, record_descr = title2ids(title)
        else:
            words = title.split(None, 1)
            if not words:
                # No first word to take: only a truly empty title is
                # expected here (a whitespace-only one trips the assert).
                assert not title, repr(title)
            # Should we use SeqRecord default for no ID?
            record_id = record_name = words[0] if words else ""
            record_descr = title
        yield SeqRecord(Seq(sequence, alphabet),
                        id=record_id,
                        name=record_name,
                        description=record_descr)
Exemplo n.º 4
0
    def _from_jaspar_sites(self, stream):
        """Read the motif from a Jaspar .sites file.

        The stream is consumed two lines at a time: a header line starting
        with ">" followed by a sequence line. Reading stops at end of file
        or at the first line that is not a header. On each sequence line
        the leading characters that equal their own lower-case form are
        skipped, and the following run of characters that equal their own
        upper-case form is added as an instance.

        The mask is set to one "*" per letter of the last instance read
        (an empty mask if the stream held no records). Returns self.
        """
        # Bound before the loop so that a stream without any record gives
        # an empty mask instead of failing on an unbound name.
        inst = ""

        while True:
            ln = stream.readline()  # read the header ">...."
            if ln == "" or ln[0] != ">":
                break

            ln = stream.readline().strip()  # read the actual sequence
            size = len(ln)
            i = 0
            # The bounds check keeps a line without any upper-case run
            # (or a missing sequence line) from raising IndexError.
            while i < size and ln[i] == ln[i].lower():
                i += 1
            begin = i
            while i < size and ln[i] == ln[i].upper():
                i += 1
            inst = Seq(ln[begin:i], self.alphabet)
            self.add_instance(inst)

        self.set_mask("*" * len(inst))
        return self
Exemplo n.º 5
0
    def make_instances_from_counts(self):
        """Build "fake" instances for a motif created from a count matrix.

        Each column of the counts is expanded into a string in which every
        letter is repeated as many times as its count. The total of the
        first column sets the number of instances; a column with a smaller
        total is padded with letters cycled from the alphabet (background)
        and a warning is printed. The instances are then read off row by
        row across the columns and added with add_instance.

        Returns self.instances.
        """
        background = "".join(self.alphabet.letters)
        self.has_instances = True
        self.instances = []
        total = sum(self.counts[nuc][0] for nuc in self.alphabet.letters)
        # columns[pos] is a column taken from aligned motif instances
        columns = []
        for pos in range(self.length):
            column = "".join(letter * (self.counts[letter][pos])
                             for letter in self.alphabet.letters)
            if len(column) < total:
                print("WARNING, column too short %i %i" % (len(column), total))
                column += (background * total)[:(total - len(column))]
            columns.append(column)
        # one instance per row, taking one letter from every column
        for row in range(total):
            letters = "".join(columns[pos][row] for pos in range(self.length))
            self.add_instance(Seq(letters, self.alphabet))
        return self.instances
Exemplo n.º 6
0
def read(handle, format):
    """Read a motif from a Jaspar file and return it as a Motif.

    Arguments:
     - handle - iterable of text lines (for example an open file)
     - format - "pfm" for a count matrix with one row per letter in the
       order A, C, G, T (a leading letter on a row is ignored), or
       "sites" for pairs of a ">" header line and a sequence line.

    For "sites", reading stops at the first line that is not a header,
    and only the characters of a sequence line that equal their own
    upper-case form are kept as the instance.

    The mask of the returned motif is set to one "*" per position.

    Raises ValueError if format is neither "pfm" nor "sites".
    """
    alphabet = IUPAC.unambiguous_dna
    counts = {}
    if format == "pfm":
        # reads the motif from Jaspar .pfm file
        letters = "ACGT"
        for letter, line in zip(letters, handle):
            words = line.split()
            # if there is a letter in the beginning, ignore it
            if words[0] == letter:
                words = words[1:]
            # A real list is needed: on Python 3 map() would only give a
            # one-shot iterator without len() or indexing.
            counts[letter] = [float(word) for word in words]
        motif = Motif(alphabet, counts=counts)
    elif format == "sites":
        # reads the motif from Jaspar .sites file
        instances = []
        for line in handle:
            if not line.startswith(">"):
                break
            # line contains the header ">...."
            # now read the actual sequence; the next() builtin works on
            # both Python 2.6+ and Python 3, unlike handle.next()
            line = next(handle)
            instance = "".join(c for c in line.strip() if c == c.upper())
            instances.append(Seq(instance, alphabet))
        instances = Instances(instances, alphabet)
        motif = Motif(alphabet, instances=instances)
    else:
        raise ValueError("Unknown format %s" % format)
    motif.mask = "*" * motif.length
    return motif
Exemplo n.º 7
0
def read(handle):
    """Parse the lines in handle into a Record and return it.

    The first two lines are stored unchanged as record.ver and
    record.cmd_line. Every following line is classified by the first
    rule that matches:

     - blank line: ignored
     - starts with "Para": a new, empty record.param_dict is started
     - starts with "#": the second tab-separated field is appended to
       record.seq_dict
     - contains "=": stored in record.param_dict as name -> value
     - starts with "Input": a new, empty record.seq_dict is started
     - starts with "Motif": a new Motif is appended to record.motifs and
       becomes record.current_motif
     - starts with "MAP": the last word is the score of the current motif
     - has exactly four tab-separated fields: the first field is added to
       the current motif as an instance
     - contains "*": the line is the mask of the current motif

    Raises ValueError (with the offending line) for any other line.
    """
    record = Record()
    record.ver = next(handle)
    record.cmd_line = next(handle)
    for line in handle:
        if line.strip() == "":
            pass
        elif line[:4] == "Para":
            record.param_dict = {}
        elif line[0] == "#":
            seq_name = line.split("\t")[1]
            record.seq_dict.append(seq_name)
        elif "=" in line:
            par_name = line.split("=")[0].strip()
            par_value = line.split("=")[1].strip()
            record.param_dict[par_name] = par_value
        elif line[:5] == "Input":
            record.seq_dict = []
        elif line[:5] == "Motif":
            record.current_motif = Motif()
            record.motifs.append(record.current_motif)
            record.current_motif.alphabet = IUPAC.unambiguous_dna
        elif line[:3] == "MAP":
            record.current_motif.score = float(line.split()[-1])
        elif len(line.split("\t")) == 4:
            seq = Seq(line.split("\t")[0], IUPAC.unambiguous_dna)
            record.current_motif.add_instance(seq)
        elif "*" in line:
            # Strip only the line terminator so that the spaces inside the
            # mask keep their positions. (The former "\n\c" was an invalid
            # escape that stripped backslashes and "c" but left "\r".)
            record.current_motif.set_mask(line.strip("\r\n"))
        else:
            raise ValueError(line)
    return record
Exemplo n.º 8
0
    def read(self, input_handle):
        """Read patterns from the specified handle.

        One pattern is read per line, with trailing whitespace removed.
        A line containing self.separator is a signature and is returned
        as a tuple of its parts; any other line is returned as a string.

        If self._alphabet is set, every pattern (or every part of a
        signature) is checked against it and ValueError is raised for
        the first one that does not match.

        Returns the list of patterns in file order.
        """
        patterns = []

        while True:
            raw_line = input_handle.readline()
            if not raw_line:
                # end of the input
                break

            pattern = raw_line.rstrip()
            # split up signatures
            if self.separator in pattern:
                pattern = tuple(pattern.split(self.separator))

            if self._alphabet is not None:
                # treat a single pattern as a one-item list, so signatures
                # and single patterns can be checked the same way
                parts = pattern if isinstance(pattern, tuple) else [pattern]
                for part in parts:
                    if not _verify_alphabet(Seq(part, self._alphabet)):
                        raise ValueError(
                            "Pattern %s not matching alphabet %s" %
                            (pattern, self._alphabet))

            patterns.append(pattern)

        return patterns
Exemplo n.º 9
0
 def __init__(self, instances=[], alphabet=None):
     """Collect the given instances, checking length and alphabet.

     instances - the motif instances; items without an alphabet
                 attribute are treated as plain strings.
     alphabet - the alphabet to use. If None, it is taken from the first
                instance that has one; if no meaningful alphabet is
                found, DNA is assumed.

     Raises ValueError if the instances differ in length, or if their
     alphabets disagree with each other or with the given alphabet.
     """
     from SAP.Bio.Alphabet import IUPAC
     from SAP.Bio.Seq import Seq
     self.length = None
     for candidate in instances:
         size = len(candidate)
         if self.length is None:
             self.length = size
         elif self.length != size:
             raise ValueError(
                 "All instances should have the same length (%d found, %d expected)"
                 % (size, self.length))
         try:
             found = candidate.alphabet
         except AttributeError:
             # The instance is a plain string
             continue
         if alphabet is None:
             alphabet = found
         elif alphabet != found:
             raise ValueError("Alphabets are inconsistent")
     if alphabet is None or alphabet.letters is None:
         # If we didn't get a meaningful alphabet from the instances,
         # assume it is DNA.
         alphabet = IUPAC.unambiguous_dna
     for candidate in instances:
         if isinstance(candidate, Seq):
             self.append(candidate)
         else:
             # anything else is converted via its string form
             self.append(Seq(str(candidate), alphabet=alphabet))
     self.alphabet = alphabet
0
def _read_sites(handle):
    """Read the motif from a JASPAR .sites file.

    The handle is read as pairs of a ">" header line and a sequence line,
    stopping at the first line that is not a header. From each sequence
    line only the characters that equal their own upper-case form are
    kept as the instance.

    Returns a Record holding the single motif built from the instances.
    """
    alphabet = dna
    sites = []

    for header in handle:
        if not header.startswith(">"):
            break
        # header is the ">...." line; the sequence is on the next line
        sequence_line = next(handle)
        site = "".join(ch for ch in sequence_line.strip() if ch == ch.upper())
        sites.append(Seq(site, alphabet))

    motif = Motif(matrix_id=None,
                  name=None,
                  alphabet=alphabet,
                  instances=motifs.Instances(sites, alphabet))
    motif.mask = "*" * motif.length

    record = Record()
    record.append(motif)
    return record
Exemplo n.º 11
0
    def _elem_AAseq(self, node, record):
        """Parse protein sequence."""

        if not (node.hasChildNodes() and len(node.firstChild.data) > 0):
            raise ValueError("Sequence length should be greater than 0.")

        record.seq = Seq(node.firstChild.data, Alphabet.generic_protein)
Exemplo n.º 12
0
def TabIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over tab separated lines (as SeqRecord objects).

    Each line of the file should contain one tab only, dividing the line
    into an identifier and the full sequence.

    Arguments:
     - handle - input file
     - alphabet - optional alphabet

    The first field is taken as the record's .id and .name (regardless of
    any spaces within the text) and the second field is the sequence.
    Both fields have surrounding whitespace removed.

    Any blank lines are ignored.

    Raises ValueError for a non-blank line that does not contain exactly
    one tab.
    """
    for line in handle:
        try:
            title, seq = line.split("\t")  # will fail if more than one tab!
        except ValueError:
            # Only the unpacking failure is handled here; a bare except
            # would also swallow KeyboardInterrupt and unrelated errors.
            if line.strip() == "":
                # It's a blank line, ignore it
                continue
            raise ValueError("Each line should have one tab separating the" +
                             " title and sequence, this line has %i tabs: %s" %
                             (line.count("\t"), repr(line)))
        title = title.strip()
        seq = seq.strip()  # removes the trailing new line
        yield SeqRecord(Seq(seq, alphabet),
                        id=title,
                        name=title,
                        description="")
Exemplo n.º 13
0
    def to_generic(self, alphabet):
        """Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from SAP.Bio.Align, through which you can manipulate and query
        the object.

        alphabet is the specified alphabet for the sequences in the code (for
        example IUPAC.IUPACProtein).

        The (name, start, seq, end) tuples in self.alignment are grouped
        into blocks, each starting with the entry named 'QUERY'. The first
        block defines the rows; the pieces of every later block are
        appended to those rows in order.

        Thanks to James Casbon for the code.
        """
        #TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
        row_names = []
        row_seqs = []
        block_number = 0
        row = 0
        for name, _start, piece, _end in self.alignment:
            if name == 'QUERY':  # QUERY is the first in each alignment block
                block_number += 1
                row = 0

            if block_number == 1:
                # the first block creates the rows
                row_names.append(name)
                row_seqs.append(piece)
                continue

            # every later block extends the existing rows
            row_seqs[row] += piece
            row += 1

        generic = MultipleSeqAlignment([], alphabet)
        for name, full_seq in zip(row_names, row_seqs):
            generic.append(SeqRecord(Seq(full_seq, alphabet), name))

        return generic
Exemplo n.º 14
0
    def __init__(self, data='', alphabet=default_codon_alphabet,
                 gap_char="-", rf_table=None):
        """Create the codon sequence from data (upper-cased).

        rf_table should be a tuple or list indicating the start of every
        codon along the sequence. For example:

            sequence = 'AAATTTGGGCCAAATTT'
            rf_table = (0, 3, 6, 8, 11, 14)

        the translated protein sequences will be

            AAA TTT GGG GCC AAA TTT
             K   F   G   A   K   F

        Notice: rf_table applies to the ungapped sequence. If there are
        gaps in the sequence, they will be discarded. This feature ensures
        the rf_table is independent of where the codon sequence appears in
        the alignment.

        When rf_table is None, one is generated with a codon starting at
        every third position of the ungapped sequence.

        Raises ValueError for a codon that is not in alphabet.letters.
        """
        Seq.__init__(self, data.upper(), alphabet=alphabet)
        self.gap_char = gap_char

        if rf_table is not None:
            #if gap_char in self._data:
            #    assert  len(self) % 3 == 0, \
            #            "Gapped sequence length is not a triple number"
            assert isinstance(rf_table, (tuple, list)), \
                "rf_table should be a tuple or list object"
            assert all(isinstance(pos, int) for pos in rf_table), \
                "elements in rf_table should be int that specify " \
                "the codon positions of the sequence"
            ungapped = self._data.replace(gap_char, "")
            for pos in rf_table:
                codon = ungapped[pos:pos + 3]
                if codon not in alphabet.letters:
                    raise ValueError("Sequence contain undefined letters "
                                     "from alphabet "
                                     "({0})!".format(codon))
            self.rf_table = rf_table
        else:
            ungapped = self._data.replace(gap_char, "")
            # check the length of the alignment to be a triple
            assert len(self) % 3 == 0, "Sequence length is not a triple number"
            self.rf_table = list(range(0, len(ungapped), 3))
            # check alphabet
            # Not use Alphabet._verify_alphabet function because it
            # only works for single alphabet
            for pos in self.rf_table:
                codon = self._data[pos:pos + 3]
                if codon not in alphabet.letters:
                    raise ValueError("Sequence contain undefined letters from"
                                     " alphabet "
                                     "({0})! ".format(codon))
Exemplo n.º 15
0
    def _get_signature_dict(self, seq_records, sig_size, max_gap):
        """Return a dictionary with all signatures and their counts.

        This internal function does all of the hard work for the
        find_signatures function.
        """
        if self._alphabet_strict:
            alphabet = seq_records[0].seq.alphabet
        else:
            alphabet = None

        # loop through all records to find signatures
        all_sigs = {}
        for seq_record in seq_records:
            # if we are working with alphabets, make sure we are consistent
            if alphabet is not None:
                assert seq_record.seq.alphabet == alphabet, \
                       "Working with alphabet %s and got %s" % \
                       (alphabet, seq_record.seq.alphabet)

            # now start finding signatures in the sequence
            largest_sig_size = sig_size * 2 + max_gap
            for start in range(len(seq_record.seq) - (largest_sig_size - 1)):
                # find the first part of the signature
                first_sig = str(seq_record.seq[start:start + sig_size])

                # now find all of the second parts of the signature
                for second in range(start + 1, (start + 1) + max_gap):
                    second_sig = str(seq_record.seq[second:second + sig_size])

                    # if we are being alphabet strict, make sure both parts
                    # of the sig fall within the specified alphabet
                    if alphabet is not None:
                        first_seq = Seq(first_sig, alphabet)
                        second_seq = Seq(second_sig, alphabet)
                        if _verify_alphabet(first_seq) \
                        and _verify_alphabet(second_seq):
                            all_sigs = self._add_sig(all_sigs,
                                                     (first_sig, second_sig))

                    # if we are not being strict, just add the motif
                    else:
                        all_sigs = self._add_sig(all_sigs,
                                                 (first_sig, second_sig))

        return all_sigs
Exemplo n.º 16
0
 def getSeqBySid(self, domain):
     """Get the sequence of a given domain from its sid.

     Without a database handle the sequence is taken from the record
     stored under domain in self.fasta_dict; otherwise it is selected
     from the astral table and wrapped in a Seq.
     """
     if self.db_handle is None:
         return self.fasta_dict[domain].seq
     else:
         cur = self.db_handle.cursor()
         # DB-API expects the parameters as a sequence; a bare string can
         # be taken as a sequence of one-character parameters.
         cur.execute("SELECT seq FROM astral WHERE sid=%s", (domain,))
         return Seq(cur.fetchone()[0])
Exemplo n.º 17
0
    def gap_consensus(self,
                      threshold=.7,
                      ambiguous="X",
                      consensus_alpha=None,
                      require_multiple=0):
        """Return a consensus Seq in which gaps are counted like letters.

        For every column of self.alignment the most common character is
        used if it is the only one with the top count and its share of
        the column is at least threshold; otherwise the ambiguous
        character is used. With require_multiple set, a column covered by
        just one sequence is always ambiguous. Sequences shorter than the
        alignment do not contribute to the columns past their end.

        If consensus_alpha is None, the alphabet of the result is guessed
        with self._guess_consensus_alphabet.

        Things to do: Let the user define that with only one gap, the result
        character in consensus is gap. Let the user select gap character, now
        it takes the same as input.
        """
        # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
        picks = []

        # one consensus character per alignment column
        for n in range(self.alignment.get_alignment_length()):
            # how often each atom occurs in this column
            tally = {}
            for record in self.alignment._records:
                # make sure we haven't run past the end of any sequences
                # if they are of different lengths
                if n < len(record.seq):
                    atom = record.seq[n]
                    tally[atom] = tally.get(atom, 0) + 1
            num_atoms = sum(tally.values())

            # all atoms sharing the highest count
            max_size = max(tally.values()) if tally else 0
            max_atoms = [atom for atom in tally if tally[atom] == max_size]

            if require_multiple and num_atoms == 1:
                picks.append(ambiguous)
            elif (len(max_atoms) == 1) and (
                    (float(max_size) / float(num_atoms)) >= threshold):
                picks.append(max_atoms[0])
            else:
                picks.append(ambiguous)

        consensus = ''.join(picks)

        # we need to guess a consensus alphabet if one isn't specified
        if consensus_alpha is None:
            #TODO - Should we make this into a Gapped alphabet?
            consensus_alpha = self._guess_consensus_alphabet(ambiguous)

        return Seq(consensus, consensus_alpha)
Exemplo n.º 18
0
    def get_sequence(self):
        """Return the AA sequence as a Seq object.

        Each residue name is looked up in SCOPData.protein_letters_3to1;
        names that are not found become 'X'.

        @return: polypeptide sequence
        @rtype: L{Seq}
        """
        letters = "".join(
            SCOPData.protein_letters_3to1.get(res.get_resname(), 'X')
            for res in self)
        return Seq(letters, generic_protein)
Exemplo n.º 19
0
    def add_sequence(self,
                     descriptor,
                     sequence,
                     start=None,
                     end=None,
                     weight=1.0):
        """Add a sequence to the alignment.

        This doesn't do any kind of alignment, it just adds in the sequence
        object, which is assumed to be prealigned with the existing
        sequences.

        Arguments:
         - descriptor - The descriptive id of the sequence being added.
                       This will be used as the resulting SeqRecord's
                       .id property (and, for historical compatibility,
                       also the .description property)
         - sequence - A string with sequence info.
         - start - You can explicitly set the start point of the sequence.
                   This is useful (at least) for BLAST alignments, which can
                   just be partial alignments of sequences.
         - end - Specify the end of the sequence, which is important
                 for the same reason as the start.
         - weight - The weight to place on the sequence in the alignment.
                    By default, all sequences have the same weight. (0.0 =>
                    no weight, 1.0 => highest weight)

        The start and end are stored in the record's annotations only
        when they are not None; the weight is always stored.
        """
        new_seq = Seq(sequence, self._alphabet)

        #We are now effectively using the SeqRecord's .id as
        #the primary identifier (e.g. in Bio.SeqIO) so we should
        #populate it with the descriptor.
        #For backwards compatibility, also store this in the
        #SeqRecord's description property.
        new_record = SeqRecord(new_seq, id=descriptor, description=descriptor)

        # hack! We really need to work out how to deal with annotations
        # and features in biopython. Right now, I'll just use the
        # generic annotations dictionary we've got to store the start
        # and end, but we should think up something better. I don't know
        # if I'm really a big fan of the LocatableSeq thing they've got
        # in BioPerl, but I'm not positive what the best thing to do on
        # this is...
        # Compare against None rather than testing truthiness, so that an
        # explicit coordinate of 0 is recorded instead of being dropped.
        if start is not None:
            new_record.annotations['start'] = start
        if end is not None:
            new_record.annotations['end'] = end

        # another hack to add weight information to the sequence
        new_record.annotations['weight'] = weight

        self._records.append(new_record)
Exemplo n.º 20
0
 def consensus(self):
     """Return the consensus sequence of a motif.

     For every position the letter with the highest value is taken,
     the first in sorted order winning a tie. "X" is used when no
     letter has a value above zero.
     """
     letters = []
     for i in range(self.length):
         column = self[i]
         best_letter, best_value = "X", 0
         for letter in sorted(column):
             value = column[letter]
             if value > best_value:
                 best_letter, best_value = letter, value
         letters.append(best_letter)
     return Seq("".join(letters), self.alphabet)
Exemplo n.º 21
0
 def anticonsensus(self):
     """Return the least probable pattern to be generated from this motif.

     For every position the letter with the lowest value is taken,
     the first in sorted order winning a tie. "X" is used when no
     letter has a value below 10.0.
     """
     letters = []
     for i in range(self.length):
         column = self[i]
         worst_letter, worst_value = "X", 10.0
         for letter in sorted(column):
             value = column[letter]
             if value < worst_value:
                 worst_letter, worst_value = letter, value
         letters.append(worst_letter)
     return Seq("".join(letters), self.alphabet)
Exemplo n.º 22
0
    def _read(self, stream):
        """Reads the motif from the stream (in AlignAce format).

        the self.alphabet variable must be set beforehand.
        If the last line contains asterisks it is used for setting mask
        """
        
        while True:
            ln = stream.readline()
            if "*" in ln:
                self.set_mask(ln.strip("\n\c"))
                break
            self.add_instance(Seq(ln.strip(), self.alphabet))
Exemplo n.º 23
0
 def anticonsensus(self):
     """Return a Seq made of the letter with the lowest count per position.

     Letters are tried in the order of self.alphabet.letters, so the
     first of several equally low letters is the one used.
     """
     try:
         infinity = float("inf")
     except ValueError:
         # On Python 2.5 or older that was handled in C code,
         # and failed on Windows XP 32bit
         infinity = 1E400
     letters = []
     for i in range(self.length):
         lowest = infinity
         for letter in self.alphabet.letters:
             count = self[letter][i]
             if count < lowest:
                 lowest = count
                 rarest = letter
         letters.append(rarest)
     return Seq("".join(letters), self.alphabet)
Exemplo n.º 24
0
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)

    One row is built per key of the first position's pos_align_dict, by
    joining the .aa of that key over positions 1 to len(align_dict). The
    rows are added in sorted key order, each one identified by pdb2 plus
    chain2 of the matching sum_dict entry.
    """
    # start every row as an empty string
    rows = dict.fromkeys(align_dict.abs(1).pos_align_dict, '')

    # loop on positions
    for position in range(1, len(align_dict) + 1):
        entries = align_dict.abs(position).pos_align_dict
        # loop within a position
        for key in entries:
            rows[key] += entries[key].aa

    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for key in sorted(rows):
        summary = sum_dict[key]
        fssp_align.append(
            SeqRecord(Seq(rows[key], alpha), summary.pdb2 + summary.chain2))
    return fssp_align
Exemplo n.º 25
0
def read(handle):
    """Parse the lines in handle into a Record and return it.

    The first two lines, with surrounding whitespace removed, are stored
    as record.version and record.command. Every following line is
    stripped and classified by the first rule that matches:

     - empty line: ignored
     - starts with "Para": a new, empty record.parameters is started
     - starts with "#": the second tab-separated field is appended to
       record.sequences
     - contains "=": stored in record.parameters as name -> value
     - starts with "Input": a new, empty record.sequences is started
     - starts with "Motif": the motif number is read and a new list of
       instances is started
     - starts with "MAP": a Motif is built from the collected instances,
       the mask and the score on this line, and appended to the record
     - has exactly four tab-separated fields: the first field is added to
       the current instances
     - contains "*": the line is remembered as the mask

    Raises ValueError (with the offending line) for any other line.
    """
    record = Record()
    record.version = next(handle).strip()
    record.command = next(handle).strip()
    for raw_line in handle:
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith("Para"):
            record.parameters = {}
        elif line.startswith("#"):
            record.sequences.append(line.split("\t")[1])
        elif "=" in line:
            name, value = line.split("=")
            record.parameters[name.strip()] = value.strip()
        elif line.startswith("Input"):
            record.sequences = []
        elif line.startswith("Motif"):
            words = line.split()
            assert words[0] == "Motif"
            number = int(words[1])
            instances = []
        elif line.startswith("MAP"):
            alphabet = IUPAC.unambiguous_dna
            instances = Instances(instances, alphabet)
            motif = Motif(alphabet, instances)
            motif.score = float(line.split()[-1])
            motif.number = number
            motif.mask = mask
            record.append(motif)
        elif line.count("\t") == 3:
            # four tab-separated fields: the first one is the instance
            instances.append(Seq(line.split("\t")[0], IUPAC.unambiguous_dna))
        elif "*" in line:
            mask = line.strip("\r\n")
        else:
            raise ValueError(line)
    return record
Exemplo n.º 26
0
    def degenerate_consensus(self):
        """Return the consensus as a Seq of (possibly degenerate) DNA letters.

        Following the rules adapted from
        D. R. Cavener: "Comparison of the consensus sequence flanking
        translational start sites in Drosophila and vertebrates."
        Nucleic Acids Research 15(4): 1353-1361. (1987).
        The same rules are used by TRANSFAC.
        """
        degenerate_nucleotide = {
            'A': 'A',
            'C': 'C',
            'G': 'G',
            'T': 'T',
            'AC': 'M',
            'AG': 'R',
            'AT': 'W',
            'CG': 'S',
            'CT': 'Y',
            'GT': 'K',
            'ACG': 'V',
            'ACT': 'H',
            'AGT': 'D',
            'CGT': 'B',
            'ACGT': 'N',
        }
        symbols = []
        for i in range(self.length):
            # letters ordered from the most to the least frequent here
            ranked = sorted(self, key=lambda letter: self[letter][i],
                            reverse=True)
            counts = [self[letter][i] for letter in ranked]
            # Follow the Cavener rules:
            if counts[0] >= sum(counts[1:]) and counts[0] >= 2 * counts[1]:
                # one letter clearly dominates
                members = ranked[:1]
            elif 4 * sum(counts[:2]) > 3 * sum(counts):
                # the top two letters cover more than 75%
                members = ranked[:2]
            elif counts[3] == 0:
                # exactly one letter never occurs
                members = ranked[:3]
            else:
                members = "ACGT"
            symbols.append(degenerate_nucleotide["".join(sorted(members))])
        return Seq("".join(symbols), alphabet=IUPAC.ambiguous_dna)
Exemplo n.º 27
0
    def _set_seq(self, seq, seq_type):
        """Checks the given sequence for attribute setting

        Arguments:
        seq -- String or SeqRecord to check
        seq_type -- String of sequence type, must be 'hit' or 'query'

        """
        assert seq_type in ('hit', 'query')
        if seq is None:
            return seq  # return immediately if seq is None
        else:
            if not isinstance(seq, (basestring, SeqRecord)):
                raise TypeError("%s sequence must be a string or a SeqRecord"
                                " object." % seq_type)
        # check length if the opposite sequence is not None
        opp_type = 'hit' if seq_type == 'query' else 'query'
        opp_seq = getattr(self, '_%s' % opp_type, None)
        if opp_seq is not None:
            if len(seq) != len(opp_seq):
                raise ValueError("Sequence lengths do not match. Expected: "
                                 "%r (%s); found: %r (%s)." %
                                 (len(opp_seq), opp_type, len(seq), seq_type))

        seq_id = getattr(self, '%s_id' % seq_type)
        seq_desc = getattr(self, '%s_description' % seq_type)
        seq_feats = getattr(self, '%s_features' % seq_type)
        seq_name = 'aligned %s sequence' % seq_type

        if isinstance(seq, SeqRecord):
            seq.id = seq_id
            seq.description = seq_desc
            seq.name = seq_name
            seq.features = seq_feats
            seq.seq.alphabet = self.alphabet
        elif isinstance(seq, basestring):
            seq = SeqRecord(Seq(seq, self.alphabet),
                            id=seq_id,
                            name=seq_name,
                            description=seq_desc,
                            features=seq_feats)

        return seq
Exemplo n.º 28
0
    def add_sequence(self, descriptor, sequence, start=None, end=None,
                     weight=1.0):
        """Add a sequence to the alignment (DEPRECATED).

        The start, end, and weight arguments are not supported! This method
        only provides limited backwards compatibility with the old
        Bio.Align.Generic.Alignment object. Please use the append method with
        a SeqRecord instead, since add_sequence is likely to be removed in a
        future release of Biopython.

        A deprecation warning is always issued. The sequence is appended
        as a SeqRecord with descriptor as both its id and description.

        Raises ValueError if start, end or weight is given a value other
        than its default.
        """
        import warnings
        import Bio
        warnings.warn("The start, end, and weight arguments are not supported! This method only provides limited backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the append method with a SeqRecord instead, as the add_sequence method is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning)
        #Should we handle start/end/strand information somehow? What for?
        #TODO - Should we handle weights somehow? See also AlignInfo code...
        if start is not None or end is not None or weight != 1.0:
            # The pieces are joined without separators, so each one must
            # carry its own trailing space.
            raise ValueError("The add_sequence method is obsolete, and only "
                             "provides limited backwards compatibility. The "
                             "start, end and weight arguments are not "
                             "supported.")
        self.append(SeqRecord(Seq(sequence, self._alphabet),
                              id=descriptor, description=descriptor))
Exemplo n.º 29
0
    def _get_motif_dict(self, seq_records, motif_size):
        """Return a dictionary with information on motifs.

        This internal function essentially does all of the hard work for
        finding motifs, and returns a dictionary containing the found motifs
        and their counts. This is internal so it can be reused by
        find_motif_differences.
        """
        if self.alphabet_strict:
            alphabet = seq_records[0].seq.alphabet
        else:
            alphabet = None

        # loop through all records to find the motifs in the sequences
        all_motifs = {}
        for seq_record in seq_records:
            # if we are working with alphabets, make sure we are consistent
            if alphabet is not None:
                assert seq_record.seq.alphabet == alphabet, \
                       "Working with alphabet %s and got %s" % \
                       (alphabet, seq_record.seq.alphabet)

            # now start finding motifs in the sequence
            for start in range(len(seq_record.seq) - (motif_size - 1)):
                motif = str(seq_record.seq[start:start + motif_size])

                # if we are being alphabet strict, make sure the motif
                # falls within the specified alphabet
                if alphabet is not None:
                    motif_seq = Seq(motif, alphabet)
                    if _verify_alphabet(motif_seq):
                        all_motifs = self._add_motif(all_motifs, motif)

                # if we are not being strict, just add the motif
                else:
                    all_motifs = self._add_motif(all_motifs, motif)

        return all_motifs
Exemplo n.º 30
0
def molecular_weight(seq, seq_type=None, double_stranded=False, circular=False,
                     monoisotopic=False):
    """Calculates the molecular weight of a DNA, RNA or protein sequence.

    Only unambiguous letters are allowed. Nucleotide sequences are assumed to
    have a 5' phosphate.

    seq: String or Biopython sequence object.
    seq_type: The default (None) is to take the alphabet from the seq argument,
              or assume DNA if the seq argument is a string. Override this with
              a string 'DNA', 'RNA', or 'protein'.
    double_stranded: Calculate the mass for the double stranded molecule?
    circular: Is the molecule circular (has no ends)?
    monoisotopic: Use the monoisotopic mass tables?

    Note that for backwards compatibility, if the seq argument is a string,
    or Seq object with a generic alphabet, and no seq_type is specified
    (i.e. left as None), then DNA is assumed. A generic alphabet never
    contradicts an explicitly given seq_type.

    Whitespace in the sequence is removed and the letters are upper-cased
    before the weights are looked up.

    Raises TypeError if seq is neither a string nor a Seq/MutableSeq object,
    or if its alphabet is not an Alphabet object. Raises ValueError for an
    unknown seq_type, a seq_type that contradicts the sequence alphabet, a
    letter that is missing from the weight table, or double_stranded=True
    together with a protein.

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    Or, with the sequence alphabet:

    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.Alphabet import generic_dna, generic_rna, generic_protein
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna)))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_rna)))
    997.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_protein)))
    249.29

    Also note that contradictory sequence alphabets and seq_type will also
    give an exception:

    >>> from SAP.Bio.Seq import Seq
    >>> from SAP.Bio.Alphabet import generic_dna
    >>> print("%0.2f" % molecular_weight(Seq("AGC", generic_dna), "RNA"))
    Traceback (most recent call last):
      ...
    ValueError: seq_type='RNA' contradicts DNA from seq alphabet

    """
    # Rewritten by Markus Piotrowski, 2014

    # Find the alphabet type
    tmp_type = ''
    if isinstance(seq, (Seq, MutableSeq)):
        base_alphabet = Alphabet._get_base_alphabet(seq.alphabet)
        if isinstance(base_alphabet, Alphabet.DNAAlphabet):
            tmp_type = 'DNA'
        elif isinstance(base_alphabet, Alphabet.RNAAlphabet):
            tmp_type = 'RNA'
        elif isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            tmp_type = 'protein'
        elif isinstance(base_alphabet, Alphabet.ThreeLetterProtein):
            tmp_type = 'protein'
            # Convert to one-letter sequence. Have to use a string for seq1
            seq = Seq(seq1(str(seq)), alphabet=Alphabet.ProteinAlphabet())
        elif not isinstance(base_alphabet, Alphabet.Alphabet):
            raise TypeError("%s is not a valid alphabet for mass calculations"
                             % base_alphabet)
        # Otherwise the alphabet is generic and says nothing about the
        # molecule type, so tmp_type stays empty and cannot contradict an
        # explicit seq_type.
        if seq_type and tmp_type and tmp_type != seq_type:
            raise ValueError("seq_type=%r contradicts %s from seq alphabet"
                             % (seq_type, tmp_type))
        # Alphabet first, then the caller's choice, then DNA for backward
        # compatibility.
        seq_type = tmp_type or seq_type or "DNA"
    elif isinstance(seq, str):
        if seq_type is None:
            seq_type = "DNA"  # backward compatibility
    else:
        raise TypeError("Expected a string or Seq object, not seq=%r" % seq)

    seq = ''.join(str(seq).split()).upper()  # Do the minimum formatting

    if seq_type == 'DNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_dna_weights
        else:
            weight_table = IUPACData.unambiguous_dna_weights
    elif seq_type == 'RNA':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_unambiguous_rna_weights
        else:
            weight_table = IUPACData.unambiguous_rna_weights
    elif seq_type == 'protein':
        if monoisotopic:
            weight_table = IUPACData.monoisotopic_protein_weights
        else:
            weight_table = IUPACData.protein_weights
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r"
                         % seq_type)

    if monoisotopic:
        water = 18.010565
    else:
        water = 18.0153

    # One water is released for every bond between neighbouring residues; a
    # circular molecule has one bond more than a linear one.
    try:
        weight = sum(weight_table[x] for x in seq) - (len(seq)-1) * water
    except KeyError as e:
        raise ValueError('%s is not a valid unambiguous letter for %s'
                         % (e, seq_type))
    if circular:
        weight -= water

    if double_stranded:
        if seq_type == 'protein':
            raise ValueError('double-stranded proteins await their discovery')
        # Give the Seq an explicit alphabet: with a generic alphabet an RNA
        # strand without any U would be complemented as DNA (A -> T) and the
        # lookup in the RNA weight table would fail.
        if seq_type == 'DNA':
            strand_alphabet = Alphabet.DNAAlphabet()
        else:
            strand_alphabet = Alphabet.RNAAlphabet()
        seq = str(Seq(seq, strand_alphabet).complement())
        weight += sum(weight_table[x] for x in seq) - (len(seq)-1) * water
        if circular:
            weight -= water

    return weight
Exemplo n.º 31
0
    def to_seqrecord(self):
        """Build a SeqRecord equivalent of this Sequence instance.

        The accession (as a string), symbol and name become the record's id,
        name and description; None-valued ones are left out. If a domain
        architecture is present, its domains become the record's features.

        Everything without a SeqRecord equivalent is stored in the
        seqrecord.annotations dictionary, laid out like so (None-valued
        entries are dropped)::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{ 'ref':      ann.ref,
                                'source':   ann.source,
                                'evidence': ann.evidence,
                                'type':     ann.type,
                                'confidence': [ ann.confidence.value,
                                                ann.confidence.type ],
                                'properties': [{ 'value': prop.value,
                                                 'ref': prop.ref,
                                                 'applies_to': prop.applies_to,
                                                 'datatype':   prop.datatype,
                                                 'unit':       prop.unit,
                                                 'id_ref':     prop.id_ref }
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }
        """
        def without_nones(mapping):
            """Copy a dictionary, leaving out the None-valued items."""
            kept = {}
            for key, val in mapping.items():
                if val is not None:
                    kept[key] = val
            return kept

        def pack_uri(uri):
            """Flatten a URI object into a plain dictionary."""
            return without_nones({
                'value': uri.value,
                'desc': uri.desc,
                'type': uri.type,
            })

        def pack_property(prop):
            """Flatten one annotation property into a plain dictionary."""
            return without_nones({
                'value': prop.value,
                'ref': prop.ref,
                'applies_to': prop.applies_to,
                'datatype': prop.datatype,
                'unit': prop.unit,
                'id_ref': prop.id_ref
            })

        def pack_annotation(ann):
            """Flatten one annotation, including its properties."""
            return without_nones({
                'ref': ann.ref,
                'source': ann.source,
                'evidence': ann.evidence,
                'type': ann.type,
                # a falsy confidence is passed through unchanged
                'confidence': ann.confidence and [ann.confidence.value,
                                                  ann.confidence.type],
                'properties': [pack_property(prop)
                               for prop in ann.properties],
            })

        sequence = Seq(self.mol_seq.value, self.get_alphabet())
        record_fields = without_nones({
            'id': str(self.accession),
            'name': self.symbol,
            'description': self.name,
            # 'dbxrefs': None,
        })
        seqrec = SeqRecord(sequence, **record_fields)

        if self.domain_architecture:
            features = []
            for dom in self.domain_architecture.domains:
                features.append(dom.to_seqfeature())
            seqrec.features = features

        # Sequence attributes with no SeqRecord equivalent
        seqrec.annotations = without_nones({
            'id_ref': self.id_ref,
            'id_source': self.id_source,
            'location': self.location,
            'uri': self.uri and pack_uri(self.uri),
            'annotations': self.annotations and [
                pack_annotation(ann) for ann in self.annotations
            ],
        })
        return seqrec
Exemplo n.º 32
0
    def __next__(self):
        """Parse the next alignment from the handle and return it.

        Parsing starts from the header line saved by the previous call
        (``self._header``) if there is one, otherwise from a freshly read
        line. The first block fixes the record identifiers and the columns
        holding the sequence data; every later block must repeat the
        identifiers in the same order. Parsing stops at the end of the file
        or when another known header is met, which is saved in
        ``self._header`` for the next call.

        Returns a MultipleSeqAlignment. Its ``_version`` attribute is set
        when the header contains a version number, and ``_star_info`` holds
        the consensus line when the file has one.

        Raises StopIteration when the handle is exhausted (or no sequence
        data was collected). Raises ValueError for a blank or unknown header,
        a header with nothing after it, a line that does not split into two
        or three fields, a bad letter count, identifiers out of order, a
        block cut short by the end of the file, or a record count different
        from ``self.records_per_alignment``. Several other structural
        expectations are only checked with assert statements.
        """
        handle = self.handle
        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            raise StopIteration

        #Whitelisted headers we know about
        known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS']
        header_words = line.split()
        if not header_words:
            #A blank line has no first word to report
            raise ValueError(
                "Expected a known CLUSTAL header (%s), got a blank line" %
                ", ".join(known_headers))
        if header_words[0] not in known_headers:
            raise ValueError(
                "%s is not a known CLUSTAL header: %s" %
                (header_words[0], ", ".join(known_headers)))

        # find the clustal version in the header line
        version = None
        for word in header_words:
            if word[0] == '(' and word[-1] == ')':
                word = word[1:-1]
            #word may be empty now (the header contained "()")
            if word and word[0] in '0123456789':
                version = word
                break

        #There should be two blank lines after the header line
        line = handle.readline()
        while line and line.strip() == "":
            line = handle.readline()
        if not line:
            #readline() keeps returning "" at end of file, so without this
            #check a file holding only a header would loop forever.
            raise ValueError("No alignment found after the header line")

        #NOTE(review): an older comment here spoke of a dictionary based
        #parser merging entries that share an identifier; ids and seqs are
        #lists matched by position, so that may no longer apply - confirm.
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None  # Used to extract the consensus

        #Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                #Sequences identifier...
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                #Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            elif line[0] == " ":
                #Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                #No consensus
                break
            line = handle.readline()
            if not line:
                break  # end of file

        assert line.strip() == ""
        assert seq_cols is not None

        #Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        #Loop over any remaining blocks...
        done = False
        while not done:
            #There should be a blank line between each block.
            #Also want to ignore any consensus line from the
            #previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if line.split(None, 1)[0] in known_headers:
                #Found concatenated alignment.
                done = True
                self._header = line
                break

            for i in range(len(ids)):
                if not line:
                    #End of file in the middle of a block
                    raise ValueError(
                        "Unexpected end of file, expected a line for '%s'"
                        % ids[i])
                assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % repr(line))

                if fields[0] != ids[i]:
                    raise ValueError(
                        "Identifiers out of order? Got '%s' but expected '%s'"
                        % (fields[0], ids[i]))

                if fields[1] != line[seq_cols]:
                    #The sequence must start in the same column as before,
                    #only its length (e.g. a shorter last block) may change
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    assert start == seq_cols.start, 'Old location %s -> %i:XX' % (
                        seq_cols, start)
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                #Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    #Here the count is cumulative over all blocks so far
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)

                #Read in the next line
                line = handle.readline()
            #There should now be a consensus line
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            raise StopIteration

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
                   for (i, s) in zip(ids, seqs))
        alignment = MultipleSeqAlignment(records, self.alphabet)
        #TODO - Handle alignment annotation better, for now
        #mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            alignment_length = len(seqs[0])
            assert len(consensus) == alignment_length, \
                   "Alignment length is %i, consensus length is %i, '%s'" \
                   % (alignment_length, len(consensus), consensus)
            alignment._star_info = consensus
        return alignment
Exemplo n.º 33
0
class ProteinAnalysis(object):
    """Class containing methods for protein analysis.

    The constructor takes two arguments.
    The first is the protein sequence as a string, which is upper-cased and
    then converted to a sequence object using the Bio.Seq module. This is done
    just to make sure the sequence is a protein sequence and not anything else.

    The second argument is optional. If set to True, the weight of the amino
    acids will be calculated using their monoisotopic mass (the weight of the
    most abundant isotopes for each element), instead of the average molecular
    mass (the averaged weight of all stable isotopes for each element).
    If set to false (the default value) or left out, the IUPAC average
    molecular mass will be used for the calculation.

    """
    def __init__(self, prot_sequence, monoisotopic=False):
        """Store the upper-cased sequence and reset the cached counts."""
        # Always upper-case: the lookup tables used by the methods are keyed
        # by upper-case letters, so mixed-case input must be normalised too.
        self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
        self.amino_acids_content = None
        self.amino_acids_percent = None
        self.length = len(self.sequence)
        self.monoisotopic = monoisotopic

    def count_amino_acids(self):
        """Count standard amino acids, returns a dict.

        Counts the number times each amino acid is in the protein
        sequence. Returns a dictionary {AminoAcid:Number}.

        The return value is cached in self.amino_acids_content.
        It is not recalculated upon subsequent calls.
        """
        if self.amino_acids_content is None:
            self.amino_acids_content = dict(
                (aa, self.sequence.count(aa))
                for aa in IUPACData.protein_letters)

        return self.amino_acids_content

    def get_amino_acids_percent(self):
        """Calculate the amino acid content as fractions of the sequence.

        The same as count_amino_acids, only each count is divided by the
        length of the sequence (so the values lie between 0 and 1).
        Returns a dictionary of {AminoAcid:fraction}.

        The return value is cached in self.amino_acids_percent.
        """
        if self.amino_acids_percent is None:
            aa_counts = self.count_amino_acids()
            total = float(self.length)
            self.amino_acids_percent = dict(
                (aa, aa_counts[aa] / total) for aa in aa_counts)

        return self.amino_acids_percent

    def molecular_weight(self):
        """Calculate MW from Protein sequence.

        Sums the weight of every residue from the IUPAC table selected by
        self.monoisotopic and subtracts one water for each peptide bond.
        """
        if self.monoisotopic:
            water = 18.010565
            iupac_weights = IUPACData.monoisotopic_protein_weights
        else:
            water = 18.0153
            iupac_weights = IUPACData.protein_weights

        # The table holds the weights of the free amino acids; each of the
        # (n - 1) peptide bonds releases one molecule of water.
        residues = sum(iupac_weights[aa] for aa in self.sequence)
        return residues - (len(self.sequence) - 1) * water

    def aromaticity(self):
        """Calculate the aromaticity according to Lobry, 1994.

        Calculates the aromaticity value of a protein according to Lobry, 1994.
        It is simply the relative frequency of Phe+Trp+Tyr.
        """
        aa_percentages = self.get_amino_acids_percent()
        return sum(aa_percentages[aa] for aa in 'YWF')

    def instability_index(self):
        """Calculate the instability index according to Guruprasad et al 1990.

        Implementation of the method of Guruprasad et al. 1990 to test a
        protein for stability. Any value above 40 means the protein is unstable
        (has a short half life).

        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).
        """
        index = ProtParamData.DIWV
        score = 0.0

        # add up the weight of every pair of neighbouring residues
        for i in range(self.length - 1):
            first, second = self.sequence[i:i+2]
            score += index[first][second]

        return (10.0 / self.length) * score

    def flexibility(self):
        """Calculate the flexibility according to Vihinen, 1994.

        No argument to change window size because parameters are specific for a
        window=9. The parameters used are optimized for determining the flexibility.

        Returns a list with one score for each window of nine residues, i.e.
        length - 8 values (an empty list for shorter sequences).
        """
        flexibilities = ProtParamData.Flex
        window_size = 9
        # weights from the window edge towards the centre; the last entry is
        # the weight of the central residue
        weights = [0.25, 0.4375, 0.625, 0.8125, 1]
        half = window_size // 2
        scores = []

        # "+ 1" so that the window ending on the last residue is included
        for i in range(self.length - window_size + 1):
            subsequence = self.sequence[i:i+window_size]
            score = 0.0

            for j in range(half):
                front = subsequence[j]
                back = subsequence[window_size - j - 1]
                score += (flexibilities[front] + flexibilities[back]) * weights[j]

            # The central residue is at index 4 (the pairs above cover
            # 0-3 and 5-8) and has a weight of 1.
            middle = subsequence[half]
            score += flexibilities[middle]

            # 5.25 is the sum of all nine weights: 2 * 2.125 + 1
            scores.append(score / 5.25)

        return scores

    def gravy(self):
        """Calculate the gravy according to Kyte and Doolittle."""
        total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence)

        return total_gravy / self.length

    def _weight_list(self, window, edge):
        """Makes a list of relative weight of the
        window edges compared to the window center. The weights are linear.
        it actually generates half a list. For a window of size 9 and edge 0.4
        you get a list of [0.4, 0.55, 0.7, 0.85].

        A window smaller than two residues has no edges, so the list is empty.
        """
        half = window // 2
        if half == 0:
            # avoids dividing by (window - 1) == 0 for a window of one
            return []

        unit = 2 * (1.0 - edge) / (window - 1)
        return [edge + unit * i for i in range(half)]

    def protein_scale(self, param_dict, window, edge=1.0):
        """Compute a profile by any amino acid scale.

        An amino acid scale is defined by a numerical value assigned to each type of
        amino acid. The most frequently used scales are the hydrophobicity or
        hydrophilicity scales and the secondary structure conformational parameters
        scales, but many other scales exist which are based on different chemical and
        physical properties of the amino acids.  You can set several parameters that
        control the computation  of a scale profile, such as the window size and the
        window edge relative weight value.

        WindowSize: The window size is the length
        of the interval to use for the profile computation. For a window size n, we
        use the i-(n-1)/2 neighboring residues on each side to compute
        the score for residue i. The score for residue i is the sum of the scaled values
        for these amino acids, optionally weighted according to their position in the
        window.

        Edge: The central amino acid of the window always has a weight of 1.
        By default, the amino acids at the remaining window positions have the same
        weight, but you can make the residue at the center of the window  have a
        larger weight than the others by setting the edge value for the  residues at
        the beginning and end of the interval to a value between 0 and 1. For
        instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
        1.0, 0.7, 0.4.

        The method returns a list of values which can be plotted to
        view the change along a protein sequence.  Many scales exist. Just add your
        favorites to the ProtParamData modules.

        Letters missing from param_dict do not raise: a warning is written to
        sys.stderr and the affected pair (or middle residue) adds nothing to
        the score of that window.

        Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
        """
        # generate the weights
        #   _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4]
        #   what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done
        #   in the loop.
        weights = self._weight_list(window, edge)
        scores = []

        # the score in each Window is divided by the sum of weights
        # (* 2 + 1) since the weight list is one sided:
        sum_of_weights = sum(weights) * 2 + 1

        for i in range(self.length - window + 1):
            subsequence = self.sequence[i:i+window]
            score = 0.0

            for j in range(window // 2):
                # walk from the outside of the Window towards the middle.
                # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid
                try:
                    front = param_dict[subsequence[j]]
                    back = param_dict[subsequence[window - j - 1]]
                    score += weights[j] * front + weights[j] * back
                except KeyError:
                    sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
                             (subsequence[j], subsequence[window - j - 1]))

            # Now add the middle value, which always has a weight of 1.
            middle = subsequence[window // 2]
            if middle in param_dict:
                score += param_dict[middle]
            else:
                sys.stderr.write('warning: %s  is not a standard amino acid.\n' % (middle))

            scores.append(score / sum_of_weights)

        return scores

    def isoelectric_point(self):
        """Calculate the isoelectric point.

        Uses the module IsoelectricPoint to calculate the pI of a protein.
        """
        aa_content = self.count_amino_acids()

        ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
        return ie_point.pi()

    def secondary_structure_fraction(self):
        """Calculate fraction of helix, turn and sheet.

        Returns the fraction of amino acids which tend
        to be in Helix, Turn or Sheet.

        Amino acids in helix: V, I, Y, F, W, L.
        Amino acids in Turn: N, P, G, S.
        Amino acids in sheet: E, M, A, L.

        Returns a tuple of three fractions (Helix, Turn, Sheet).
        """
        aa_percentages = self.get_amino_acids_percent()

        helix = sum(aa_percentages[r] for r in 'VIYFWL')
        turn = sum(aa_percentages[r] for r in 'NPGS')
        sheet = sum(aa_percentages[r] for r in 'EMAL')

        return helix, turn, sheet