Exemplo n.º 1
0
def recode_sequence(sequence, converter, positions=None, code="Standard"):
    """Recode *sequence* motif by motif using the rules in *converter*.

    *converter* is a dictionary whose keys are motifs (codons) and whose
    values are the corresponding recoded motifs. The motif size is taken
    from one of its keys; all keys are assumed to have the same length.

    *positions*, if given, holds the 1-based positions within a motif that
    are recoded; the other positions keep their original character.
    By default, all positions are recoded. The dictionary passed by the
    caller is never modified: the position filter is applied to a copy.

    *code* is either a dictionary, or a string naming a genetic code in
    Biopython (it is then resolved using getBiopythonCode()).

    Raises P4Error if *converter* is empty or if the length of *sequence*
    is not a multiple of the motif size. A motif that cannot be turned into
    a Codon is reported on stderr and replaced by indels; if even the indel
    motif fails, the process exits with status 1.

    NOTE(review): the body visible here stops after building each Codon;
    see the note at the end of the loop.
    """
    gm = ['p4.code_utils.recode_sequence()']
    if isinstance(code, str):
        code = getBiopythonCode(code)
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    if not converter:
        gm.append("The converter dictionary should not be empty")
        raise P4Error(gm)
    # To get the size of the motifs being substituted, we look at one key
    # of the dictionary (next(iter(...)) works with Python 2 and Python 3).
    subst_size = len(next(iter(converter)))
    if len(sequence) % subst_size != 0:
        gm.append("The length of the sequence should be a multiple of %i" %
                  subst_size)
        raise P4Error(gm)
    if positions is not None:
        # Filter the converter into a new dictionary, so that the caller's
        # dictionary can safely be reused with other *positions*.
        # Replace the positions to be recoded by the converted codon,
        # but keep the others.
        converter = {
            codon: "".join(
                (convert[i - 1] if i in positions else codon[i - 1])
                for i in range(1, subst_size + 1))
            for codon, convert in converter.items()
        }
    # Build the recoded version of the sequence.
    new_seq = ""
    # Loop over the codons (triplets, if subst_size == 3).
    for start in range(0, len(sequence), subst_size):
        stop = start + subst_size
        try:
            # Make a Codon instance (to convert it afterwards).
            codon = Codon(sequence[start:stop], code)
        except CodonTranslationError as e:
            sys.stderr.write("%s\nProblem at sequence slice %i:%i\n" %
                             (e, start, stop))
            warnings.warn("We will replace the codon by indels.\n")
            try:
                codon = Codon("-" * subst_size, code)
            except CodonTranslationError:
                sys.stderr.write(
                    "We still don't know how to translate the codon. "
                    "Bad implementation?\n")
                sys.exit(1)
        # NOTE(review): in the code visible here, *codon* is never looked up
        # in *converter*, nothing is appended to *new_seq* and nothing is
        # returned, although the function is meant to return the recoded
        # sequence. The end of the function appears to be missing; restore
        # it from the reference implementation rather than guessing.
def getDegenerateSitesMask(self,
                           transl_table=1,
                           code=None,
                           all_3rd_positions=False):
    """This method returns a mask corresponding to sites contributing to codon degeneracy.
    This is intended to be used for submatrix extraction using the noLRSall3 method,
    using :meth:`Alignment.getSubsetUsingMask` (with the option *inverse=True*, to get
    the degeneracy-free sites).

    The mask is returned as a string made of one character per site
    ("1" for a site in the mask, "0" otherwise), covering the complete
    triplets of the matrix.

    If *all_3rd_positions* is set to True, then the mask includes all 3rd codon positions
    regardless of their effective contribution to codon degeneracy.

    The matrix is expected to start at a first codon position and stop at a third
    codon position.

    *transl_table* is an integer used to determine under which genetic code
    the codons are to be interpreted. The default value of 1 corresponds to the
    standard genetic code. Other values can be found in p4.GeneticCode.py

    Alternatively, the genetic code can be provided directly, using a
    dictionary *code* whose keys are codons, and the values are the
    corresponding amino-acids. All triplets present in the matrix should also
    be present in the code dictionary, except triplets of indels and triplets
    containing 'n' (those are all treated as the unknown amino-acid 'x').
    Codons and the corresponding amino-acids are expected to be in lower case
    (the triplets read from the matrix are lower-cased before lookup).
    If such a code is provided, the value of *transl_table* is ignored.

    A P4Error is raised if a triplet is found that is not defined in the code.

    The name of this method noLRSall3 comes from its effect in the case of the
    standard genetic code: it discards the sites participating in first
    position degeneracy for leucine (L) and arginine (R), first and second
    position degeneracy for serine (S), as well as all third codon positions
    where degeneracy is observed (or all of them if *all_3rd_positions* is True).
    Depending on the genetic code used, the type of amino-acid affected could
    be different.

    The goal of the submatrix extraction using the produced mask is to remove
    the sites that could have been affected by composition bias: mutations
    within a set of synonymous codons are more likely to favour the codons that
    conform to the general nucleotide composition.  However, one could argue
    that this bias is less likely to have played when the observed codons
    differ by more than one nucleotide and at least a non-synonymous mutation
    has to occur to bridge the gap. With the standard genetic code, this occurs
    for serine codons.  Indeed, the minimal mutation paths connecting the
    serine AGY and TCN codon categories are
    AGY (serine) <-> TGY (cysteine) <-> TCY (serine)
    and
    AGY (serine) <-> ACY (threonine) <-> TCY (serine)

    The current implementation (as of june 2012) does not check that a
    mutational path between synonymous codons exists, that consists only in
    synonymous point mutations. This may be considered as a bug, because you
    may not want AGY and TCN (or other similar cases that could occur with
    different genetic codes) to be considered as a single degeneracy continuum.
    """

    gm = ["Alignment.getDegenerateSitesMask()"]

    if code is None:
        # Use the generalized Code class defined in code_utils.py
        code = Code(transl_table).code

    # Floor division: range() needs an integer, also under Python 3.
    n_codons = self.length // 3
    mask_parts = []
    # Loop over the successive triplets of sites.
    for c in range(n_codons):
        # 3 alignment slices. One for each codon position.
        slices = [self.sequenceSlice(3 * c + offset) for offset in (0, 1, 2)]
        # The different codons found for the current triplet of sites.
        # These are not Codon instances, this probably doesn't deal
        # properly with ambiguity codes.
        codons = set(("%s%s%s" % nnn).lower() for nnn in zip(*slices))
        # Record the amino-acids coded at the 3 nucleotides site,
        # and the codons used for each of them.
        aas_codons = {}
        for codon in codons:
            # Determine the corresponding amino-acid.
            if codon == '---':
                aa = '-'
            elif codon in code:
                aa = code[codon]
            elif 'n' in codon:
                # This is a simplification. Some "degenerate" codons
                # can still code an unambiguous amino-acid.
                aa = 'x'
            else:
                gm.append("Codon %s is not defined in the chosen code "
                          "or translation table." % codon)
                gm.append("%s" % str(code))
                raise P4Error(gm)
            # Record the codon used for the aa.
            aas_codons.setdefault(aa, []).append(codon)
        # Determine which positions in the triplet are degenerate.
        # The 3rd position starts in the mask when all_3rd_positions is set,
        # so that it is included even when no amino-acid of this triplet
        # is coded by several codons (as promised by the docstring).
        codon_mask = [False, False, bool(all_3rd_positions)]
        # Loop over the codons recorded for each amino-acid.
        for used_codons in aas_codons.values():
            if len(used_codons) > 1:
                # Several codons have been found at this triplet for this
                # amino-acid. A position is degenerate if more than one
                # nucleotide is observed there among these codons.
                for pos in range(3):
                    if len(set(cod[pos] for cod in used_codons)) > 1:
                        codon_mask[pos] = True
            if all(codon_mask):
                # All positions of the triplet have been found to contribute to
                # some codon degeneracy somewhere in the alignment.
                # There is no need to search further.
                break
        # Append the codon mask to the mask.
        mask_parts.append("".join("1" if flag else "0" for flag in codon_mask))
    return "".join(mask_parts)
def _tf_report_field(line, key, end_key):
    """Return the text of *line* located after the first occurrence of *key*
    and before the first occurrence of *end_key*, dropping the single
    separator character that precedes *end_key*."""
    start = line.index(key) + len(key)
    return line[start:line.index(end_key) - 1]


def treeFinderMAPAnalysis(alignment,
                          groups,
                          gamma=True,
                          invariant=True,
                          bootstrap=False,
                          nreplicates=100,
                          remove_files=False,
                          run_analysis=True,
                          verbose=False):
    """
    Uses TreeFinder to estimate a Maximum Likelihood tree using the MAP
    substitution model for grouped amino-acids.

    - *alignment*: p4 alignment object of original (un-recoded) protein data from
      which the "groups" are derived
    - *groups*: list of grouped amino-acids, possibly resulting from
      :meth:`Alignment.getKosiolAISGroups()` or :meth:`Alignment.getMinmaxChiSqGroups()`
    - *gamma*: include gamma distribution of among-site rate variation
    - *invariant*: include a proportion of invariant sites
    - *bootstrap*: run bootstrap analysis
    - *nreplicates*: number of bootstrap replicates
    - *remove_files*: remove analysis files. Only available if run_analysis=True
    - *run_analysis*: if True, write the data and control files and run
      TreeFinder (the "tf" executable must be in $PATH, otherwise a P4Error
      is raised). If False, the TreeFinder commands are only printed.
    - *verbose*: print progress messages, let TreeFinder's stderr through,
      and draw the resulting tree

    Returns a tuple *(tree, results)* where *tree* is the tree read from the
    TreeFinder report and *results* is a dictionary of the values parsed
    from that report ("Likelihood", "Phylogeny", "AIC", ...).
    Returns *(None, None)* when *run_analysis* is False.

    Raises P4Error if an argument is invalid, if "tf" cannot be found, or if
    TreeFinder exits with a non-zero return code.
    """

    gm = ["p4.alignment_recoding.treeFinderMAPAnalysis()"]

    if not isinstance(alignment, Alignment):
        gm.append("alignment must be a Alignment object")
        raise P4Error(gm)

    if alignment.dataType != "protein":
        gm.append("alignment should be the original protein data from "
                  "which the groups were defined. Doing nothing.")
        raise P4Error(gm)

    # Report the name of the offending parameter, not its value.
    flags = [("gamma", gamma), ("invariant", invariant),
             ("bootstrap", bootstrap), ("remove_files", remove_files),
             ("run_analysis", run_analysis), ("verbose", verbose)]
    for flag_name, flag_value in flags:
        if not isinstance(flag_value, bool):
            gm.append("%s value must be either True or False" % flag_name)
            raise P4Error(gm)

    if not isinstance(nreplicates, int):
        gm.append("nreplicates must be an integer")
        raise P4Error(gm)

    if run_analysis and not func.which2("tf"):
        gm.append("tf (treefinder) is not in your $PATH. "
                  "Cannot run analysis")
        raise P4Error(gm)

    datafile_name = "tf_data.phy"

    #tf commands
    tls = """ReconstructPhylogeny[
             "%(datafile)s",
             SubstitutionModel->MAP[%(map)s][Optimum,Optimum]%(ifH)s,
             WithEdgeSupport->%(bootstrap)s%(nreplicates)s
             ],
             "%(outfile)s",SaveReport"""
    od = {}
    od["datafile"] = datafile_name
    # Rate heterogeneity suffix: ":GI[Optimum]", ":G[Optimum]",
    # ":I[Optimum]" or nothing.
    rate_flags = ("G" if gamma else "") + ("I" if invariant else "")
    od["ifH"] = (":%s[Optimum]" % rate_flags) if rate_flags else ""
    if bootstrap:
        od["bootstrap"] = "True"
        od["nreplicates"] = ",NReplicates->%i" % nreplicates
    else:
        od["bootstrap"] = "False"
        od["nreplicates"] = ""
    od["outfile"] = "tf_reconstruction.output"
    od["map"] = ",".join('"%s"' % group.upper() for group in groups)

    if not run_analysis:
        # Only show the commands that would have been given to TreeFinder.
        print(tls % od)
        return (None, None)

    #Write data file
    alignment.writePhylip(datafile_name)

    #Write control file
    tl_file = "tf_control.tl"
    with open(tl_file, "w") as fh:
        fh.write(tls % od)

    # Unless verbose, TreeFinder's stderr is discarded. The handle on the
    # null device is closed once the child process has finished.
    devnull = None if verbose else open(os.devnull, "w")
    try:
        child = subprocess.Popen(
            "tf %s" % tl_file,
            stderr=subprocess.STDOUT if verbose else devnull,
            shell=True)

        if verbose:
            print("Running TreeFinder, this could take some time...", end=' ')
            sys.stdout.flush()

        child.communicate()
    finally:
        if devnull is not None:
            devnull.close()

    if verbose:
        print("done.")
        sys.stdout.flush()

    if child.returncode != 0:
        msg = "TreeFinder returned error code %s"
        gm.append(msg % (child.returncode))
        raise P4Error(gm)

    # The results are parsed from the second line of the report.
    with open(od["outfile"], "r") as fh:
        line = fh.readlines()[1]

    rd = {}
    #Likelihood (runs up to the first comma of the line)
    rd["Likelihood"] = float(
        line[line.index("Likelihood->") + len("Likelihood->"):line.index(",")])
    #Tree
    rd["Phylogeny"] = _tf_report_field(line, "Phylogeny->",
                                       "SubstitutionModel->")
    #SubstitutionModel
    rd["SubstitutionModel"] = _tf_report_field(line, "SubstitutionModel->",
                                               "OSubstitutionModel->")
    #OSubstitutionModel
    rd["OSubstitutionModel"] = _tf_report_field(line, "OSubstitutionModel->",
                                                "OEdgeOptimizationOff->")
    #NSites
    rd["Nsites"] = int(_tf_report_field(line, "NSites->", "NParameters->"))
    #NParameters
    rd["NParameters"] = int(_tf_report_field(line, "NParameters->", "AIC->"))
    #AIC
    rd["AIC"] = float(_tf_report_field(line, "AIC->", "AICc->"))
    #AICc
    rd["AICc"] = float(_tf_report_field(line, "AICc->", "HQ->"))
    #HQ
    rd["HQ"] = float(_tf_report_field(line, "HQ->", "BIC->"))
    #BIC
    rd["BIC"] = float(_tf_report_field(line, "BIC->", "Checksum->"))
    #LikelihoodTime
    rd["LikelihoodTime"] = float(
        _tf_report_field(line, "LikelihoodTime->", "LikelihoodMemory->"))
    #LikelihoodMemory (last field: the 3 trailing characters are dropped)
    rd["LikelihoodMemory"] = int(
        line[line.index("LikelihoodMemory->") + len("LikelihoodMemory->"):-3])

    #Make a tree object: turn the TreeFinder tree into a Newick-like string
    tree = rd["Phylogeny"].replace("{", "(").replace("}", ")")
    tree = tree.replace("\"", "") + ";"
    if bootstrap:
        #Tree viewer has the brlen before bootstrap value plus an extra colon
        # turn "xxx):0.00001:87.999,yyy" into "xxx)87.999:0.00001,yyy"
        patt = re.compile(r"\):([0-9]+\.[0-9e-]+):([0-9]+\.[0-9e-]*)")
        tree = re.sub(patt, r")\2:\1", tree)
    # The tree is given to read() as a string, not a file name: silence
    # the corresponding warning, and restore the setting even on error.
    origw = var.warnReadNoFile
    var.warnReadNoFile = False
    try:
        read(tree)
    finally:
        var.warnReadNoFile = origw
    result_tree = var.trees.pop()
    if bootstrap:
        #Round floats to whole percentages
        for node in result_tree.iterInternalsNoRoot():
            node.name = "%2.f" % float(node.name)

    if remove_files:
        os.remove(tl_file)
        os.remove(datafile_name)
        os.remove(od["outfile"])

    if verbose:
        print("\n")
        result_tree.draw()
        print("\nLikelihood: %.4f\n" % rd["Likelihood"])

    return result_tree, rd
def pseudoTranslate(self, transl_table=1, out_type="standard", code=None):
    """Returns a pseudo protein alignment from *self*, a DNA alignment.
    The result is of datatype standard instead of protein, which allows
    the use of special recodings, like distinguishing between two types
    of serines, like in :meth:`Alignment.recode23aa()`.

    *self* is translated using :attribute:`Code(transl_table).code`.

    Alternatively, the genetic code can be provided through the parameter *code*.
    If such a code is provided, the value of *transl_table* is ignored,
    and *out_type* is forced to "standard".
    The parameter *code* can take two types of values:
    1) It can be a string naming the code to use, as defined in Biopython's
    `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary *code* whose keys are codons, and the values are
    the corresponding amino-acids. All triplets present in the matrix should
    also be present in the code dictionary, except triplets of indels. Codons
    and the corresponding amino-acids are expected to be in lower case.
    It may be possible to use a code based on another codon length as 3,
    but this has not been tested as of June 2012.


    At the moment, we can only do translations where the sequences are phased
    with the coding frame, ie the first sequence position is the first position
    of the codon, and the last sequence position should be a last codon position.

    The default behaviour is to use translation table 1, that is the standard genetic code.
    Other available translation tables, this week::

        if transl_table == 1: # standard
        elif transl_table == 2: # vertebrate mito
        elif transl_table == 4: # Mold, Protozoan,
                                # and Coelenterate Mitochondrial Code
                                # and the Mycoplasma/Spiroplasma Code
        elif transl_table == 5: # invertebrate mito
        elif transl_table == 9: # echinoderm mito

        and now 6, 10, 11, 12, 13, 14, 21.

    (These are found in p4.GeneticCode.py or in :class:`Code`)

    *transl_table* may also be provided as text consisting in blank-separated elements.
    Each element consists in n characters, where n is the number of defined codons.
    The first element lists the coded (pseudo-)amino-acids.
    The second element describes whether a codon can be a start codon ('M') or not ('-').
    The other elements correspond to the (pseudo-)nucleotides at the successive codon positions.
    Example::
        FFJJZZZZYY**CC*WBBBBPPPPHHQQUUUUIIIMTTTTNNKKXXOOVVVVAAAADDEEGGGG
        ---M---------------M------------MMMM---------------M------------
        TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
        TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
        TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG

    A P4Error is raised if *self* is not a DNA alignment, if its length is
    not a multiple of the codon length, if the provided *code* is empty, or
    if a codon is incomplete (partial indel) or unknown.
    """

    gm = ['p4.alignment_recoding.pseudoTranslate()']
    if self.dataType != 'dna':
        gm.append("Self should be a DNA alignment")
        raise P4Error(gm)

    if code is None:
        code = Code(transl_table, in_type="dna", out_type=out_type).code
        codelength = Code(transl_table).codelength
    else:
        if isinstance(code, str):
            code = getBiopythonCode(code)  # defined in code_utils.py
        if not code:
            gm.append("The provided code should define at least one codon")
            raise P4Error(gm)
        # We assume that the "codons" have all the same length,
        # and we look at one codon of the dictionary to know this length
        # (next(iter(...)) works with Python 2 and Python 3).
        codelength = len(next(iter(code)))
        # We use standard type, because, depending on the code used to make the translation,
        # we may get something that contains symbols not corresponding to normal amino-acids.
        out_type = "standard"

    if self.length % codelength != 0:
        gm.append("The length of self should be a multiple of %i" % codelength)
        raise P4Error(gm)

    ali = self.dupe()
    ali.dataType = out_type
    # Floor division: the length must stay an integer, also under Python 3.
    ali.length = self.length // codelength
    ali.symbols = "".join(sorted(set(code.values())))
    ali.equates = {}
    ali.dim = len(ali.symbols)
    ali.nexusSets = None
    ali.parts = []
    ali.excludeDelete = None
    for seq in ali.sequences:
        # Initialize an all-gap sequence (as a list, to fill it in place).
        seq.sequence = ['-'] * ali.length
        seq.dataType = out_type

    full_indel = '-' * codelength
    for i, dna in enumerate(self.sequences):
        # the original sequence
        dnaSeq = dna.sequence
        # the future pseudo-translation
        pseudoProtSeq = ali.sequences[i].sequence
        for j in range(ali.length):
            start = j * codelength
            theCodon = dnaSeq[start:start + codelength]
            if theCodon in code:
                pseudoProtSeq[j] = code[theCodon]
            elif theCodon == full_indel:
                # full indel
                pseudoProtSeq[j] = '-'
            elif theCodon.count('-'):
                # partial indel
                gm.append(
                    "    seq %i, position %4i, dnaSeq %4i, codon '%s' is incomplete"
                    % (i, j, start, theCodon))
                raise P4Error(gm)
            else:
                # Should we use a CodonTranslationError (defined in code_utils.py) here ?
                gm.append(
                    "    seq %i position %4i, dnaSeq %4i, codon '%s' is not a known codon"
                    % (i, j, start, theCodon))
                raise P4Error(gm)

    for seq in ali.sequences:
        # Convert from list to string.
        seq.sequence = "".join(seq.sequence)
    return ali