Exemplo n.º 1
0
def get_all_possible_binary(mutations,encoding_table,skip_sanity_checks=False):
    """
    Enumerate the binary form of every genotype a mutations list allows.

    The genotypes are first generated from `mutations` and then converted
    with `encoding_table`. No ordering of the output is promised; sort the
    result if order matters.

    Parameters
    ----------
    mutations : list
        List of lists containing allowable mutations at each site.
    encoding_table : pandas.DataFrame
        DataFrame that encodes the binary representation of each mutation
        (see the `get_encoding_table` function).
    skip_sanity_checks : bool
        if True, `mutations` is used without being validated
        (default: False)

    Returns
    -------
    all_possible_binary : list
        binary genotypes, as produced by `genotypes_to_binary`, for every
        genotype that can be built from the mutations list
    """

    if not skip_sanity_checks:
        check.mutations_sanity(mutations)

    # Validation has either been done or deliberately waived above, so both
    # helpers are told not to repeat it.
    genotypes = mutations_to_genotypes(mutations,skip_sanity_checks=True)

    return genotypes_to_binary(genotypes,
                               encoding_table,
                               skip_sanity_checks=True)
Exemplo n.º 2
0
def mutations_to_genotypes(mutations,skip_sanity_checks=False):
    """
    Build every genotype that can be assembled from a mutations list.

    Parameters
    ----------
    mutations : list
        List of lists containing allowable mutations at each site.
    skip_sanity_checks : bool
        if True, `mutations` is used without being validated
        (default: False)

    Returns
    -------
    genotype : list
        one string per combination of states, taking one state from each
        site in the mutations list
    """

    if not skip_sanity_checks:
        check.mutations_sanity(mutations)

    # The cartesian product picks one state per site; joining the states of
    # each combination gives the genotype string.
    return list(map("".join,itertools.product(*mutations)))
Exemplo n.º 3
0
def genotype_is_in(genotypes,mutations,skip_sanity_checks=False):
    """
    Determine whether genotypes are within the volume covered by mutations list.

    A genotype is "in" when it has exactly one state per site and the state
    at every site is listed among the allowed states for that site.

    Parameters
    ----------
    genotypes : str or list of str
        genotype(s) to check. A single str (or instance of a str subclass)
        is treated as a list holding that one genotype.
    mutations : list
        List of lists containing allowable mutations at each site.
    skip_sanity_checks : bool
        whether or not to check sanity of inputs (default: false)

    Returns
    -------
    result : list of bool
        one entry per genotype, in input order. A list is returned even
        when a single str was passed in.

    Raises
    ------
    TypeError
        if checking a genotype raises a TypeError (for example, an entry
        that has no length, such as an int).
    """

    # If a single genotype, convert to a list. isinstance (rather than an
    # exact type comparison) also wraps str subclasses; otherwise such a
    # genotype would be iterated over one character at a time.
    if isinstance(genotypes,str):
        genotypes = [genotypes]

    # Check sanity of mutations
    if not skip_sanity_checks:
        check.mutations_sanity(mutations)

    out = []
    for g in genotypes:

        try:
            # Wrong length means not in; otherwise every site's state must
            # be one of the states allowed at that site.
            is_in = (len(g) == len(mutations) and
                     all(g[j] in allowed
                         for j, allowed in enumerate(mutations)))
        except TypeError as e:
            err = f"Problem with genotype '{g}'\n"
            err += "genotype must be a single str (one genotype) or list of \n"
            err += "strings (multiple genotypes)\n"
            raise TypeError(err) from e

        out.append(is_in)

    return out
Exemplo n.º 4
0
def get_missing_genotypes(genotype,mutations=None,skip_sanity_checks=False):
    """
    List the genotypes allowed by a mutations list that are absent from
    the given genotype list.

    Parameters
    ----------

    genotype : list
        List of genotypes.

    mutations : list
        Mutation list encoding possible states at each site. When left as
        None, it is derived from `genotype` with `genotypes_to_mutations`.

    skip_sanity_checks : bool
        if True, inputs are used without being validated (default: False)

    Returns
    -------
    missing_genotype : list
        every genotype that can be built from the mutations list but does
        not appear in `genotype`. Built from a set, so the order is
        arbitrary.
    """

    # Validate what the caller supplied (a mutations list is only checked
    # when one was actually given).
    validate = not skip_sanity_checks
    if validate:
        check.genotype_sanity(genotype)
    if validate and mutations is not None:
        check.mutations_sanity(mutations,genotype=genotype)

    # Derive the mutations list from the genotypes when none was given.
    # The helpers skip their own checks: validation already happened above
    # or was waived by the caller.
    if mutations is None:
        mutations = genotypes_to_mutations(genotype,
                                           skip_sanity_checks=True)

    possible = set(mutations_to_genotypes(mutations,skip_sanity_checks=True))
    observed = set(genotype)

    return list(possible - observed)
Exemplo n.º 5
0
    def mutations(self,mutations):
        """
        Replace the stored mutations list, rebuilding the map if it changed.

        The new list is validated against the wildtype and the genotypes
        currently held in the data frame. The stored list is only replaced
        (and `_rebuild_map` only called) when, for at least one site of the
        new list, the set of states differs from the stored one.

        Parameters
        ----------
        mutations : list
            List of list mapping sites to possible states at each site. If
            None, the list is derived from the stored genotypes and
            wildtype with `utils.genotypes_to_mutations`.
        """
        observed = self._data.loc[:,"genotype"]

        if mutations is None:
            mutations = utils.genotypes_to_mutations(observed,
                                                     wildtype=self._wildtype)
        check.mutations_sanity(mutations,self._wildtype,observed)

        # Order of states within a site is irrelevant; compare as sets.
        changed = any(set(proposed) != set(self._mutations[i])
                      for i, proposed in enumerate(mutations))
        if not changed:
            return

        self._mutations = mutations
        self._rebuild_map()
Exemplo n.º 6
0
def get_all_possible_genotypes(mutations,skip_sanity_checks=False):
    """
    Return every genotype that a mutations list allows.

    No ordering of the output is promised; sort the result if order
    matters.

    Parameters
    ----------
    mutations : list
        Mutation list encoding possible states at each site.

    skip_sanity_checks : bool
        if True, `mutations` is used without being validated
        (default: False)

    Returns
    -------
    genotype : list
        list of all genotypes possible given the mutations list
    """

    validate = not skip_sanity_checks
    if validate:
        check.mutations_sanity(mutations)

    # The helper skips its own check: validation was either done above or
    # waived by the caller.
    all_genotypes = mutations_to_genotypes(mutations,skip_sanity_checks=True)

    return all_genotypes
Exemplo n.º 7
0
    def __init__(self,
                 genotype=None,
                 wildtype=None,
                 mutations=None,
                 site_labels=None,
                 **kwargs):
        """
        Validate the inputs, store them, and build the map.

        Parameters
        ----------
        genotype : list, optional
            Genotypes to load into the map. Omitting it (or passing None)
            is the same as passing an empty list.
        wildtype : str, optional
            Wildtype sequence. If None, the first entry of `genotype` is
            used.
        mutations : list, optional
            List of lists of possible states at each site. If None, it is
            derived from `genotype` and `wildtype` with
            `utils.genotypes_to_mutations`.
        site_labels : list, optional
            Labels for each site. If None, sequential integers starting at
            0 (one per wildtype position) are used.
        **kwargs
            Each keyword becomes a column of the data frame. The value must
            be array-like with the same length as `genotype`. The name may
            not already be an attribute of the object and may not start
            with '_' or '.'.

        Raises
        ------
        ValueError
            if neither a wildtype nor a non-empty genotype list is given,
            or if a keyword argument has a reserved name or the wrong
            length.
        """

        # None stands in for "no genotypes". Creating the list here, rather
        # than in the signature, avoids one mutable default object being
        # shared by every call.
        if genotype is None:
            genotype = []

        # If no wildtype is specified, use the first genotype as the wildtype
        # sequence.
        if wildtype is None:
            if len(genotype) > 0:
                wildtype = genotype[0]
            else:
                err = "You must specify at least the wildtype sequence or a \n"
                err += "a genotype array with at least one entry\n"
                raise ValueError(err)

        # Check sanity of wildtype
        check.wildtype_sanity(wildtype)

        # Check genotypes sanity
        check.genotype_sanity(genotype,wildtype=wildtype)

        # Construct mutations list if not specified
        if mutations is None:
            mutations = utils.genotypes_to_mutations(genotype,
                                                     wildtype=wildtype)

        # Check mutations sanity
        check.mutations_sanity(mutations,
                               wildtype=wildtype,
                               genotype=genotype)

        # Construct site_labels list if not specified
        if site_labels is None:
            site_labels = list(range(len(wildtype)))

        # Check site_labels sanity
        check.site_labels_sanity(site_labels,wildtype=wildtype)

        # Start loading in the data from above
        self._wildtype = wildtype
        self._mutations = mutations
        self._site_labels = site_labels

        # Construct the master data frame for the gpmap.
        self._data = pd.DataFrame({"genotype":genotype})

        # --------------------------------------------------------------------
        # kwargs; interpreted as columns in the dataframe

        for k, column in kwargs.items():

            # Make sure the keyword is not already an attribute of self
            try:
                self.__getattribute__(k)
            except AttributeError:
                pass
            else:
                err = f"keyword '{k}' is a reserved name. Please choose a\n"
                err += "different name for this column.\n"
                raise ValueError(err)

            # Make sure we're not going to collide with a private name we
            # cannot guarantee won't exist in the future...
            if k[0] in ["_","."]:
                err = "data keywords cannot start with '_' or '.'\n"
                raise ValueError(err)

            # Make sure the kwarg has the same length as the genotypes. A
            # value with no len() at all is treated like a length mismatch.
            try:
                length_ok = len(column) == len(genotype)
            except TypeError:
                length_ok = False

            if not length_ok:
                err = "keyword arguments must specify data as an array-like\n"
                err += "data structure with the same length as the genotype\n"
                err += "array.\n"
                raise ValueError(err)

            # Passed quality control, load into main data frame
            self._data.loc[:,k] = column

        # Set all rows to "include" = True
        self._data.loc[:,"include"] = np.ones(len(genotype),dtype=bool)

        # Make set of current genotypes
        self._current_genotype = self._data.loc[:,"genotype"].copy()

        # Build encoding table and binary representation of map
        self._rebuild_map()
Exemplo n.º 8
0
def test_check_mutations_sanity():
    """
    Exercise check.mutations_sanity with inputs it must reject and inputs
    it must accept.
    """

    def rejected(*args,**kwargs):
        # The checker must refuse this input with a ValueError
        with pytest.raises(ValueError):
            check.mutations_sanity(*args,**kwargs)

    # Not a list of lists
    rejected([5,5])

    # Dict, not indexable by 0
    rejected({"1":["A"]})

    # Still a dict, so it must fail even though it can be indexed by 0 and 1
    rejected({0:["A"],1:["B"]})

    # One site has an empty list of states
    rejected([[],["A"]])

    # mismatch with wildtype
    rejected([["A","B"]],wildtype="AA")

    # Well-formed inputs: neither call may raise
    check.mutations_sanity([["A","B"]])
    check.mutations_sanity([["A","B"]],wildtype="A")

    # Genotype contains a character ("C") that is not in the mutations
    rejected([["A","B"]],wildtype="A",genotype=["C"])

    # Wildtype contains a character ("C") that is not in the mutations
    rejected([["A","B"]],wildtype="C",genotype=["A"])

    # Wildtype and genotype both drawn from the mutations: must not raise
    check.mutations_sanity([["A","B"]],wildtype="A",genotype=["B"])
Exemplo n.º 9
0
def get_encoding_table(wildtype, mutations, site_labels=None):
    """
    Build a lookup table (pandas.DataFrame) with one row per state at each
    site, giving the binary encoding of that state.

    Parameters
    ----------
    wildtype : str
        string with wildtype sequence
    mutations : list
        list of lists containing possible states at each site. (Must be
        same length as the wildtype sequence).
    site_labels : list or None
        list of labels for each site. If None, sequential integers starting
        at 0 are used. If not None, the list must have the same length as
        wildtype and all entries must be unique.

    Returns
    -------
    df : pandas.DataFrame
        the encoding table described in the Notes.

    Notes
    -----
    Within each site the wildtype state is listed first, followed by the
    remaining states in the order they appear in the mutations list.

    The table has the following columns:
    + genotype_index : position of the site in the genotype (starting at
                       zero)
    + wildtype_letter : character for wildtype at this position
    + mutation_letter : character for the state described by this row (for
                        the wildtype row, same as wildtype_letter)
    + binary_repr : binary representation of this state. A site with n
                    states uses n - 1 characters: all "0" for wildtype and
                    a single "1" for each other state. With two states this
                    is "0" (wildtype) and "1" (mutant). With three it is
                    "00", "10", "01" for wildtype, mutant1 and mutant2.
                    With wildtype and three mutants it is "000", "100",
                    "010", "001". This representation allows the map to be
                    fed into the epistasis package, which assumes each site
                    is binary.
    + binary_index_start : start of the region of the binary string
                           corresponding to this site.
    + binary_index_stop : binary_index_start plus the number of binary
                          characters used by this site.
    + mutation_index : counter, starting at 1, that gives each non-wildtype
                       state a unique number. Wildtype rows have no value.
    + site_label : label for the site. Used for constructing mutant names.
                   For example, for the mutant A73G, A would be the
                   wildtype_letter, G would be the mutation_letter, and 73
                   would be the site label.
    """

    # Validate inputs
    check.wildtype_sanity(wildtype)
    check.mutations_sanity(mutations,wildtype=wildtype)

    # Default labels are the positional indexes of the sites
    if site_labels is None:
        site_labels = list(range(len(wildtype)))

    check.site_labels_sanity(site_labels,wildtype)

    rows = []
    next_mutation_index = 1   # running number over all non-wildtype states
    binary_offset = 0         # where the current site's binary region starts

    for site, alphabet in enumerate(mutations):

        # Work on a copy so the caller's list is left untouched, and move
        # the wildtype state to the front if it is not already there.
        states = alphabet[:]
        wt_letter = wildtype[site]
        if states[0] != wt_letter:
            states.remove(wt_letter)
            states.insert(0,wt_letter)

        num_bits = len(states) - 1
        binary_stop = binary_offset + num_bits

        for position, letter in enumerate(states):

            if position == 0:
                # Wildtype: all zeros, no mutation index
                mutation_index = None
                bits = "0"*num_bits
            else:
                # Mutant: a single "1" in slot (position - 1)
                mutation_index = next_mutation_index
                next_mutation_index += 1
                bits = "0"*(position - 1) + "1" + "0"*(num_bits - position)

            rows.append({"genotype_index":site,
                         "wildtype_letter":wt_letter,
                         "mutation_letter":letter,
                         "binary_repr":bits,
                         "binary_index_start":binary_offset,
                         "binary_index_stop":binary_stop,
                         "mutation_index":mutation_index,
                         "site_label":site_labels[site]})

        binary_offset = binary_stop

    # Turn the rows into a DataFrame. The integer columns use the nullable
    # 'Int64' dtype; mutation_index holds None on wildtype rows.
    df = pd.DataFrame(rows)
    df.genotype_index = df.genotype_index.astype('Int64')
    df.mutation_index = df.mutation_index.astype('Int64')
    df.binary_index_start = df.binary_index_start.astype('Int64')
    df.binary_index_stop = df.binary_index_stop.astype('Int64')

    return df