def get_all_possible_binary(mutations,encoding_table,skip_sanity_checks=False):
    """
    Enumerate every genotype allowed by a mutations list and return each one
    in its binary representation.

    The output follows whatever order the enumeration produces; sort it if a
    specific order is needed.

    Parameters
    ----------
    mutations : list
        List of lists holding the allowable states at each site.
    encoding_table : pandas.DataFrame
        DataFrame that encodes the binary representation of each mutation
        (see the `get_encoding_table` function).
    skip_sanity_checks : bool
        if True, do not validate `mutations` before using it
        (default: False)

    Returns
    -------
    all_possible_binary : list
        binary form of every genotype that can be built from the mutations
        list, as encoded by encoding_table
    """

    # Validate the mutations list once here; the helpers below are told to
    # skip their own checks.
    if not skip_sanity_checks:
        check.mutations_sanity(mutations)

    # Expand to genotypes, then translate those genotypes to binary.
    return genotypes_to_binary(
        mutations_to_genotypes(mutations,skip_sanity_checks=True),
        encoding_table,
        skip_sanity_checks=True,
    )
def mutations_to_genotypes(mutations,skip_sanity_checks=False):
    """
    Build every genotype that can be assembled from a mutations list.

    Parameters
    ----------
    mutations : list
        List of lists containing allowable mutations at each site.
    skip_sanity_checks : bool
        if True, do not validate `mutations` before using it
        (default: False)

    Returns
    -------
    genotype : list
        one string per combination of states, taking one state from each
        site, in the order generated by itertools.product
    """

    if not skip_sanity_checks:
        check.mutations_sanity(mutations)

    # Cartesian product over sites; each tuple of states is glued into a
    # single genotype string.
    return list(map("".join, itertools.product(*mutations)))
def genotype_is_in(genotypes,mutations,skip_sanity_checks=False):
    """
    Determine whether genotypes are within the volume covered by mutations
    list.

    Parameters
    ----------
    genotypes : str or list of str
        genotype(s) to check. A single str (or str subclass) is treated as
        one genotype.
    mutations : list
        List of lists containing allowable mutations at each site.
    skip_sanity_checks : bool
        whether or not to check sanity of inputs (default: false)

    Returns
    -------
    result : list of bool
        one entry per genotype, True if the genotype has the same length as
        the mutations list and every state is allowed at its site. A list is
        returned even when a single genotype is passed.

    Raises
    ------
    TypeError
        if an entry of genotypes cannot be measured or compared as a
        genotype (for example, an int).
    """

    # If a single genotype, convert to a list. isinstance rather than an
    # exact type comparison so that str subclasses are not iterated
    # character-by-character as if they were a list of genotypes.
    if isinstance(genotypes, str):
        genotypes = [genotypes]

    # Check sanity of mutations
    if not skip_sanity_checks:
        check.mutations_sanity(mutations)

    out = []
    for g in genotypes:

        try:
            # In only if the length matches and every state is allowed at
            # its site. The length test short-circuits the per-site check.
            is_in = len(g) == len(mutations) and all(
                state in allowed for state, allowed in zip(g, mutations)
            )
        except TypeError as e:
            err = f"Problem with genotype '{g}'\n"
            err += "genotype must be a single str (one genotype) or list of \n"
            err += "strings (multiple genotypes)\n"
            raise TypeError(err) from e

        out.append(is_in)

    return out
def get_missing_genotypes(genotype,mutations=None,skip_sanity_checks=False):
    """
    List the genotypes that the mutations list allows but that do not appear
    in `genotype`.

    Parameters
    ----------
    genotype : list
        List of genotypes.
    mutations : list
        Mutation list encoding possible states at each site. If this is not
        specified, it is generated from the states seen in genotype.
    skip_sanity_checks : bool
        if True, do not validate the inputs (default: False)

    Returns
    -------
    missing_genotype : list
        every possible genotype that is absent from `genotype`, in no
        particular order (the result comes from a set).
    """

    run_checks = not skip_sanity_checks

    # Input validation. The mutations list is only checked when the caller
    # supplied one.
    if run_checks:
        check.genotype_sanity(genotype)
    if run_checks and mutations is not None:
        check.mutations_sanity(mutations,genotype=genotype)

    # No mutations list given: derive one from the genotypes. Inputs were
    # either validated above or validation was explicitly skipped.
    if mutations is None:
        mutations = genotypes_to_mutations(genotype,
                                           skip_sanity_checks=True)

    # Everything that could exist ...
    possible = set(mutations_to_genotypes(mutations,skip_sanity_checks=True))

    # ... minus everything that does.
    observed = set(genotype)
    missing_genotype = possible - observed

    return list(missing_genotype)
def mutations(self,mutations):
    """
    Set the mutations list, rebuilding the map only when it actually changes.

    Parameters
    ----------
    mutations : list or None
        List of lists mapping sites to the possible states at each site. If
        None, the list is derived from the genotypes stored in the data
        frame, using the current wildtype.
    """

    # Nothing supplied: derive the list from the stored genotypes.
    if mutations is None:
        mutations = utils.genotypes_to_mutations(self._data.loc[:,"genotype"],
                                                 wildtype=self._wildtype)

    # Validate against the current wildtype and stored genotypes.
    check.mutations_sanity(mutations,self._wildtype,self._data.loc[:,"genotype"])

    # Compare site by site as sets, so a mere reordering of the states at a
    # site does not count as a change.
    changed = any(set(states) != set(self._mutations[site])
                  for site, states in enumerate(mutations))
    if not changed:
        return

    self._mutations = mutations
    self._rebuild_map()
def get_all_possible_genotypes(mutations,skip_sanity_checks=False):
    """
    Return every genotype that can be built from a mutations list.

    The genotypes come back in whatever order `mutations_to_genotypes`
    produces; sort them if a specific order is needed.

    Parameters
    ----------
    mutations : list
        Mutation list encoding possible states at each site.
    skip_sanity_checks : bool
        if True, do not validate `mutations` before using it
        (default: False)

    Returns
    -------
    genotype : list
        list of all genotypes possible given the mutations list
    """

    validate = not skip_sanity_checks
    if validate:
        check.mutations_sanity(mutations)

    # Validation (if requested) already happened above, so the helper is
    # told to skip its own check.
    genotype = mutations_to_genotypes(mutations,skip_sanity_checks=True)

    return genotype
def __init__(self,
             genotype=None,
             wildtype=None,
             mutations=None,
             site_labels=None,
             **kwargs):
    """
    Validate the inputs, store them, and build the main data frame.

    Parameters
    ----------
    genotype : array-like or None
        genotypes in the map. None (the default) is treated as an empty
        list.
    wildtype : str or None
        wildtype sequence. If None, the first entry of genotype is used.
    mutations : list or None
        list of lists of states allowed at each site. If None, it is built
        from genotype and wildtype.
    site_labels : list or None
        label for each site. If None, sequential integers starting at 0
        (one per wildtype position) are used.
    **kwargs :
        each keyword becomes a column in the data frame. The name must not
        already be an attribute of the object, must not start with '_' or
        '.', and the value must have the same length as genotype.

    Raises
    ------
    ValueError
        if neither a wildtype nor a non-empty genotype is given, or if a
        keyword argument fails one of the rules above.
    """

    # A None sentinel stands in for the empty default so that no mutable
    # list object is shared between calls.
    if genotype is None:
        genotype = []

    # If no wildtype is specified, use the first genotype as the wildtype
    # sequence.
    if wildtype is None:
        if len(genotype) > 0:
            wildtype = genotype[0]
        else:
            err = "You must specify at least the wildtype sequence or a \n"
            err += "a genotype array with at least one entry\n"
            raise ValueError(err)

    # Check sanity of wildtype
    check.wildtype_sanity(wildtype)

    # Check genotypes sanity
    check.genotype_sanity(genotype,wildtype=wildtype)

    # Construct mutations list if not specified
    if mutations is None:
        mutations = utils.genotypes_to_mutations(genotype,
                                                 wildtype=wildtype)

    # Check mutations sanity
    check.mutations_sanity(mutations,
                           wildtype=wildtype,
                           genotype=genotype)

    # Construct site_labels list if not specified
    if site_labels is None:
        site_labels = list(range(len(wildtype)))

    # Check site_labels sanity
    check.site_labels_sanity(site_labels,wildtype=wildtype)

    # Start loading in the data from above
    self._wildtype = wildtype
    self._mutations = mutations
    self._site_labels = site_labels

    # Construct the master data frame for the gpmap.
    self._data = pd.DataFrame({"genotype":genotype})

    # --------------------------------------------------------------------
    # kwargs; interpreted as columns in the dataframe

    for k in kwargs:

        # Make sure the keyword is not already an attribute of self. Only
        # the lookup sits inside the try, so the AttributeError handler
        # cannot mask anything else.
        try:
            self.__getattribute__(k)
        except AttributeError:
            pass
        else:
            err = f"keyword '{k}' is a reserved name. Please choose a\n"
            err += "different name for this column.\n"
            raise ValueError(err)

        # Make sure we're not going to collide with a private name we
        # cannot guarantee won't exist in the future...
        if k[0] in ["_","."]:
            err = "data keywords cannot start with '_' or '.'\n"
            raise ValueError(err)

        # Make sure the kwarg has the same length as the genotypes. A value
        # with no len() at all is rejected the same way as a wrong length.
        try:
            length_ok = len(kwargs[k]) == len(genotype)
        except TypeError:
            length_ok = False

        if not length_ok:
            err = "keyword arguments must specify data as an array-like\n"
            err += "data structure with the same length as the genotype\n"
            err += "array.\n"
            raise ValueError(err)

        # Passed quality control, load into main data frame
        self._data.loc[:,k] = kwargs[k]

    # Set all rows to "include" = True
    self._data.loc[:,"include"] = np.ones(len(genotype),dtype=bool)

    # Make set of current genotypes
    self._current_genotype = self._data.loc[:,"genotype"].copy()

    # Build encoding table and binary representation of map
    self._rebuild_map()
def test_check_mutations_sanity():
    """
    Exercise check.mutations_sanity with inputs it must reject and inputs it
    must accept.
    """

    def assert_rejected(*args, **kwargs):
        # The call must raise ValueError for the test to pass
        with pytest.raises(ValueError):
            check.mutations_sanity(*args, **kwargs)

    # Not a list of lists
    assert_rejected([5,5])

    # Dict, not indexable by 0
    assert_rejected({"1":["A"]})

    # Should fail because it's a dict, even if indexable properly
    assert_rejected({0:["A"],1:["B"]})

    # Empty list value
    assert_rejected([[],["A"]])

    # mismatch with wildtype
    assert_rejected([["A","B"]],wildtype="AA")

    # should not throw error
    check.mutations_sanity([["A","B"]])
    check.mutations_sanity([["A","B"]],wildtype="A")

    # Die because genotype has character ("C") that is not in the mutations
    assert_rejected([["A","B"]],wildtype="A",genotype=["C"])

    # Die because wildtype has character ("C") that is not in the mutations
    assert_rejected([["A","B"]],wildtype="C",genotype=["A"])

    # This should work
    check.mutations_sanity([["A","B"]],wildtype="A",genotype=["B"])
def get_encoding_table(wildtype, mutations, site_labels=None):
    """
    Build a lookup table (pandas.DataFrame) that gives every state in a
    mutations list a binary representation.

    Parameters
    ----------
    wildtype : str
        string with wildtype sequence
    mutations : list
        list of lists containing the possible states at each site. (Must be
        the same length as the wildtype sequence).
    site_labels : list or None
        list of labels for each site. If None, sequential integers starting
        at 0 are used. Otherwise the list is validated against wildtype with
        `check.site_labels_sanity`.

    Returns
    -------
    df : pandas.DataFrame
        one row per (site, state) pair; see Notes for the columns.

    Notes
    -----
    The table has the following columns:

    + genotype_index : position of the site in the genotype, counting from
      zero.
    + wildtype_letter : wildtype character at this position.
    + mutation_letter : character for the state described by this row. For
      the wildtype row this equals wildtype_letter.
    + binary_repr : binary representation of this state at this site. The
      wildtype is all zeros; each other state has a single "1". With two
      states at a site (wildtype and mutant) this is "0" and "1". With three
      states it is "00", "10", "01" for wildtype, mutant1 and mutant2. With
      wildtype and three mutants it is "000", "100", "010", "001". Every
      binary position is therefore a two-state site.
    + binary_index_start : start of the region of the full binary string
      that corresponds to this site.
    + binary_index_stop : end of the region of the full binary string that
      corresponds to this site (start plus the number of non-wildtype
      states).
    + mutation_index : counter, starting at 1, that gives each non-wildtype
      state a unique number. Missing for wildtype rows.
    + site_label : label for the site. For example, for the mutant A73G, A
      is the wildtype_letter, G is the mutation_letter, and 73 is the site
      label. If not given by the user, it is a number counting from zero.
    """

    # Validate inputs
    check.wildtype_sanity(wildtype)
    check.mutations_sanity(mutations,wildtype=wildtype)

    # Default labels are just the site positions
    if site_labels is None:
        site_labels = list(range(len(wildtype)))
    check.site_labels_sanity(site_labels,wildtype)

    rows = []
    next_mutation_index = 1
    binary_offset = 0

    for site, states in enumerate(mutations):

        site = int(site)
        wt_state = wildtype[site]

        # Work on a copy in which the wildtype state comes first, wherever it
        # sat in the caller's list.
        ordered = states[:]
        if ordered[0] != wt_state:
            ordered.remove(wt_state)
            ordered.insert(0,wt_state)

        # One binary position per non-wildtype state
        width = len(ordered) - 1
        binary_stop = binary_offset + width

        for rank, state in enumerate(ordered):

            if rank == 0:
                # Wildtype row: all zeros, no mutation index
                mutation_index = None
                bits = "0"*width
            else:
                # Mutant row: a single "1" at position rank - 1
                mutation_index = next_mutation_index
                next_mutation_index += 1
                bits = "0"*(rank - 1) + "1" + "0"*(width - rank)

            rows.append({
                "genotype_index": site,
                "wildtype_letter": wt_state,
                "mutation_letter": state,
                "binary_repr": bits,
                "binary_index_start": binary_offset,
                "binary_index_stop": binary_stop,
                "mutation_index": mutation_index,
                "site_label": site_labels[site],
            })

        # The next site starts where this one stopped
        binary_offset = binary_stop

    # Turn the rows into a DataFrame. The integer columns use the nullable
    # Int64 dtype.
    df = pd.DataFrame(rows)
    for column in ("genotype_index",
                   "mutation_index",
                   "binary_index_start",
                   "binary_index_stop"):
        df[column] = df[column].astype('Int64')

    return df