def test__genotypes_to_X(test_data):
    """
    _genotypes_to_X should reject malformed genotype lists and return a
    matrix with one row per genotype holding only the values permitted for
    the requested model type.
    """

    first = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(genotype=first["genotype"],
                                     phenotype=first["phenotype"])

    # Bad input 1: every genotype listed twice
    duplicated = list(gpm.genotype)
    duplicated = duplicated + duplicated

    # Bad input 2: a genotype that is not in the map
    unknown = list(gpm.genotype) + ["stupid"]

    for bad in (duplicated, unknown):
        with pytest.raises(ValueError):
            models.base._genotypes_to_X(bad, gpm, order=1, model_type="local")

    # Values each model type is allowed to place in the matrix
    allowed = {"local": {0, 1},
               "global": {-1, 1}}

    # Sweep every data set, every order and every model type
    for entry in test_data:
        gpm = gpmap.GenotypePhenotypeMap(genotype=entry["genotype"],
                                         phenotype=entry["phenotype"])

        for order in range(1, gpm.length + 1):
            for model_type, valid_values in allowed.items():
                X = models.base._genotypes_to_X(gpm.genotype,
                                                gpm,
                                                order=order,
                                                model_type=model_type)

                assert X.shape[0] == len(gpm.genotype)
                assert set(np.unique(X)).issubset(valid_values)
def test_synchronize(test_data):
    """
    synchronize() should rebuild graph nodes and edges from the attached
    map's data and neighbors tables, whichever side was edited.
    """

    # Should work even without gpm
    G = gpmap.GenotypePhenotypeGraph()
    assert G.synchronize() is None

    # ---- nodes ----
    for d in test_data:

        num_genotypes = len(d["genotype"])

        G = gpmap.GenotypePhenotypeGraph()
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"],
                                         phenotype=d["phenotype"])
        G.add_gpm(gpm)

        # Go through the DiGraph parent both to delete the node and to count
        # what is left. (The GenotypePhenotypeGraph.nodes property will call
        # sync_nodes implicitly, which would hide the deletion.)
        raw = super(gpmap.GenotypePhenotypeGraph, G)
        raw.remove_node(0)
        assert len(super(gpmap.GenotypePhenotypeGraph, G).nodes) == num_genotypes - 1

        # gpm.data always wins, so the node comes back
        G.synchronize()
        assert len(G.nodes) == num_genotypes

        # Drop the first row straight out of gpm.data ...
        keep_rows = np.arange(1, len(G.gpm.data), dtype=int)
        G.gpm._data = G.gpm.data.loc[keep_rows, :]

        # ... and the graph should now lose a node.
        G.synchronize()
        assert len(G.nodes) == num_genotypes - 1

    # ---- edges ----
    for d in test_data:

        G = gpmap.GenotypePhenotypeGraph()
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"],
                                         phenotype=d["phenotype"])
        G.add_gpm(gpm)

        num_neighbors = len(G.gpm.neighbors)

        # Same trick as above, this time removing an edge via the DiGraph
        # parent and counting with the parent's edges property.
        first_edge = list(G.edges)[0]
        super(gpmap.GenotypePhenotypeGraph, G).remove_edge(*first_edge)
        assert len(super(gpmap.GenotypePhenotypeGraph, G).edges) == num_neighbors - 1

        # gpm.neighbors always wins, so the edge comes back
        G.synchronize()
        assert len(G.edges) == num_neighbors

        # Drop the last row straight out of gpm.neighbors ...
        keep_rows = np.arange(len(G.gpm.neighbors) - 1, dtype=int)
        G.gpm._neighbors = G.gpm.neighbors.loc[keep_rows, :]

        # ... and the graph should now lose an edge.
        G.synchronize()
        assert len(G.edges) == num_neighbors - 1
def test_constructor(test_data):
    """
    Check defaults of a bare GenotypePhenotypeGraph and the validation the
    constructor applies to its gpm and edge_weight_column arguments.
    """

    G = base.GenotypePhenotypeGraph()
    assert isinstance(G, nx.DiGraph)
    assert G.gpm is None

    # All four option dictionaries exist and are populated
    option_dicts = (G.node_options,
                    G.edge_options,
                    G.node_label_options,
                    G.edge_label_options)
    for options in option_dicts:
        assert type(options) is dict
        assert len(options) > 0

    # Keys that must not be present in node_options
    for absent_key in ("nodelist", "edgelist"):
        with pytest.raises(KeyError):
            G.node_options[absent_key]

    assert G.node_options["node_size"] == G._default_node_size
    assert G.edge_options["node_size"] == G._default_node_size
    assert G.edge_options["arrows"] == False

    # Test bad gpm inputs
    for b in ["stupid", 1, [], (1,), 1.1]:
        with pytest.raises(TypeError):
            G = base.GenotypePhenotypeGraph(gpm=b)

    # Send in good genotype phenotype map. does not have weights.
    d = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
    G = base.GenotypePhenotypeGraph(gpm=gpm)
    G.gpm.data
    G.gpm.neighbors

    # Pass bad edge weight columns
    for bad_column in ("not_good", 1.1):
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
        with pytest.raises(ValueError):
            G = base.GenotypePhenotypeGraph(gpm=gpm, edge_weight_column=bad_column)

    # Should work and have weighted edges
    gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
    G = base.GenotypePhenotypeGraph(gpm=gpm)
    for edge in G.edges:
        G.edges[edge[0], edge[1]]["weight"]
def test_nodes_getter(test_data):
    """
    The nodes getter should always mirror gpm.data, regardless of whether
    the graph or the data frame was edited behind its back.
    """

    # Should work even without gpm
    G = gpmap.GenotypePhenotypeGraph()
    assert len(G.nodes) == 0

    for d in test_data:

        num_genotypes = len(d["genotype"])

        G = gpmap.GenotypePhenotypeGraph()
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"],
                                         phenotype=d["phenotype"])
        G.add_gpm(gpm)

        # Removing a node through the DiGraph parent must not stick: the
        # nodes getter re-syncs against gpm.data.
        super(gpmap.GenotypePhenotypeGraph, G).remove_node(0)
        assert len(G.nodes) == num_genotypes

        # Drop the last genotype directly from gpm.data (keep a copy so it
        # can be restored below).
        saved_data = G.gpm._data.copy()
        keep_rows = np.arange(len(G.gpm.data) - 1, dtype=int)
        G.gpm._data = G.gpm.data.loc[keep_rows, :]

        # Should now lose a node.
        assert len(G.nodes) == num_genotypes - 1

        # Put the row back via the data frame. nodes should update.
        G.gpm._data = saved_data
        assert len(G.nodes) == num_genotypes
def test_add_remove_node_cmap(test_data):
    """
    Test both add_node_cmap and remove_node_cmap methods.
    """

    def _check_node_color(graph, expected_vmin, expected_vmax):
        # node_color should be ("_gpm", column, cmap object, vmin, vmax)
        node_options = copy.deepcopy(graph.node_options)
        assert type(node_options["node_color"]) is tuple
        assert node_options["node_color"][0] == "_gpm"
        assert node_options["node_color"][1] == "phenotype"
        assert type(node_options["node_color"][2]) is type(matplotlib.cm.get_cmap("plasma"))
        assert node_options["node_color"][3] == expected_vmin
        assert node_options["node_color"][4] == expected_vmax

    # No gpm loaded yet, so there is nothing to map colors onto
    G = gpmap.GenotypePhenotypeGraph()
    with pytest.raises(RuntimeError):
        G.add_node_cmap("test")

    for d in test_data:

        G = gpmap.GenotypePhenotypeGraph()
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"],
                                         phenotype=d["phenotype"])
        G.add_gpm(gpm)

        # Unknown column and unknown colormap name are both rejected
        with pytest.raises(KeyError):
            G.add_node_cmap(data_column="not_a_column")
        with pytest.raises(ValueError):
            G.add_node_cmap(data_column="phenotype", cmap="not_a_cmap")

        # cmap given by name
        G.add_node_cmap(data_column="phenotype", cmap="plasma")
        _check_node_color(G, np.min(d["phenotype"]), np.max(d["phenotype"]))

        # Now pass in cmap as cm object
        G = gpmap.GenotypePhenotypeGraph()
        G.add_gpm(gpm)
        cmap = matplotlib.cm.get_cmap("plasma")
        G.add_node_cmap(data_column="phenotype", cmap=cmap)
        _check_node_color(G, np.min(d["phenotype"]), np.max(d["phenotype"]))

        # Explicit vmin/vmax override the data range
        G.add_node_cmap(data_column="phenotype", cmap=cmap, vmin=5, vmax=10)
        _check_node_color(G, 5, 10)

        # Removing the cmap strips its keys and restores the flat color
        G.remove_node_cmap()
        for removed_key in ["vmin", "vmax", "cmap"]:
            with pytest.raises(KeyError):
                G.node_options[removed_key]

        assert G.node_options["node_color"] == "gray"
def test_set_edge_label_options(test_data):
    """
    set_edge_label_options should accept every existing option key, one at
    a time or all together, and reject unknown keys.
    """

    G = base.GenotypePhenotypeGraph()

    # Snapshot of the options available on a fresh graph
    defaults = copy.deepcopy(G.edge_label_options)

    # Set individually to 1.
    for key in defaults:
        G.set_edge_label_options(**{key: 1})
        assert G.edge_label_options[key] == 1

    # Set all at once and make sure they match
    G.set_edge_label_options(**defaults)
    for key, value in defaults.items():
        assert G.edge_label_options[key] == value

    # pass bad options
    for bad_key in ["not_a_key"]:
        with pytest.raises(KeyError):
            G.set_edge_label_options(**{bad_key: 1})

    # test exotic edge_label setting
    # (only the map construction is exercised here)
    d = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
def test_encoding_to_sites(test_data):
    """
    encoding_to_sites should return one entry per interaction term: the sum
    of (L choose k) for k = 1..order, plus one when starting at order 0.
    """

    d = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"],
                                     phenotype=d["phenotype"])
    gpm.encoding_table

    # Genotype length fixes both the orders to try and the binomials
    L = len(d["genotype"][0])

    for start_order in [0, 1]:
        for order in range(1, L + 1):

            sites = epistasis.mapping.encoding_to_sites(
                order=order,
                encoding_table=gpm.encoding_table,
                start_order=start_order)

            # n choose k for each order up to this one ...
            num_terms = 0
            for k in range(1, order + 1):
                num_terms += scipy.special.comb(L, k)

            # ... plus the intercept term when we start at order 0.
            if start_order == 0:
                num_terms += 1

            assert len(sites) == num_terms
def test__thetas_arghandler(test_data):
    """
    _thetas should fail before a fit, return the fitted values afterwards,
    reject malformed arguments and pass a valid list through unchanged.
    """

    d = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"],
                                     coolness=d["phenotype"],
                                     uncertainty=d["phenotype"])

    m = models.linear.EpistasisLinearRegression()
    m.add_gpm(gpm,
              phenotype_column="coolness",
              uncertainty_column="uncertainty")

    # No thetas calculated yet
    with pytest.raises(RuntimeError):
        m._thetas()

    # After fitting, the calculated thetas come back
    m.fit()
    fitted = m._thetas()
    assert len(fitted) == 4

    # pass in general badness
    for b in [np.ones((1, 1, 1)), [], "stupid", 1, 1.1, ()]:
        with pytest.raises(TypeError):
            print(f"trying {b}")
            m._thetas(b)

    # A sane list is returned as given
    passed_through = m._thetas([1.0])
    assert np.array_equal(passed_through, [1.0])
def test__lnprior(test_data):
    """
    _lnprior should fail without a gpm, default to zeros (one per genotype)
    once a gpm is loaded, reject malformed arguments and pass a valid list
    through unchanged.
    """

    m = models.linear.EpistasisLinearRegression()

    # Nothing loaded yet
    with pytest.raises(ValueError):
        m._lnprior()

    d = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"],
                                     coolness=d["phenotype"],
                                     uncertainty=d["phenotype"])
    m.add_gpm(gpm,
              phenotype_column="coolness",
              uncertainty_column="uncertainty")

    # Default prior: zero for every genotype
    default_prior = m._lnprior()
    assert np.array_equal(default_prior, np.zeros(len(d["genotype"])))

    # pass in general badness
    for b in [np.ones((1, 1, 1)), [], "stupid", 1, 1.1, ()]:
        with pytest.raises(TypeError):
            print(f"trying {b}")
            m._lnprior(b)

    # A sane list is returned as given
    passed_through = m._lnprior([1.0])
    assert np.array_equal(passed_through, [1.0])
def test_abstractmodel_predict_to_df(test_data):
    """
    Test basic functionality. Real test of values will be done on .predict
    for subclasses.
    """

    d = test_data[0]

    m = models.linear.EpistasisLinearRegression()
    gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"],
                                     phenotype=d["phenotype"])
    m.add_gpm(gpm)

    # This should fail -- no fit run
    with pytest.raises(Exception):
        df = m.predict_to_df()

    # This should work
    m.fit()
    df = m.predict_to_df()
    assert type(df) is type(pd.DataFrame())
    assert len(df) == len(d["genotype"])

    # Start over with a brand new model and map.
    m = models.linear.EpistasisLinearRegression()
    gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"],
                                     phenotype=d["phenotype"])

    # No gpm added -- should fail
    with pytest.raises(RuntimeError):
        m.predict_to_df()

    m.add_gpm(gpm)
    m.fit()

    # A single genotype gives a single row
    df = m.predict_to_df(genotypes=d["genotype"][0])
    assert len(df) == 1

    # Things that are not genotypes in the map
    for b in [1, {}, [1, 2], "STUPID", ["STUPID", "IS", "REAL"]]:
        with pytest.raises(ValueError):
            print(f"Trying bad genotypes {b}")
            m.predict_to_df(genotypes=b)

    # A slice of three genotypes gives three rows
    df = m.predict_to_df(genotypes=d["genotype"][:3])
    assert len(df) == 3
def test_gpm_getter():
    """
    Graph.gpm is None until a map is attached, then returns that exact map.
    """

    graph = base.GenotypePhenotypeGraph()
    assert graph.gpm is None

    attached = gpmap.GenotypePhenotypeMap(["AA"])
    graph.add_gpm(attached)
    assert graph.gpm is attached
def test_edge_weight_column_getter():
    """
    The edge_weight_column getter should report whatever was last assigned,
    whether through the setter or through the constructor argument.
    """

    gpm = gpmap.GenotypePhenotypeMap(genotype=["00", "11"])
    G = gpmap.GenotypePhenotypeGraph(gpm)

    # Round trip through the setter
    G.edge_weight_column = "weight"
    assert G.edge_weight_column == "weight"

    # Clearing the column must give back the None singleton itself; an
    # identity check is used so an object that merely compares equal to
    # None cannot satisfy the test.
    G.edge_weight_column = None
    assert G.edge_weight_column is None

    # Column supplied through the constructor. The neighbors table has to
    # exist (get_neighbors) before a custom column can be added to it.
    gpm = gpmap.GenotypePhenotypeMap(genotype=["00", "01"])
    gpm.get_neighbors()
    gpm.neighbors.loc[:, "test"] = np.ones(len(gpm.neighbors))
    G = gpmap.GenotypePhenotypeGraph(gpm, edge_weight_column="test")
    assert G.edge_weight_column == "test"
def test__X_arghandler(test_data):
    """
    Exercise _X(): caching of the default matrix in _previous_X, building
    matrices for explicit genotypes, array pass-through and rejection of
    bad input.
    """

    m = models.linear.EpistasisLinearRegression()

    # No gpm loaded yet
    with pytest.raises(ValueError):
        m._X()

    d = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"],
                                     phenotype=d["phenotype"],
                                     uncertainty=d["phenotype"])
    m.add_gpm(gpm)

    # Make sure calling _X() naked-ly populates previous_X
    assert m._previous_X is None
    cached = m._X()
    assert m._previous_X is cached

    # If we access after having run, make sure X is the same object
    assert cached is m._X()

    # Should wipe out previous_X and force recalculation.
    m.add_gpm(gpm)
    assert cached is not m._X()

    # Explicit genotypes -- a single genotype, a list of two and an array
    # of two -- should all work and should not update _previous_X.
    explicit_requests = [(d["genotype"][0], 1),
                         (d["genotype"][0:2], 2),
                         (np.array(d["genotype"][0:2]), 2)]
    for request, expected_rows in explicit_requests:
        X = m._X(request)
        assert len(X) == expected_rows
        assert X is not m._previous_X

    # Just keep the array, do not update previous_X
    hack = np.ones((1, 1))
    X = m._X(data=hack)
    assert X is hack
    assert X is not m._previous_X

    # pass in bad genotypes
    with pytest.raises(ValueError):
        X = m._X("NOT_A_GENOTYPE")
    with pytest.raises(ValueError):
        X = m._X([d["genotype"][0], "NOT_A_GENOTYPE"])

    # pass in general badness
    for b in [np.ones((1, 1, 1)), [], "stupid", 1, 1.1, ()]:
        with pytest.raises(ValueError):
            m._X(b)
def test_gpm_getter(test_data):
    """
    Model.gpm is None until a map is added, then returns that exact map.
    """

    first = test_data[0]
    attached = gpmap.GenotypePhenotypeMap(genotype=first["genotype"],
                                          phenotype=first["phenotype"])

    model = models.linear.EpistasisLinearRegression()
    assert model.gpm is None

    model.add_gpm(attached)
    assert model.gpm is attached
def test__encode_vectors(test_data):
    """
    _encode_vectors should return an ndarray with one row per binary
    genotype and one extra leading column, matching the reference encoding
    stored in the test data, and should reject unknown model types.
    """

    for d in test_data:
        for model_type in ["global", "local"]:

            gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"])
            encoded = mm._encode_vectors(gpm.binary, model_type=model_type)

            # Make sure output is what is expected
            num_genotypes = len(gpm.binary)
            num_columns = len(gpm.binary[0]) + 1
            assert type(encoded) is np.ndarray
            assert encoded.shape[0] == num_genotypes
            assert encoded.shape[1] == num_columns
            assert np.array_equal(d[f"{model_type}_encoding"], encoded)

    # Send bad model type (uses the last data set from the loop above)
    gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"])
    with pytest.raises(ValueError):
        encoded = mm._encode_vectors(gpm.binary, model_type="stupid")
def test_results_getter(test_data):
    """
    Model.results is None before fitting and a DataFrame afterwards.
    """

    first = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(genotype=first["genotype"],
                                     phenotype=first["phenotype"])

    model = models.linear.EpistasisLinearRegression()
    model.add_gpm(gpm)
    assert model.results is None

    model.fit()
    assert isinstance(model.results, pd.DataFrame)
def test_get_neighbors():
    """
    Check argument validation of the cython get_neighbors and compare the
    edges it returns against hand-built edge lists for a four-genotype map.
    """

    gpm = gpmap.GenotypePhenotypeMap(genotype=["SG", "PF", "SF", "PG"])
    genotype = gpm._data.loc[:, "genotype"]

    # Must pass genotype
    with pytest.raises(TypeError):
        get_neighbors_cython.get_neighbors()

    # should run
    get_neighbors_cython.get_neighbors(genotype)

    # check neighbor function sanity
    with pytest.raises(ValueError):
        get_neighbors_cython.get_neighbors(genotype,
                                           neighbor_function="not_real")

    # check cutoff sanity
    for bad_cutoff in [-2, "stupid", [], None]:
        with pytest.raises(ValueError):
            get_neighbors_cython.get_neighbors(genotype, cutoff=bad_cutoff)

    # (neighbor function, cutoff, expected edges in returned order)
    cases = [
        # Hamming distance of one
        ("hamming", 1,
         [(0, 0), (0, 2), (2, 0), (0, 3), (3, 0),
          (1, 1), (1, 2), (2, 1), (1, 3), (3, 1),
          (2, 2),
          (3, 3)]),
        # Hamming distance of two
        ("hamming", 2,
         [(0, 0), (0, 1), (1, 0), (0, 2), (2, 0), (0, 3), (3, 0),
          (1, 1), (1, 2), (2, 1), (1, 3), (3, 1),
          (2, 2), (2, 3), (3, 2),
          (3, 3)]),
        # amino acid (codon) distance of one
        ("codon", 1,
         [(0, 0), (0, 3), (3, 0),
          (1, 1), (1, 2), (2, 1),
          (2, 2),
          (3, 3)]),
    ]

    for neighbor_function, cutoff, expected_edges in cases:
        source, target = get_neighbors_cython.get_neighbors(genotype,
                                                            neighbor_function,
                                                            cutoff=cutoff)
        for idx in range(len(source)):
            observed = (source[idx], target[idx])
            assert np.array_equal(expected_edges[idx], observed)
def test_edges_getter(test_data):
    """
    The edges getter should always mirror gpm.neighbors, and edge weights
    should appear and disappear with edge_weight_column.
    """

    # Should work even without gpm
    G = gpmap.GenotypePhenotypeGraph()
    assert len(G.edges) == 0

    for d in test_data:

        G = gpmap.GenotypePhenotypeGraph()
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"],
                                         phenotype=d["phenotype"])
        G.add_gpm(gpm)

        # Keep a pristine copy of the neighbors table for restoring later
        saved_neighbors = gpm.neighbors.copy()
        num_edges = len(G.gpm.neighbors)

        # Removing an edge through the DiGraph parent must not stick: the
        # edges getter re-syncs against gpm.neighbors.
        raw_edges = list(super(gpmap.GenotypePhenotypeGraph, G).edges)
        super(gpmap.GenotypePhenotypeGraph, G).remove_edge(*raw_edges[0])
        assert len(G.edges) == num_edges

        # Drop the last row directly from gpm.neighbors
        keep_rows = np.arange(len(G.gpm.neighbors) - 1, dtype=int)
        G.gpm._neighbors = G.gpm.neighbors.loc[keep_rows, :]

        # Should now lose an edge.
        assert len(G.edges) == num_edges - 1

        # Restore the full neighbors table, which brings the edge back
        G.gpm._neighbors = saved_neighbors.copy()
        assert len(G.edges) == num_edges

        # Set weight
        G.edge_weight_column = "weight"
        for edge in G.edges:
            assert G.edges[edge[0], edge[1]]["weight"] == 1

        # Remove weight
        G.edge_weight_column = None
        for edge in G.edges:
            with pytest.raises(KeyError):
                G.edges[edge[0], edge[1]]["weight"]
def test_column_getters(test_data):
    """
    The genotype/phenotype/uncertainty column getters are None on a fresh
    model and report the columns in use once a gpm has been added.
    """

    first = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(genotype=first["genotype"],
                                     phenotype=first["phenotype"],
                                     uncertainty=first["phenotype"])

    model = models.linear.EpistasisLinearRegression()

    # Nothing loaded: every getter is empty
    assert model.genotype_column is None
    assert model.phenotype_column is None
    assert model.uncertainty_column is None

    model.add_gpm(gpm, uncertainty_column="uncertainty")

    # Loaded: getters name the columns
    assert model.genotype_column == "genotype"
    assert model.phenotype_column == "phenotype"
    assert model.uncertainty_column == "uncertainty"
def test_abstractmodel_predict_to_excel(test_data, tmp_path):
    """
    predict_to_excel should write a spreadsheet with one row per predicted
    genotype, for the whole map or for an explicit genotype.
    """

    d = test_data[0]
    gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"],
                                     phenotype=d["phenotype"])

    m = models.linear.EpistasisLinearRegression()
    m.add_gpm(gpm)
    m.fit()

    excel_file = os.path.join(tmp_path, "tmp.xlsx")

    # Whole map
    m.predict_to_excel(filename=excel_file)
    assert os.path.exists(excel_file)
    written = pd.read_excel(excel_file)
    assert len(written) == len(d["genotype"])

    # Make sure genotypes pass works
    m.predict_to_excel(filename=excel_file, genotypes=d["genotype"][0])
    assert os.path.exists(excel_file)
    written = pd.read_excel(excel_file)
    assert len(written) == 1
def test_add_remove_edge_labels(test_data):
    """
    Test both add_edge_labels and remove_edge_labels methods.
    """

    # Throw error because no gpmap
    G = gpmap.GenotypePhenotypeGraph()
    with pytest.raises(RuntimeError):
        G.add_edge_labels("test")

    for d in test_data:

        G = gpmap.GenotypePhenotypeGraph()
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"],
                                         phenotype=d["phenotype"])
        G.add_gpm(gpm)
        assert G.gpm is gpm

        # Throw error data column is not real
        with pytest.raises(KeyError):
            G.add_edge_labels(data_column="not_a_column")

        # This should work
        G.add_edge_labels(data_column="weight")

        # This should not work
        with pytest.raises(ValueError):
            G.add_edge_labels(data_column="weight", fmt="{:d}")

        # Default format is stored along with the column
        G.add_edge_labels(data_column="weight")
        stored = G.edge_label_options["edge_labels"]
        assert np.array_equal(("_gpm", "weight", "{:.3f}"), stored)

        # Removing the labels drops the key entirely
        G.remove_edge_labels()
        with pytest.raises(KeyError):
            G.edge_label_options["edge_labels"]

        # Make sure fmt pass works
        G.add_edge_labels(data_column="weight", fmt="{}")
        stored = G.edge_label_options["edge_labels"]
        assert np.array_equal(("_gpm", "weight", "{}"), stored)
def test_edge_weight_column_setter(test_data):
    """
    The edge_weight_column setter needs a gpm, rejects missing or
    unsuitable columns and writes weights onto edges for a valid column.
    """

    # One graph is reused for every data set; each add_gpm replaces the map
    G = gpmap.GenotypePhenotypeGraph()

    # No gpm loaded yet
    with pytest.raises(RuntimeError):
        G.edge_weight_column = "weight"

    # (expected exception, column that should trigger it)
    rejected = [(KeyError, "stupid"),
                (TypeError, "edge")]

    for d in test_data:

        gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
        G.add_gpm(gpm)

        for expected_error, column in rejected:
            with pytest.raises(expected_error):
                G.edge_weight_column = column

        # should work
        G.edge_weight_column = "weight"
        for edge in G.edges:
            print(edge)
            G.edges[edge[0], edge[1]]["weight"]
def test_get_model_matrix(test_data):
    """
    The cython and pure-python paths of get_model_matrix must produce the
    same matrix for every data set, model type and order.
    """

    for d in test_data:

        gpm = gpmap.GenotypePhenotypeMap(genotype=d["genotype"])

        for model_type in ["global", "local"]:
            for order in range(1, gpm.length + 1):

                # Get sites for this order
                sites = mapping.encoding_to_sites(order, gpm.encoding_table)

                # Cython result first, copied so the later call cannot
                # alias it ...
                cython_X = np.copy(m.get_model_matrix(gpm.binary,
                                                      sites,
                                                      model_type=model_type,
                                                      use_cython=True))

                # ... then the python result.
                python_X = m.get_model_matrix(gpm.binary,
                                              sites,
                                              model_type=model_type,
                                              use_cython=False)

                # Make sure python and cython give same answer
                assert np.array_equal(cython_X, python_X)
def test_pyplot_plot(test_data):
    """
    Exercise gpmap.plot: accepted input types, every combination of the
    plot_* switches, the four option dictionaries, figsize and ax.
    """

    d = test_data[0]

    def _check_plot_output(G, fig, ax):
        # plot() hands back (graph, figure, axes)
        assert isinstance(G, gpmap.GenotypePhenotypeGraph)
        assert isinstance(fig, matplotlib.figure.Figure)
        assert isinstance(ax, matplotlib.axes.Axes)

    # Feed in GenotypePhenotypeMap
    gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
    G, fig, ax = gpmap.plot(gpm)
    _check_plot_output(G, fig, ax)
    plt.close()

    # Feed in GenotypePhenotypeGraph. The graph that comes back should be
    # the very object that went in, still holding the same map. (This used
    # to compare G with itself, which can never fail.)
    gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
    G_in = gpmap.GenotypePhenotypeGraph()
    G_in.add_gpm(gpm)
    G, fig, ax = gpmap.plot(G_in)
    assert G is G_in
    assert gpm is G.gpm
    _check_plot_output(G, fig, ax)
    plt.close()

    # Feed in bad stuff
    bad_args = ["test", gpmap.GenotypePhenotypeGraph, (1, 23), 14]
    for b in bad_args:
        with pytest.raises(TypeError):
            gpmap.plot(b)

    # Feed in all combinations of plot modes: switch off every non-empty
    # subset of the four flags, leaving the rest on.
    bool_args = [
        "plot_nodes", "plot_edges", "plot_node_labels", "plot_edge_labels"
    ]
    for num_off in range(1, len(bool_args) + 1):
        for switched_off in itertools.combinations(bool_args, num_off):

            bool_kwargs = {k: True for k in bool_args}
            for k in switched_off:
                bool_kwargs[k] = False

            gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
            G, fig, ax = gpmap.plot(gpm, **bool_kwargs)
            _check_plot_output(G, fig, ax)
            plt.close()

    # Check passing node, edge, node label and edge label options. Each
    # keyword shares its name with the graph attribute it ends up in.
    option_cases = [("node_options", "node_size", 5),
                    ("edge_options", "style", "--"),
                    ("node_label_options", "font_size", 14),
                    ("edge_label_options", "font_size", 14)]
    for kwarg, key, value in option_cases:

        gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
        G, fig, ax = gpmap.plot(gpm, **{kwarg: {key: value}})
        plt.close()
        assert getattr(G, kwarg)[key] == value

        # Build the map outside the raises block so that only the plot
        # call itself can satisfy the expected TypeError.
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
        with pytest.raises(TypeError):
            gpmap.plot(gpm, **{kwarg: "not_right_type"})

    # Test figsize setting
    gpm = gpmap.GenotypePhenotypeMap(d["genotype"])
    G, fig, ax = gpmap.plot(gpm, figsize=(2, 2))
    assert np.array_equal(fig.get_size_inches(), (2, 2))
    plt.close()

    bad_fig_size = [(1, 2, 3), (1, ), "stupid", "RA", 5]
    for b in bad_fig_size:
        with pytest.raises(ValueError):
            gpmap.plot(gpm, figsize=b)

    # test ax pass
    G, fig, ax = gpmap.plot(gpm)
    G, fig, ax2 = gpmap.plot(gpm, ax=ax)
    assert ax is ax2

    with pytest.raises(TypeError):
        gpmap.plot(gpm, ax="stupid")

    # Release every figure still open, including any left behind by the
    # calls above that were expected to raise.
    plt.close("all")
def generate_gpm(wildtype=None,
                 mutations=None,
                 site_labels=None,
                 num_sites=5,
                 num_states_per_site=2,
                 alphabet=None,
                 max_genotypes=131072):
    r"""
    Generate a GenotypePhenotypeMap with various genotypes.

    Parameters
    ----------
    wildtype : string
        wildtype sequence. if not specified, construct arbitrary wildtype
        from mutations list.
    mutations : list-like
        If specified, this overrides num_sites and num_states_per_site. This
        can take two forms:

        1) Just like a typical GenotypePhenotypeMap call, this could be a
           list of lists, with one list per site. The internal lists hold
           the mutation alphabet for each site. For example,
           mutations=[["A","B"],["A"]] would create a map with states "A"
           and "B" at the first site and state "A" at the second site.
           NOTE: if wildtype is specified, its states must be in the
           mutations list. A wildtype sequence "BA" would be compatible with
           the mutation list above, but "BB" would not because there is no
           "B" in the second position.
        2) This could be a list of ints, where the ints indicate how many
           states to give each site. Thus, mutations=[2,1] would create a
           map with two states at the first site and one state at the second
           site.
    site_labels : array-like
        list of labels to apply to sites. If this is not specified, the
        first site is assigned a label 0, the next 1, etc. If specified,
        sites are assigned labels in the order given. For example, if the
        genotypes specify mutations at positions 12 and 75, this would be a
        list [12,75]. Passed through to GenotypePhenotypeMap unchecked.
    num_sites : int
        Number of sites to give each genotype. If mutations are given, this
        is ignored; if mutations are not given, this is required.
    num_states_per_site : int
        Number of states to assign each site in a genotype. If mutations are
        given, this is ignored; if mutations are not given, this is
        required. If num_states_per_site was 2 and num_sites was 5, this
        would generate a map with 5 sites in two possible state (2^5
        genotypes). To specify different numbers of states at each site, use
        the mutations argument.
    alphabet : str
        letters to use for generating genotypes. This is used for map
        construction unless mutations is a list of lists containing states,
        in which case this argument is ignored. This can have two forms:

        1) key for a pre-defined alphabet: 'aa':'ACDEFGHIKLMNPQRSTVWY',
           'dna':'ACGT','rna':'ACGU', or 'number':'0123456789'
        2) A string of unique letters (for example, "ACGTU" or "1XyzP")

        Defaults to the 'aa' alphabet.
    max_genotypes : int
        do not create a GenotypePhenotypeMap that has more than
        max_genotypes genotypes. (This check is in place to avoid
        accidentally constructing a truly massive combinatorial map.) If map
        will be too big, throws a RuntimeError. To disable this check, set
        to None. Default is 2^17 = 131072.

    Returns
    -------
    GenotypePhenotypeMap
        map built from every combination of the per-site states, with the
        given (or generated) wildtype and site_labels.

    Raises
    ------
    ValueError
        if alphabet, wildtype, mutations, num_sites, num_states_per_site or
        max_genotypes cannot be interpreted, or are mutually inconsistent.
    RuntimeError
        if the map would hold more than max_genotypes genotypes.
    """

    # ------------------------------------------------------------------------
    # Deal with alphabet. This will only be used if we have to construct a
    # mutations list, but has to be done first so we can process mutations
    # argument properly.
    # ------------------------------------------------------------------------

    if alphabet is None:
        alphabet = list(ALPHABETS["aa"])
    else:

        err = None

        # See if it's a known alphabet
        try:
            alphabet = list(ALPHABETS[alphabet])

        # Not a known alphabet (or not even hashable). Make sure we can turn
        # it into a list of single-character strings.
        except (TypeError, KeyError):
            try:
                alphabet = list(alphabet)
                for i, state in enumerate(alphabet):
                    alphabet[i] = f"{state}"
                    if len(alphabet[i]) > 1:
                        err = f"alphabet state {alphabet[i]} more than one letter\n"
                        break
            except (ValueError, TypeError):
                err = f"problem parsing alphabet {alphabet}\n"

        # Make sure the alphabet has unique letters.
        if err is None:
            if len(alphabet) != len(set(alphabet)):
                err = "alphabet contains non-unique letters\n"

        if err is not None:
            err += "\nalphabet can have two forms. 1) a string indicating a \n"
            err += "built in alphabet (aa, dna, rna, number); 2) a string of \n"
            err += "unique letters\n"
            raise ValueError(err)

    # ------------------------------------------------------------------------
    # Deal with wildtype entry. We need to make sure a generated mutations
    # list has wildtype entries or that a specified mutations entry has the
    # wildtype states.
    # ------------------------------------------------------------------------

    if wildtype is not None:
        try:
            wildtype = [f"{w}" for w in list(wildtype)]

            # Every state must be exactly one character. An empty wildtype
            # yields an empty set here, so it is rejected by the same test.
            if {len(w) for w in wildtype} != {1}:
                raise TypeError

        except (TypeError, ValueError):
            err = f"wildtype '{wildtype}' could not be interpreted as a list of\n"
            err += "single-character strings. len(wildtype) must be > 0.\n"
            raise ValueError(err)

    # ------------------------------------------------------------------------
    # If mutations is specified, do some sanity checking and/or construction.
    # ------------------------------------------------------------------------

    if mutations is not None:

        err = None

        # Make sure mutations has a length
        try:
            new_num_sites = len(mutations)
            new_mutations = [None for _ in range(new_num_sites)]
        except TypeError:
            err = f"mutations '{mutations}' is not iterable.\n"

        # Check for length match with wildtype
        if err is None:
            if wildtype is not None:
                if len(wildtype) != new_num_sites:
                    err = "mutations does not match length of specified wildtype\n"

        if err is None:
            if new_num_sites == 0:
                err = "mutations has no sites\n"

        # If we get here, so far so good...
        if err is None:

            # Now iterate over mutations.
            for i, site in enumerate(mutations):

                # Try to interpret the site as a collection of states. If it
                # cannot be iterated, fall through to the integer form below.
                try:
                    states_at_site = [f"{s}" for s in site]
                except (TypeError, ValueError):
                    states_at_site = None

                if states_at_site is not None:

                    if len(set(states_at_site)) != len(states_at_site):
                        err = f"site '{site}' has non-unique mutations\n"
                        break

                    if {len(s) for s in states_at_site} != {1}:
                        err = f"not all states at site '{site}' can be turned\n"
                        err += "into single-character strings\n"
                        break

                    # Make sure wildtype state is in the states at this site.
                    if wildtype is not None:
                        if wildtype[i] not in states_at_site:
                            err = f"wildtype state {wildtype[i]} not in mutations\n"
                            err += f"({states_at_site})\n"
                            break

                    # If we get here, mutations at site i passed quality
                    # control
                    new_mutations[i] = states_at_site
                    continue

                # Not a list of states -- maybe it's an integer specifying
                # the number of states at the site?
                try:
                    new_num_states = int(site)
                except (ValueError, TypeError):
                    err = f"could not interpret site '{site}' in mutations vector\n"
                    break

                # Is it too big of integer to specify states given alphabet?
                if new_num_states > len(alphabet):
                    err = f"site '{site}' requests more states than are \n"
                    err += f"in the specified alphabet '{alphabet}'\n"
                    break

                # Is it less than one?
                if new_num_states < 1:
                    err = f"site '{site}' not > 0\n"
                    break

                # Generate states for this site, making sure the wildtype
                # state is included (and not drawn twice) if wildtype was
                # specified.
                if wildtype is None:
                    site_states = []
                    local_alphabet = alphabet[:]
                else:
                    site_states = [wildtype[i]]
                    new_num_states = new_num_states - 1
                    local_alphabet = [a for a in alphabet if a != wildtype[i]]

                # If we need to add more than the wildtype state, draw the
                # rest from the alphabet
                if new_num_states > 0:
                    site_states.extend(
                        _sample_alphabet(local_alphabet, new_num_states))

                new_mutations[i] = site_states

        if err is not None:
            err += "\nmutations should be a list indicating the mutations to \n"
            err += "allow at each site. It can have two forms: a list of lists\n"
            err += "or a list of ints. If a list of lists, the length of the \n"
            err += "outer list determines the number of sites, while the inner\n"
            err += "lists indicate the states possible at each site. These \n"
            err += "states must be unique within the site and be able to be \n"
            err += "coerced into single-letter characters. If a list of ints,\n"
            err += "the list length determines the number of sites and the\n"
            err += "int indicates the number of states at that site. The\n"
            err += "states are selected randomly from the chosen alphabet.\n"
            raise ValueError(err)

        num_sites = new_num_sites
        mutations = new_mutations

    # ------------------------------------------------------------------------
    # If no mutations are specified, build it from num_sites and
    # num_states_per_site. Pull in wildtype states if that was specified.
    # ------------------------------------------------------------------------

    else:

        # Make sure number of sites is sane. int() raises ValueError for
        # non-numeric strings and TypeError for non-numbers; report both
        # with the same message.
        try:
            num_sites = int(num_sites)
            if num_sites < 1:
                raise TypeError
        except (TypeError, ValueError):
            err = "num_sites should be an integer > 0\n"
            raise ValueError(err)

        # Make sure wildtype and num_sites have the same length
        if wildtype is not None:
            if len(wildtype) != num_sites:
                err = f"wildtype '{wildtype}' is not the same length as \n"
                err += f"num_sites '{num_sites}'\n"
                raise ValueError(err)

        # Make sure the number of states per site is sane
        try:
            num_states_per_site = int(num_states_per_site)
            if num_states_per_site < 1:
                raise TypeError
            if num_states_per_site > len(alphabet):
                raise TypeError
        except (TypeError, ValueError):
            err = "num_states_per_site should be an integer > 0 and\n"
            err += f"<= the alphabet size. Current alphabet: {alphabet}\n"
            raise ValueError(err)

        mutations = []
        for i in range(num_sites):

            # Make sure the site has the wildtype state if specified
            if wildtype is None:
                site_states = []
                local_alphabet = alphabet[:]
                num_states = num_states_per_site
            else:
                site_states = [wildtype[i]]
                local_alphabet = [a for a in alphabet if a != wildtype[i]]
                num_states = num_states_per_site - 1

            # If we need to add more than the wildtype state, draw the rest
            # from the alphabet
            if num_states > 0:
                site_states.extend(
                    _sample_alphabet(local_alphabet, num_states))

            mutations.append(site_states)

    # ------------------------------------------------------------------------
    # Wildtype, again
    # ------------------------------------------------------------------------

    # If we still don't have wildtype, create it from the mutations list
    if wildtype is None:
        wildtype = [m[0] for m in mutations]
    wildtype = "".join(wildtype)

    # Note, for site labels, let GenotypePhenotypeMap object check sanity.

    # ------------------------------------------------------------------------
    # Do last sanity check on size. Avoid a massive combinatorial explosion...
    # ------------------------------------------------------------------------

    # Multiply with Python ints: they cannot overflow, so a huge map can
    # never wrap around to a small number and slip past the check below.
    map_size = 1
    for site_states in mutations:
        map_size *= len(site_states)

    # max_genotypes of None disables the check entirely
    if max_genotypes is not None:

        try:
            max_genotypes = int(max_genotypes)
        except (TypeError, ValueError):
            err = "max_genotypes should be an integer or None\n"
            raise ValueError(err)

        if map_size > max_genotypes:
            err = "This command will generate a genotype phenotype map with more\n"
            err += f"{map_size} genotypes. This is larger than max_genotypes\n"
            err += f"({max_genotypes}). To generate this map, either choose\n"
            err += "parameters that will lead to a smaller map or increase\n"
            err += "max_genotypes. To disable this check, set max_genotypes to None.\n"
            raise RuntimeError(err)

    # Generate vector of genotypes from mutations list
    genotype = gpmap.utils.mutations_to_genotypes(mutations)

    # Generate and return the GenotypePhenotypeMap.
    return gpmap.GenotypePhenotypeMap(genotype,
                                      wildtype=wildtype,
                                      site_labels=site_labels)
def test_abstractmodel_add_gpm(test_data):
    """
    Exercise add_gpm on an EpistasisLinearRegression model.

    Covers rejection of values that are not genotype-phenotype maps and the
    genotype_column, phenotype_column and uncertainty_column arguments, then
    checks the attributes left on the model after a successful call.
    """

    # Only the first dataset of the fixture is used throughout this test.
    first = test_data[0]
    genotype = first["genotype"]
    phenotype = first["phenotype"]

    # ------------------------------------------------------------------------
    # gpm argument
    # ------------------------------------------------------------------------
    gp_map = gpmap.GenotypePhenotypeMap(genotype=genotype,
                                        phenotype=phenotype)
    model = models.linear.EpistasisLinearRegression()

    # Each of these must be rejected with a TypeError
    for not_a_map in (1, None, "test", [], {}):
        with pytest.raises(TypeError):
            model.add_gpm(not_a_map)

    # A real map is accepted
    model.add_gpm(gp_map)

    # ------------------------------------------------------------------------
    # genotype_column argument
    # ------------------------------------------------------------------------
    gp_map = gpmap.GenotypePhenotypeMap(genotype=genotype,
                                        phenotype=phenotype)
    model = models.linear.EpistasisLinearRegression()

    # Wrong types for the column name
    for b in [1, None, [], {}, (1, )]:
        with pytest.raises(TypeError):
            print(f"trying {b}")
            model.add_gpm(gp_map, genotype_column=b)

    # Right type, but no such column
    with pytest.raises(KeyError):
        model.add_gpm(gp_map, genotype_column="not_a_column")

    # Existing column is recorded on the model
    model.add_gpm(gp_map, genotype_column="genotype")
    assert model.genotype_column == "genotype"

    # ------------------------------------------------------------------------
    # phenotype_column argument
    # ------------------------------------------------------------------------
    model = models.linear.EpistasisLinearRegression()

    # Map without any float column: nothing to pick as phenotype
    gp_map = gpmap.GenotypePhenotypeMap(genotype=genotype)
    with pytest.raises(ValueError):
        model.add_gpm(gp_map)

    # Requested column name does not exist
    gp_map = gpmap.GenotypePhenotypeMap(genotype=genotype,
                                        phenotype=phenotype)
    with pytest.raises(KeyError):
        model.add_gpm(gp_map, phenotype_column="not_real")

    # Requested column exists but holds genotypes rather than numbers
    gp_map = gpmap.GenotypePhenotypeMap(genotype=genotype,
                                        phenotype=genotype)
    with pytest.raises(ValueError):
        model.add_gpm(gp_map, phenotype_column="phenotype")

    # With no explicit column, the first non-reserved float column is chosen
    gp_map = gpmap.GenotypePhenotypeMap(genotype=genotype,
                                        coolness=phenotype,
                                        something_else=phenotype)
    model.add_gpm(gp_map)
    assert model.phenotype_column == "coolness"

    # ------------------------------------------------------------------------
    # uncertainty_column argument
    # ------------------------------------------------------------------------

    # Default (None): a constant "epi_zero_uncertainty" column is expected,
    # equal to the minimum phenotype scaled by 1e-6.
    gp_map = gpmap.GenotypePhenotypeMap(genotype=genotype,
                                        phenotype=phenotype)
    model.add_gpm(gp_map)
    assert model.uncertainty_column == "epi_zero_uncertainty"

    zero_unc = np.array(model.gpm.data.loc[:, "epi_zero_uncertainty"])
    assert len(np.unique(zero_unc)) == 1

    min_phenotype = np.min(gp_map.data.loc[:, model.phenotype_column])
    assert np.isclose(zero_unc[0], min_phenotype * 1e-6)

    # Map with an extra float column and a non-float column to choose from
    gp_map = gpmap.GenotypePhenotypeMap(genotype=genotype,
                                        phenotype=phenotype,
                                        coolness=phenotype,
                                        not_float=genotype)

    # Uncertainty column may not be the phenotype column
    with pytest.raises(ValueError):
        model.add_gpm(gp_map, uncertainty_column="phenotype")

    # Uncertainty column must exist
    with pytest.raises(KeyError):
        model.add_gpm(gp_map, uncertainty_column="not_there")

    # Uncertainty column must be float
    with pytest.raises(ValueError):
        model.add_gpm(gp_map, uncertainty_column="not_float")

    # A distinct float column should work
    model.add_gpm(gp_map, uncertainty_column="coolness")
    assert model.uncertainty_column == "coolness"

    # ------------------------------------------------------------------------
    # Final state of the model
    # ------------------------------------------------------------------------
    assert model.gpm is gp_map
    assert model.Xcolumns is not None
    assert model.epistasis is not None
    assert model._previous_X is None
def test_add_remove_edge_sizemap(test_data):
    """
    Check GenotypePhenotypeGraph.add_edge_sizemap and remove_edge_sizemap:
    argument validation, default limits, explicit limits, and removal.
    """

    # A graph with no map attached cannot take a size map
    empty_graph = gpmap.GenotypePhenotypeGraph()
    with pytest.raises(RuntimeError):
        empty_graph.add_edge_sizemap("test")

    for dataset in test_data:

        # Fresh graph with a map attached
        graph = gpmap.GenotypePhenotypeGraph()
        gp_map = gpmap.GenotypePhenotypeMap(dataset["genotype"])
        graph.add_gpm(gp_map)
        assert graph.gpm is gp_map

        # Unknown data column
        with pytest.raises(KeyError):
            graph.add_edge_sizemap(data_column="not_a_column")

        # vmin larger than vmax
        with pytest.raises(ValueError):
            graph.add_edge_sizemap(data_column="weight", vmin=1000, vmax=10)

        # Negative size
        with pytest.raises(ValueError):
            graph.add_edge_sizemap(data_column="weight", size_min=-2)

        # size_max smaller than size_min
        with pytest.raises(ValueError):
            graph.add_edge_sizemap(data_column="weight",
                                   size_max=2,
                                   size_min=10)

        # Valid call on a freshly added random column
        n_neighbors = len(gp_map.neighbors)
        graph.gpm.neighbors.loc[:, "test"] = np.random.random(n_neighbors)
        graph.add_edge_sizemap(data_column="test")

        # Defaults: limits span the column; sizes are 0.1 and 20
        test_column = graph.gpm.neighbors.loc[:, "test"]
        expected = ("_gpm",
                    "test",
                    np.min(test_column),
                    np.max(test_column),
                    0.1,
                    20)
        assert np.array_equal(expected, graph.edge_options["width"])

        # Removing the size map restores the default edge width
        graph.remove_edge_sizemap()
        assert graph.edge_options["width"] == graph._default_edge_width

        # Explicit vmin, vmax, size_min and size_max should be stored as
        # given, both for a constant column and for one with mixed signs.
        edge_values = [np.ones(len(gp_map.neighbors)),
                       np.random.random(len(gp_map.neighbors)) - 0.5]
        for values in edge_values:

            graph = gpmap.GenotypePhenotypeGraph()
            gp_map = gpmap.GenotypePhenotypeMap(dataset["genotype"])
            graph.add_gpm(gp_map)
            graph.gpm.neighbors.loc[:, "test"] = values

            vmin = np.random.random()
            vmax = vmin + 10
            sizemin = np.random.random()
            sizemax = sizemin + 10

            graph.add_edge_sizemap(data_column="test",
                                   vmin=vmin,
                                   vmax=vmax,
                                   size_min=sizemin,
                                   size_max=sizemax)

            expected = ("_gpm", "test", vmin, vmax, sizemin, sizemax)
            assert np.array_equal(expected, graph.edge_options["width"])
def test_add_remove_edge_cmap(test_data):
    """
    Test both add_edge_cmap and remove_edge_cmap methods.

    For every dataset in the fixture this checks argument validation, that
    a colormap may be given by name or as a colormap object, that the
    stored "edge_color" option carries the expected column and limits, and
    that remove_edge_cmap drops the colormap-related options.
    """

    def _plasma():
        # matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed
        # in 3.9. Use the colormap registry when it is available and fall
        # back to the legacy call on older releases.
        registry = getattr(matplotlib, "colormaps", None)
        if registry is not None:
            return registry["plasma"]
        return matplotlib.cm.get_cmap("plasma")

    def _check_edge_color(graph, column, vmin, vmax):
        # Work on a deep copy, as the assertions should not be able to
        # disturb the options held by the graph.
        options = copy.deepcopy(graph.edge_options)
        edge_color = options["edge_color"]
        assert type(edge_color) is tuple
        assert edge_color[0] == "_gpm"
        assert edge_color[1] == column
        assert type(edge_color[2]) is type(_plasma())
        assert edge_color[3] == vmin
        assert edge_color[4] == vmax

    # A graph with no map attached cannot take a colormap
    G = gpmap.GenotypePhenotypeGraph()
    with pytest.raises(RuntimeError):
        G.add_edge_cmap("test")

    for d in test_data:

        # Test basic construction/error checking
        G = gpmap.GenotypePhenotypeGraph()
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"],
                                         phenotype=d["phenotype"])
        G.add_gpm(gpm)

        with pytest.raises(KeyError):
            G.add_edge_cmap(data_column="not_a_column")

        with pytest.raises(ValueError):
            G.add_edge_cmap(data_column="weight", cmap="not_a_cmap")

        G.add_edge_cmap(data_column="weight", cmap="plasma")

        # Now pass in cmap as a colormap object
        G = gpmap.GenotypePhenotypeGraph()
        G.add_gpm(gpm)
        cmap = _plasma()
        G.add_edge_cmap(data_column="weight", cmap=cmap)

        # Both limits are expected to be 1 for the "weight" column. (The
        # upper limit was previously compared against np.min([1]); the
        # value is the same, but it is the maximum that is being checked.)
        _check_edge_color(G, "weight", 1, 1)

        # Now pass in more interesting edge values
        G = gpmap.GenotypePhenotypeGraph()
        gpm = gpmap.GenotypePhenotypeMap(d["genotype"],
                                         phenotype=d["phenotype"])
        gpm.get_neighbors()
        gpm.neighbors.loc[:, "flux"] = np.random.random(len(gpm.neighbors))
        G.add_gpm(gpm)

        # Without explicit limits, vmin/vmax should span the column
        G.add_edge_cmap(data_column="flux", cmap="plasma")
        _check_edge_color(G,
                          "flux",
                          np.min(gpm.neighbors.loc[:, "flux"]),
                          np.max(gpm.neighbors.loc[:, "flux"]))

        # Explicit limits should be stored as given
        G.add_edge_cmap(data_column="flux", cmap=cmap, vmin=5, vmax=10)
        _check_edge_color(G, "flux", 5, 10)

        # Now test removal: colormap options vanish and the edge colour
        # goes back to "black"
        G.remove_edge_cmap()
        for removed in ["edge_vmin", "edge_vmax", "edge_cmap"]:
            with pytest.raises(KeyError):
                G.edge_options[removed]

        assert G.edge_options["edge_color"] == "black"
def test_add_gpm(test_data):
    """
    Check GenotypePhenotypeGraph.add_gpm: type checking of the map argument,
    handling of edge_weight_column, and that nodes and edges end up
    mirroring the attached map's data and neighbors.
    """

    # Values that are not genotype-phenotype maps must raise TypeError
    graph = base.GenotypePhenotypeGraph()
    not_maps = ["test", [], 1.3, (1, 2), base.GenotypePhenotypeGraph()]
    for candidate in not_maps:
        with pytest.raises(TypeError):
            graph.add_gpm(candidate)

    for dataset in test_data:

        genotype = dataset["genotype"]

        # A new graph starts without a map
        graph = base.GenotypePhenotypeGraph()
        assert graph.gpm is None

        # Unknown edge_weight_column; map has no neighbors yet
        gp_map = gpmap.GenotypePhenotypeMap(genotype)
        with pytest.raises(ValueError):
            graph.add_gpm(gp_map, edge_weight_column="not_yet")

        # Unknown edge_weight_column; neighbors built beforehand
        graph = base.GenotypePhenotypeGraph()
        gp_map = gpmap.GenotypePhenotypeMap(genotype)
        gp_map.get_neighbors()
        gp_map.neighbors.loc[:, "now_here"] = np.ones(len(gp_map.neighbors))
        with pytest.raises(ValueError):
            graph.add_gpm(gp_map, edge_weight_column="not_yet")

        # Existing edge_weight_column; neighbors built beforehand
        graph = base.GenotypePhenotypeGraph()
        gp_map = gpmap.GenotypePhenotypeMap(genotype)
        gp_map.get_neighbors()
        gp_map.neighbors.loc[:, "now_here"] = np.ones(len(gp_map.neighbors))
        ret = graph.add_gpm(gp_map, edge_weight_column="now_here")
        assert ret.edge_weight_column == "now_here"

        # Map without pre-built neighbors
        graph = base.GenotypePhenotypeGraph()
        gp_map = gpmap.GenotypePhenotypeMap(genotype)
        ret = graph.add_gpm(gp_map)

        # add_gpm hands back the graph itself
        assert graph is ret

        # Neighbors exist and carry the default "weight" column; the bare
        # attribute access raises if that column is missing.
        assert graph.edge_weight_column == "weight"
        assert graph.gpm.neighbors is not None
        graph.gpm.neighbors.weight

        # The very same map object is attached, not a copy
        assert graph.gpm is gp_map

        # One node per row of gpm.data, carrying these columns as attributes
        assert len(graph.nodes) == len(graph.gpm.data)
        node_keys = ["genotype", "binary", "n_mutations", "name"]
        for node in range(len(graph.nodes)):
            for key in node_keys:
                assert graph.nodes[node][key] == graph.gpm.data.iloc[node][key]

        # Neighbors were generated. (Whether they are the right neighbors
        # is left to the test_gpm.py tests.)
        assert graph.gpm.neighbors is not None

        # One edge per neighbor row; each lookup raises if the edge is absent
        assert len(graph.gpm.neighbors) == len(graph.edges)
        for row in range(len(graph.gpm.neighbors)):
            graph.edges[graph.gpm.neighbors.edge[row]]

        # Neighbors built a different way beforehand should be used as they
        # are rather than regenerated on the fly.
        graph = base.GenotypePhenotypeGraph()
        gp_map = gpmap.GenotypePhenotypeMap(genotype)
        gp_map.get_neighbors("hamming", cutoff=3)
        pregen_edges = np.copy(gp_map.neighbors.edge)
        graph.add_gpm(gp_map)

        assert len(pregen_edges) == len(graph.gpm.neighbors.edge)
        for row in range(len(graph.gpm.neighbors)):
            graph.edges[graph.gpm.neighbors.edge[row]]