def test_node_features():
    """Smoke-test residue-graph construction with node featurisers.

    Builds a graph from the bundled 4hhb structure with the
    ``expasy_protein_scale`` and ``meiler_embedding`` node metadata functions
    configured. No per-node assertions are active yet (see the TODOs), so the
    test only checks that construction and node iteration do not raise.
    """
    # TODO: this test requires attention.
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"

    # TODO: the node data assignment flow needs refactoring.
    # Featurisers not exercised yet: rsa, asa, phi, psi, secondary_structure,
    # partial(aaindex1, accession="FAUJ880111").
    featurisers = [expasy_protein_scale, meiler_embedding]
    config = ProteinGraphConfig(node_metadata_functions=featurisers)
    G = construct_graph(pdb_path=str(pdb_file), config=config)

    # TODO: assert that "meiler_embedding", the expasy scale, "rsa", "asa",
    # "phi", "psi" and "secondary_structure" exist on every node. The checks
    # are disabled because the featurisers return a pd.Series rather than
    # adding their values to the node.
    for _node, _attrs in G.nodes(data=True):
        pass
def test_insertion_handling():
    """Check node counts for a structure with many insertions and altlocs.

    Builds a CA-granularity graph of 6OGE with insertions disabled and
    asserts that the summed lengths of the chain A-E sequences, and the
    number of coordinate rows, both equal the number of nodes.
    """
    config = ProteinGraphConfig(
        granularity="CA",
        keep_hets=False,
        insertions=False,
        verbose=False,
        node_metadata_functions=[meiler_embedding, expasy_protein_scale],
        edge_construction_functions=[
            add_peptide_bonds,
            add_hydrogen_bond_interactions,
            add_ionic_interactions,
            add_aromatic_sulphur_interactions,
            add_hydrophobic_interactions,
            add_cation_pi_interactions,
        ],
    )

    # This is a nasty PDB with a lot of insertions and altlocs
    g = construct_graph(config=config, pdb_code="6OGE")

    total_residues = sum(
        len(g.graph[f"sequence_{chain}"]) for chain in ("A", "B", "C", "D", "E")
    )
    assert total_residues == len(g)
    assert g.graph["coords"].shape[0] == len(g)
def test_sequence_features():
    """Check that sequence featurisers add graph-level features per chain.

    Only the BioVec embedding and molecular weight featurisers are enabled;
    the ESM sequence/residue embeddings (and their node-level checks) are
    currently switched off.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"

    config = ProteinGraphConfig(
        graph_metadata_functions=[biovec_sequence_embedding, molecular_weight]
    )
    G = construct_graph(pdb_path=str(pdb_file), config=config)

    # TODO: once the ESM featurisers are re-enabled, also check that every
    # node carries an "esm_embedding" of length 1280 and that the graph has
    # an f"esm_embedding_{chain}" entry.

    # Every chain must expose its sequence plus each enabled featuriser.
    graph_attrs = G.graph
    for chain in graph_attrs["chain_ids"]:
        for prefix in ("sequence", "biovec_embedding", "molecular_weight"):
            assert f"{prefix}_{chain}" in graph_attrs
def test_extract_subgraph_from_bond_type():
    """Tests subgraph extraction from bond type.

    Edge ``kind`` attributes are sets of bond-type names, so membership is
    checked with set operations. The previous ``d["kind"] in BOND_TYPES``
    compared a set against a list of strings, which is always False and
    silently skipped the guarded assertions.
    """
    file_path = Path(__file__).parent / "test_data/4hhb.pdb"
    # NOTE(review): this config is built but intentionally not passed to
    # ``construct_graph`` (it was already disabled upstream). Confirm whether
    # the ionic-interaction config should be re-enabled; without it the graph
    # may not contain any "ionic" edges.
    config = ProteinGraphConfig(
        edge_construction_functions=[add_peptide_bonds, add_ionic_interactions]
    )
    G = construct_graph(pdb_path=str(file_path))  # , config=config)

    BOND_TYPES = ["ionic"]
    selected = set(BOND_TYPES)

    s_g = extract_subgraph_by_bond_type(G, BOND_TYPES, filter_dataframe=True)

    # Every edge carrying a selected bond type must survive the extraction.
    for u, v, d in G.edges(data=True):
        if selected & set(d["kind"]):
            assert u in s_g.nodes()
            assert v in s_g.nodes()
            assert (u, v) in s_g.edges()

    # The extracted subgraph must contain only the selected bond types.
    for u, v, d in s_g.edges(data=True):
        for bond in d["kind"]:
            assert bond in BOND_TYPES

    s_g = extract_subgraph_by_bond_type(
        G, BOND_TYPES, filter_dataframe=True, inverse=True
    )

    # Edges made up solely of selected bond types must be gone after the
    # inverse extraction (edges that also carry other kinds may remain).
    for u, v, d in G.edges(data=True):
        if set(d["kind"]) <= selected:
            assert (u, v) not in s_g.edges()

    # No selected bond type may remain in the inverse subgraph.
    for u, v, d in s_g.edges(data=True):
        for bond in d["kind"]:
            assert bond not in BOND_TYPES
def test_extract_subgraph_from_chains():
    """Check chain-based subgraph extraction, both direct and inverted."""
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    G = construct_graph(pdb_path=str(pdb_file))

    CHAINS = ["A", "C"]
    # Nodes of the full graph that belong to the requested chains.
    nodes_in_chains = [
        node for node, attrs in G.nodes(data=True) if attrs["chain_id"] in CHAINS
    ]

    kept = extract_subgraph_from_chains(G, CHAINS, filter_dataframe=True)

    # Only the requested chains are present ...
    for _, attrs in kept.nodes(data=True):
        assert attrs["chain_id"] in CHAINS
    # ... and none of their nodes were dropped.
    for node in nodes_in_chains:
        assert node in kept.nodes()
    # The filtered dataframe agrees with the selection.
    assert kept.graph["pdb_df"]["chain_id"].isin(CHAINS).all()

    dropped = extract_subgraph_from_chains(
        G, CHAINS, filter_dataframe=True, inverse=True
    )

    # The inverse selection contains none of the requested chains ...
    for _, attrs in dropped.nodes(data=True):
        assert attrs["chain_id"] not in CHAINS
    # ... so every node from those chains must be absent.
    for node in nodes_in_chains:
        assert node not in dropped.nodes()
def test_extract_subgraph_from_sequence_position():
    """Check sequence-position subgraph extraction, direct and inverted."""
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    G = construct_graph(pdb_path=str(pdb_file))

    SEQ_POS = list(range(1, 50, 2))
    # Nodes of the full graph whose residue number is in the selection.
    nodes_at_positions = [
        node
        for node, attrs in G.nodes(data=True)
        if attrs["residue_number"] in SEQ_POS
    ]

    kept = extract_subgraph_by_sequence_position(
        G, SEQ_POS, filter_dataframe=True
    )

    # Only the requested sequence positions are present ...
    for _, attrs in kept.nodes(data=True):
        assert attrs["residue_number"] in SEQ_POS
    # ... and all matching nodes were extracted.
    for node in nodes_at_positions:
        assert node in kept.nodes()
    # The filtered dataframe agrees with the selection.
    assert kept.graph["pdb_df"]["residue_number"].isin(SEQ_POS).all()

    dropped = extract_subgraph_by_sequence_position(
        G, SEQ_POS, filter_dataframe=True, inverse=True
    )

    # The inverse selection contains none of the requested positions ...
    for _, attrs in dropped.nodes(data=True):
        assert attrs["residue_number"] not in SEQ_POS
    # ... so every node at those positions must be absent.
    for node in nodes_at_positions:
        assert node not in dropped.nodes()
def test_extract_subgraph_from_point():
    """Check spherical subgraph selection around a point, direct and inverted."""
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    G = construct_graph(pdb_path=str(pdb_file))

    POINT = np.array([0.0, 0.0, 0.0])
    RADIUS = 10

    def _distance_to_point(attrs):
        """Return the Euclidean distance between a node and POINT."""
        return np.linalg.norm(attrs["coords"] - POINT)

    inside = extract_subgraph_from_point(G, POINT, RADIUS, filter_dataframe=True)

    # Every extracted node lies strictly within the sphere ...
    for _, attrs in inside.nodes(data=True):
        assert _distance_to_point(attrs) < RADIUS
    # ... and every node within the sphere was extracted.
    for node, attrs in G.nodes(data=True):
        if _distance_to_point(attrs) < RADIUS:
            assert node in inside.nodes()

    outside = extract_subgraph_from_point(
        G, POINT, RADIUS, filter_dataframe=True, inverse=True
    )

    # Every node of the inverse selection lies strictly outside the sphere ...
    for _, attrs in outside.nodes(data=True):
        assert _distance_to_point(attrs) > RADIUS
    # ... and every node outside the sphere was extracted.
    for node, attrs in G.nodes(data=True):
        if _distance_to_point(attrs) > RADIUS:
            assert node in outside.nodes()
def test_distance_edges():
    """Example-based test that distance-based edge construction works correctly

    Uses 4hhb PDB file as an example test case. Only checks that a graph is
    returned when all listed edge construction functions are enabled.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"

    knn_edges = partial(add_k_nn_edges, k=5, long_interaction_threshold=10)
    threshold_edges = partial(
        add_distance_threshold,
        threshold=12,
        long_interaction_threshold=10,
    )

    config = ProteinGraphConfig(
        edge_construction_functions=[
            knn_edges,
            add_hydrophobic_interactions,
            # NOTE(review): an older TODO said aromatic interactions were
            # "removed for now as ring centroids require precomputing", but
            # the function is enabled here - confirm which is intended.
            add_aromatic_interactions,
            add_aromatic_sulphur_interactions,
            add_delaunay_triangulation,
            add_cation_pi_interactions,
            add_peptide_bonds,
            add_hydrogen_bond_interactions,
            add_disulfide_interactions,
            add_ionic_interactions,
            threshold_edges,
        ]
    )
    G = construct_graph(pdb_path=str(pdb_file), config=config)
    assert G is not None
def test_extract_subgraph_from_atom_types():
    """Check subgraph extraction restricted to a list of allowed atom types.

    Selecting only "CA" atoms from the default graph is expected to return a
    graph with the same number of nodes as the original.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    G = construct_graph(pdb_path=str(pdb_file))

    ATOM_TYPES = ["CA"]
    subgraph = extract_subgraph_from_atom_types(
        G, ATOM_TYPES, filter_dataframe=True
    )

    assert isinstance(subgraph, nx.Graph)
    assert len(subgraph) == len(G)
def test_save_graph_to_pdb():
    """Round-trip a graph through ``save_graph_to_pdb`` and compare results.

    The PDB file is written into a temporary directory (instead of a
    hard-coded ``/tmp`` path) so the test is portable, does not collide with
    concurrent runs and leaves no artifacts behind.
    """
    import tempfile

    g = construct_graph(pdb_code="4hhb")

    with tempfile.TemporaryDirectory() as tmp_dir:
        out_path = os.path.join(tmp_dir, "test_graph.pdb")
        save_graph_to_pdb(g, out_path)

        # Check file exists before trying to read it back.
        assert os.path.isfile(out_path)

        a = read_pdb_to_dataframe(out_path).df["ATOM"]

        # Check for equivalence between saved and existing DFs.
        # We drop the line_idx columns as these will be renumbered
        assert_frame_equal(
            a.drop(["line_idx"], axis=1),
            g.graph["pdb_df"].drop(["line_idx"], axis=1),
        )

        # Rebuild the graph while the temporary file still exists.
        h = construct_graph(pdb_path=out_path)

    # We check for isomorphism rather than equality as array features are not
    # comparable
    assert nx.is_isomorphic(g, h)
def test_amino_acid_one_hot_example():
    """Example-based test on 4hhb for `amino_acid_onehot`.

    Covers both the default array output and the ``return_array=False``
    (pd.Series) output of the featuriser.
    """
    # Array variant: exactly one position is set per residue.
    array_config = ProteinGraphConfig(
        node_metadata_functions=[amino_acid_one_hot]
    )
    g = construct_graph(pdb_code="4hhb", config=array_config)
    for _, attrs in g.nodes(data=True):
        assert sum(attrs["amino_acid_one_hot"]) == 1

    # Series variant: additionally, the set position is labelled with the
    # residue's one-letter code.
    series_featuriser = partial(amino_acid_one_hot, return_array=False)
    series_config = ProteinGraphConfig(
        node_metadata_functions=[series_featuriser]
    )
    g = construct_graph(pdb_code="4hhb", config=series_config)
    for _, attrs in g.nodes(data=True):
        encoding = attrs["amino_acid_one_hot"]
        assert sum(encoding) == 1
        assert encoding.idxmax() == RESI_THREE_TO_1[attrs["residue_name"]]
def test_chain_selection():
    """Example-based test that chain selection works correctly.

    Uses 4hhb PDB file as an example test case.
    """
    pdb_file = str(Path(__file__).parent / "test_data/4hhb.pdb")

    all_chains = ["A", "B", "C", "D"]
    selected_chains = ["A", "D"]

    # Default construction keeps every chain, on the graph and on its nodes.
    G = construct_graph(pdb_path=pdb_file)
    assert G.graph["chain_ids"] == all_chains
    for _, attrs in G.nodes(data=True):
        assert attrs["chain_id"] in all_chains

    # A chain selection restricts both the graph metadata and the nodes.
    G = construct_graph(pdb_path=pdb_file, chain_selection="AD")
    assert G.graph["chain_ids"] == selected_chains
    for _, attrs in G.nodes(data=True):
        assert attrs["chain_id"] in selected_chains
def test_edges_do_not_add_nodes_for_chain_subset():
    """Check edge construction adds no nodes outside the selected chain.

    For each chain of 2vvi the graph built with several edge construction
    functions must have the expected node count for that chain.
    """
    config = ProteinGraphConfig(
        edge_construction_functions=[
            add_peptide_bonds,
            add_hydrogen_bond_interactions,
            add_disulfide_interactions,
            add_ionic_interactions,
            add_aromatic_interactions,
            add_aromatic_sulphur_interactions,
            add_cation_pi_interactions,
        ]
    )

    # Expected number of nodes per selected chain.
    expected_sizes = {"A": 217, "B": 219, "C": 222, "D": 219}
    for chain, n_nodes in expected_sizes.items():
        g = construct_graph(
            config=config, pdb_code="2vvi", chain_selection=chain
        )
        assert len(g) == n_nodes
def test_add_beta_carbon_vector():
    """Check the ``c_beta_vector`` node attribute, scaled and unscaled.

    Scaled vectors must have unit norm, unscaled vectors must point from the
    node coordinates to the CB atom, and glycines must get a zero vector.
    """
    zero_vector = np.array([0.0, 0.0, 0.0])
    coord_cols = ["x_coord", "y_coord", "z_coord"]

    config = ProteinGraphConfig(
        edge_construction_functions=[
            partial(add_beta_carbon_vector, scale=True)
        ],
    )
    g = construct_graph(pdb_code="1lds", config=config)
    raw_pdb = g.graph["raw_pdb_df"]

    for n, d in g.nodes(data=True):
        # Check that the node has the correct attributes
        assert "c_beta_vector" in d
        # Check the vector is of the correct dimensionality
        assert d["c_beta_vector"].shape == (3,)

        # check glycines are zero
        if d["residue_name"] == "GLY":
            np.testing.assert_equal(d["c_beta_vector"], zero_vector)
        else:
            # Check scaled vector has norm close 1
            np.testing.assert_almost_equal(
                np.linalg.norm(d["c_beta_vector"]), 1.0
            )

    # Test unscaled vector
    config = ProteinGraphConfig(
        edge_construction_functions=[
            partial(add_beta_carbon_vector, scale=False)
        ],
    )
    g = construct_graph(pdb_code="1lds", config=config)

    for n, d in g.nodes(data=True):
        # check glycines are zero
        if d["residue_name"] == "GLY":
            np.testing.assert_equal(d["c_beta_vector"], zero_vector)
        else:
            # Check the vector is pointing in the correct direction.
            # Both conditions are combined into one mask: chaining
            # ``df[mask_a][mask_b]`` applies a mask built on the unfiltered
            # frame to the filtered one, which pandas only tolerates with a
            # reindexing warning.
            is_cb_of_node = (raw_pdb["node_id"] == n) & (
                raw_pdb["atom_name"] == "CB"
            )
            cb_true = np.array(
                raw_pdb.loc[is_cb_of_node, coord_cols]
            ).T.squeeze()
            np.testing.assert_almost_equal(
                cb_true, d["coords"] + d["c_beta_vector"]
            )
def test_add_sidechain_vector():
    """Check the ``sidechain_vector`` node attribute, scaled and unscaled.

    Scaled vectors must have unit norm, unscaled vectors must point from the
    node coordinates to the mean position of the residue's R-group atoms, and
    glycines must get a zero vector.
    """
    zero_vector = np.array([0.0, 0.0, 0.0])
    coord_cols = ["x_coord", "y_coord", "z_coord"]

    config = ProteinGraphConfig(
        edge_construction_functions=[
            partial(add_sidechain_vector, scale=True)
        ],
    )
    g = construct_graph(pdb_code="1lds", config=config)

    for n, d in g.nodes(data=True):
        # Check that the node has the correct attributes
        assert "sidechain_vector" in d
        # Check the vector is of the correct dimensionality
        assert d["sidechain_vector"].shape == (3,)

        # check glycines are zero
        if d["residue_name"] == "GLY":
            np.testing.assert_equal(d["sidechain_vector"], zero_vector)
        else:
            # Check scaled vector has norm close 1
            np.testing.assert_almost_equal(
                np.linalg.norm(d["sidechain_vector"]), 1.0
            )

    # Test unscaled vector
    config = ProteinGraphConfig(
        edge_construction_functions=[
            partial(add_sidechain_vector, scale=False)
        ],
    )
    g = construct_graph(pdb_code="1lds", config=config)

    # Mean R-group position per node, computed once rather than per node.
    # Only the coordinate columns are averaged: taking ``.mean()`` over every
    # column raises on non-numeric columns in recent pandas versions.
    sidechain_centroids = (
        g.graph["rgroup_df"].groupby("node_id")[coord_cols].mean()
    )

    for n, d in g.nodes(data=True):
        # check glycines are zero
        if d["residue_name"] == "GLY":
            np.testing.assert_equal(d["sidechain_vector"], zero_vector)
        else:
            # Check the vector is pointing in the correct direction
            sc_true = np.array(sidechain_centroids.loc[n])
            np.testing.assert_almost_equal(
                sc_true, d["coords"] + d["sidechain_vector"]
            )
def test_save_rgroup_df_to_pdb():
    """Round-trip the R-group dataframe through ``save_rgroup_df_to_pdb``.

    The PDB file is written into a temporary directory (instead of a
    hard-coded ``/tmp`` path) so the test is portable, does not collide with
    concurrent runs and leaves no artifacts behind.
    """
    import tempfile

    g = construct_graph(pdb_code="4hhb")

    with tempfile.TemporaryDirectory() as tmp_dir:
        out_path = os.path.join(tmp_dir, "test_rgroup.pdb")
        save_rgroup_df_to_pdb(g, out_path)

        # Check file exists before trying to read it back.
        assert os.path.isfile(out_path)

        a = read_pdb_to_dataframe(out_path).df["ATOM"]

    # We drop the line_idx columns as these will be renumbered
    assert_frame_equal(
        a.drop(["line_idx"], axis=1),
        g.graph["rgroup_df"].drop(["line_idx"], axis=1),
    )
def test_construct_graph():
    """Example-based test that graph construction works correctly.

    Uses 4hhb PDB file as an example test case and checks the graph type,
    the node count and the number of pure peptide-bond edges.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    G = construct_graph(pdb_path=str(pdb_file))

    assert isinstance(G, nx.Graph)
    assert len(G) == 574

    # Count edges whose kind is exactly the peptide bond.
    n_peptide_bonds = sum(
        1
        for _, _, attrs in G.edges(data=True)
        if attrs["kind"] == {"peptide_bond"}
    )
    assert n_peptide_bonds == 570
def test_extract_k_hop_subgraph():
    """Check that the 1-hop subgraph is the central node plus its neighbours."""
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    G = construct_graph(pdb_path=str(pdb_file))

    CENTRAL_NODE = "B:SER:49"
    K = 1
    s_g = extract_k_hop_subgraph(G, CENTRAL_NODE, K, filter_dataframe=True)

    neighbours = list(G.neighbors(CENTRAL_NODE))

    # Apart from the centre, the subgraph contains only direct neighbours ...
    for node in s_g.nodes():
        if node == CENTRAL_NODE:
            continue
        assert node in neighbours

    # ... and every direct neighbour is in the subgraph.
    for node in neighbours:
        assert node in s_g.nodes()
def test_surface_subgraph():
    """Tests surface subgraph extraction.

    Nodes are selected by their ``rsa`` attribute: every node of the subgraph
    must be at or above the threshold, and every node of the full graph at or
    above the threshold must be part of the subgraph.
    """
    file_path = Path(__file__).parent / "test_data/4hhb.pdb"
    config = ProteinGraphConfig(
        graph_metadata_functions=[rsa], dssp_config=DSSPConfig()
    )
    G = construct_graph(pdb_path=str(file_path), config=config)

    RSA_THRESHOLD: float = 0.2
    s_g = extract_surface_subgraph(G, RSA_THRESHOLD, filter_dataframe=True)

    for n, d in s_g.nodes(data=True):
        assert d["rsa"] >= RSA_THRESHOLD

    for n, d in G.nodes(data=True):
        if d["rsa"] >= RSA_THRESHOLD:
            # Use a real message: ``print(...)`` evaluates to None, so the
            # AssertionError previously carried no information.
            assert (
                n in s_g.nodes()
            ), f"Surface node {n} missing from subgraph: {d}"
def test_node_list_subgraphing():
    """Check subgraph extraction from an explicit node list, direct and inverted."""
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    NODE_LIST = ["C:ALA:28", "C:ARG:31", "D:LEU:75", "A:THR:38"]
    # Regex alternation matching any of the requested node IDs.
    node_pattern = "|".join(NODE_LIST)

    G = construct_graph(pdb_path=str(pdb_file))

    # Direct selection: a graph containing exactly the requested nodes.
    subgraph = extract_subgraph_from_node_list(
        G, NODE_LIST, filter_dataframe=True
    )
    assert isinstance(subgraph, nx.Graph)
    assert len(subgraph) == len(NODE_LIST)
    for node in subgraph.nodes():
        assert node in NODE_LIST

    df_node_ids = subgraph.graph["pdb_df"]["node_id"]
    assert df_node_ids.str.contains(node_pattern, case=True).all()

    # The returned node list only contains requested nodes.
    returned_node_list = extract_subgraph_from_node_list(
        G, NODE_LIST, return_node_list=True
    )
    assert all(elem in NODE_LIST for elem in returned_node_list)

    # Inverse selection: no overlap with the requested nodes.
    complement = extract_subgraph_from_node_list(
        G, NODE_LIST, inverse=True, filter_dataframe=True
    )
    assert len(complement) == len(G) - len(NODE_LIST)
    for node in complement.nodes():
        assert node not in NODE_LIST

    df_node_ids = complement.graph["pdb_df"]["node_id"]
    assert not df_node_ids.str.contains(node_pattern, case=True).any()

    returned_node_list = extract_subgraph_from_node_list(
        G, NODE_LIST, inverse=True, return_node_list=True
    )
    assert all(elem not in NODE_LIST for elem in returned_node_list)
def main(config_path, pdb_path, output_path):
    """Build the graphs and save them in output dir.

    :param config_path: Path to a config file handed to ``parse_config``. If
        falsy, ``None`` is passed as the config to ``construct_graph``.
    :param pdb_path: Path-like object (must support ``is_file``, ``is_dir``
        and ``glob``) pointing to a single PDB file or to a directory whose
        ``*.pdb`` files are all processed.
    :param output_path: Path-like directory; one ``<stem>.pickle`` file is
        written into it per processed PDB file.
    :raises NotImplementedError: If ``pdb_path`` is neither a file nor a
        directory.
    """
    import pickle

    config = parse_config(path=config_path) if config_path else None

    if pdb_path.is_file():
        pdb_paths = [pdb_path]
    elif pdb_path.is_dir():
        pdb_paths = list(pdb_path.glob("*.pdb"))
    else:
        raise NotImplementedError(
            "Given PDB path needs to point to either a pdb file or a directory with pdb files."
        )

    for path in pdb_paths:
        g = construct_graph(config=config, pdb_path=str(path))
        # ``nx.write_gpickle`` was removed in NetworkX 3.0; pickling directly
        # with the highest protocol writes the same kind of file.
        with open(str(output_path / f"{path.stem}.pickle"), "wb") as f:
            pickle.dump(g, f, protocol=pickle.HIGHEST_PROTOCOL)
def test_secondary_structure_subgraph():
    """Check subgraph extraction by secondary-structure element.

    Every node of the subgraph must carry one of the requested ``ss`` labels,
    and every node of the full graph with such a label must be extracted.
    """
    pdb_file = Path(__file__).parent / "test_data/4hhb.pdb"
    config = ProteinGraphConfig(
        graph_metadata_functions=[secondary_structure],
        dssp_config=DSSPConfig(),
    )
    G = construct_graph(pdb_path=str(pdb_file), config=config)

    SS_ELEMENTS: List[str] = ["H"]
    subgraph = extract_subgraph_from_secondary_structure(
        G, SS_ELEMENTS, filter_dataframe=True
    )

    # Only requested elements are present in the subgraph ...
    for _, attrs in subgraph.nodes(data=True):
        assert attrs["ss"] in SS_ELEMENTS

    # ... and no matching node of the full graph was left out.
    for node, attrs in G.nodes(data=True):
        if attrs["ss"] not in SS_ELEMENTS:
            continue
        assert node in subgraph.nodes()
def test_successful_pickle():
    """Tests subgraphs can be successfully pickled and unpickled.

    The pickle is written into a temporary directory (instead of a hard-coded
    ``/tmp`` path) so the test is portable, does not collide with concurrent
    runs and leaves no artifacts behind.
    """
    import os
    import tempfile

    file_path = Path(__file__).parent / "test_data/4hhb.pdb"
    config = ProteinGraphConfig(
        graph_metadata_functions=[secondary_structure],
        dssp_config=DSSPConfig(),
    )
    G = construct_graph(pdb_path=str(file_path), config=config)

    s_g = extract_subgraph_from_residue_types(
        G,
        residue_types=["ALA", "SER", "MET"],
        update_coords=True,
        filter_dataframe=True,
        recompute_distmat=True,
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        pickle_path = os.path.join(tmp_dir, "test_graph.p")

        with open(pickle_path, "wb") as f:
            pickle.dump(s_g, f)

        # Safe to unpickle: the file was written by this test just above.
        with open(pickle_path, "rb") as f:
            loaded_graph = pickle.load(f)

    assert nx.is_isomorphic(s_g, loaded_graph)
def test_extract_subgraph_from_residue_types():
    """Tests subgraph extraction from a list of residue types.

    Covers the direct and the inverse selection. Returned node lists are
    validated through the ``residue_name`` attribute of the corresponding
    nodes; comparing node IDs against residue names (as done previously) can
    never match and made those assertions vacuous.
    """
    file_path = Path(__file__).parent / "test_data/4hhb.pdb"
    RESIDUE_TYPES = ["ALA", "SER", "GLY"]
    ALANINES = 72
    SERINES = 32
    GLYCINES = 40

    G = construct_graph(pdb_path=str(file_path))

    g = extract_subgraph_from_residue_types(
        G, RESIDUE_TYPES, filter_dataframe=True
    )

    # Check we get back a graph and it contains the correct nodes
    assert isinstance(g, nx.Graph)
    assert len(g) == ALANINES + SERINES + GLYCINES
    for n, d in g.nodes(data=True):
        assert d["residue_name"] in RESIDUE_TYPES

    assert (
        g.graph["pdb_df"]["residue_name"]
        .str.contains("|".join(RESIDUE_TYPES), case=True)
        .all()
    )

    # Check the per-residue-type node counts
    assert (
        len([n for n, d in g.nodes(data=True) if d["residue_name"] == "ALA"])
        == ALANINES
    )
    assert (
        len([n for n, d in g.nodes(data=True) if d["residue_name"] == "GLY"])
        == GLYCINES
    )
    assert (
        len([n for n, d in g.nodes(data=True) if d["residue_name"] == "SER"])
        == SERINES
    )

    # Check the returned node list only contains residues of the requested
    # types. This must go through the residue-type extractor (the node-list
    # extractor was called here before, with residue names as node IDs).
    returned_node_list = extract_subgraph_from_residue_types(
        G, RESIDUE_TYPES, return_node_list=True
    )
    assert all(
        G.nodes[n]["residue_name"] in RESIDUE_TYPES
        for n in returned_node_list
    )

    # Check there is no overlap when we inverse the selection
    g = extract_subgraph_from_residue_types(
        G, RESIDUE_TYPES, inverse=True, filter_dataframe=True
    )

    # assert len(g) == (len(G) - GLYCINES - ALANINES - SERINES)
    for n, d in g.nodes(data=True):
        assert d["residue_name"] not in RESIDUE_TYPES

    assert not (
        g.graph["pdb_df"]["residue_name"]
        .str.contains("|".join(RESIDUE_TYPES), case=True)
        .any()
    )

    returned_node_list = extract_subgraph_from_residue_types(
        G, RESIDUE_TYPES, inverse=True, return_node_list=True
    )
    assert all(
        G.nodes[n]["residue_name"] not in RESIDUE_TYPES
        for n in returned_node_list
    )
"granularity": "atom", "keep_hets": False, "deprotonate": True, "insertions": False, "verbose": False, } config = ProteinGraphConfig(**configs) config.edge_construction_functions = [ add_atomic_edges, add_ring_status, add_bond_order, ] config.node_metadata_functions = [meiler_embedding, expasy_protein_scale] g = construct_graph( config=config, pdb_path="../examples/pdbs/3eiy.pdb", pdb_code="3eiy" ) p = plotly_protein_structure_graph( g, 30, (1000, 2000), colour_nodes_by="element_symbol", colour_edges_by="kind", label_node_ids=False, ) p.show()
def generate_graph():
    """Generate PDB network.

    Helper that builds a graph from the structure file at ``DATA_PATH`` using
    the default ``construct_graph`` settings.
    """
    pdb_file = str(DATA_PATH)
    return construct_graph(pdb_path=pdb_file)
} config = ProteinGraphConfig(**configs) config.edge_construction_functions = [ salt_bridge, hydrogen_bond, van_der_waals, pi_cation, pi_stacking, hydrophobic, t_stacking, ] # Test High-level API # Iterate over rows to produce Graph, pickle graph and label for row in tqdm(range(len(df))): example = df.iloc[row] file_path = f'pdbs/{example["Free PDB"]}.pdb' contact_file = f'contacts/{example["Free PDB"]}_contacts.tsv' g = construct_graph(config=config, pdb_code=example["Free PDB"]) print(g) print("Successfully computed all graphs") # Example Run: # python make_rearrangement_data.py -o 'none' -n 'meiler' -s True -c '/home/arj39/Documents/github/getcontacts' # python make_rearrangement_data.py -o 'none' -n 'meiler' -s True -c '/Users/arianjamasb/github/getcontacts'
for i, (_, _, feat_dict) in enumerate(G.edges(data=True)): for key, value in feat_dict.items(): data[str(key)] = (list(value) if i == 0 else data[str(key)] + list(value)) # Add graph-level features for feat_name in G.graph: data[str(feat_name)] = [G.graph[feat_name]] data["edge_index"] = edge_index.view(2, -1) data = Data.from_dict(data) data.num_nodes = G.number_of_nodes() return data if __name__ == "__main__": from graphein.protein.config import ProteinGraphConfig from graphein.protein.graphs import construct_graph g = construct_graph(pdb_code="3eiy", config=ProteinGraphConfig()) assert type(g) is nx.Graph # print(SUPPORTED_FORMATS) convertor = GraphFormatConvertor(src_format="nx", dst_format="pyg", verbose="gnn") pyg = convertor(g) assert type(pyg) is torch_geometric.data.Data