def __get_mutation_center(struct_df, label_info, center_at_mut=True):
    """
    Compute the point used as the subgrid center for a structure.

    If <center_at_mut> is set, the center is computed from the coordinates
    of the 'CA' atom rows whose chain and residue match <label_info>.
    Otherwise it is computed from the coordinates of every row in
    <struct_df>.

    Returns whatever util.get_center() returns for the selected positions.
    """
    coord_cols = ['x', 'y', 'z']

    if not center_at_mut:
        # Center on all atoms of the structure
        all_pos = struct_df[coord_cols].astype(np.float32)
        return util.get_center(all_pos)

    # Use CA position of the mutated residue as center for subgrid center
    same_chain = struct_df.chain == label_info.chain
    same_residue = struct_df.residue == label_info.residue
    is_ca = struct_df.name == 'CA'
    ca_pos = struct_df[same_chain & same_residue & is_ca][coord_cols].astype(
        np.float32)
    return util.get_center(ca_pos)
def get_data_stats(data_filename):
    """
    Get the furthest distance from the ligand's center and the number of
    atoms for each structure in the dataset.

    The 'structures' table is read from the HDF file <data_filename>. For
    every structure, the ligand is taken to be the rows with chain 'LIG'.
    Summary statistics and the percentage of structures with max_dist < 20
    are printed.

    Returns a DataFrame with columns ['pdbcode', 'max_dist', 'num_atoms'],
    sorted by max_dist and then num_atoms, both descending.
    """
    data_df = pd.read_hdf(data_filename, 'structures')
    coord_cols = ['x', 'y', 'z']

    data = []
    # Group by the scalar key rather than a one-element list: pandas >= 2.0
    # yields 1-tuples as group labels for a list of length one, which would
    # store ('code',) instead of 'code' in the pdbcode column.
    for pdbcode, struct_df in data_df.groupby('structure'):
        pos = struct_df[coord_cols].astype(np.float32)
        ligand_pos = struct_df[struct_df.chain == 'LIG'][coord_cols].astype(
            np.float32)
        ligand_center = util.get_center(ligand_pos)

        max_dist = util.get_max_distance_from_center(pos, ligand_center)
        num_atoms = struct_df.shape[0]
        data.append((pdbcode, max_dist, num_atoms))

    df = pd.DataFrame(data, columns=['pdbcode', 'max_dist', 'num_atoms'])
    df = df.sort_values(by=['max_dist', 'num_atoms'],
                        ascending=[False, False]).reset_index(drop=True)
    print(df.describe())
    # Percentage of structures that fit within a distance of 20
    print(df[df.max_dist < 20].shape[0] * 100.0 / df.shape[0])
    return df
def df_to_feature(struct_df, grid_config, random_seed=None):
    """
    Build the grid for <struct_df>, centered on all of its atoms.

    The center is computed from the x/y/z coordinates of every row, a
    rotation matrix is generated from <grid_config> and <random_seed>, and
    both are passed to subgrid_gen.get_grid(), whose result is returned.
    """
    coords = struct_df[['x', 'y', 'z']].astype(np.float32)
    grid_center = util.get_center(coords)
    rotation = subgrid_gen.gen_rot_matrix(
        grid_config, random_seed=random_seed)
    return subgrid_gen.get_grid(
        struct_df, grid_center, config=grid_config, rot_mat=rotation)
def get_data_stats(sharded_list):
    """
    Get the furthest distance from the ligand's center and the number of
    atoms for the inactive and active subunit of every ensemble in the
    sharded datasets.

    For each dataset in <sharded_list>, every shard is iterated together
    with its 'labels' table. The ligand is taken to be the rows with chain
    'L'. Besides the per-subunit statistics, this prints the unique
    elements and their counts, the label distribution per dataset/shard
    (label is True when the ensemble's label equals 'A'), summary
    statistics, and the percentage of subunits with max_dist < 90.

    Returns a DataFrame with columns
    ['ensemble', 'subunit', 'max_dist', 'num_atoms'], sorted by max_dist
    and then num_atoms, both descending.
    """
    coord_cols = ['x', 'y', 'z']
    data = []
    all_elements = []
    labels = []

    for i, sharded in enumerate(sharded_list):
        for shard_num, shard_df in sharded.iter_shards():
            labels_df = sharded.read_shard(shard_num, key='labels')

            # Group by the scalar key rather than a one-element list:
            # pandas >= 2.0 yields 1-tuples as group labels for a list of
            # length one, which would break the label lookup below and
            # store tuples in the 'ensemble' column.
            for ensemble, ensemble_df in shard_df.groupby('ensemble'):
                all_elements.extend(ensemble_df.element.values)
                subunits = ensemble_df.subunit.unique()
                inactive = __get_subunit_name(subunits, mode='inactive')
                active = __get_subunit_name(subunits, mode='active')

                for subunit_name in [inactive, active]:
                    struct_df = ensemble_df[
                        ensemble_df.subunit == subunit_name]
                    pos = struct_df[coord_cols].astype(np.float32)
                    ligand_pos = struct_df[struct_df.chain == 'L'][
                        coord_cols].astype(np.float32)
                    ligand_center = util.get_center(ligand_pos)

                    max_dist = util.get_max_distance_from_center(
                        pos, ligand_center)
                    num_atoms = struct_df.shape[0]
                    data.append(
                        (ensemble, subunit_name, max_dist, num_atoms))

                # One label entry per ensemble (it does not depend on the
                # subunit), taken from the first matching row.
                ensemble_labels = labels_df[labels_df.ensemble == ensemble]
                labels.append(
                    (i, shard_num, ensemble_labels.label.values[0] == 'A'))

    all_elements_df = pd.DataFrame(all_elements, columns=['element'])
    unique_elements = all_elements_df.element.unique()
    print('Unique elements ({:}): {:}'.format(len(unique_elements),
                                              unique_elements))
    print('\nElement counts:')
    print(all_elements_df.element.value_counts())
    print('\n')

    all_labels_df = pd.DataFrame(
        labels, columns=['sharded', 'shard_num', 'label'])
    print('\nLabel dist by dataset:')
    print(all_labels_df.groupby(
        ['sharded', 'shard_num']).label.value_counts())
    print('\n')

    df = pd.DataFrame(
        data, columns=['ensemble', 'subunit', 'max_dist', 'num_atoms'])
    df = df.sort_values(by=['max_dist', 'num_atoms'],
                        ascending=[False, False]).reset_index(drop=True)
    print(df.describe())
    # Percentage of subunits that fit within a distance of 90
    print(df[df.max_dist < 90].shape[0] * 100.0 / df.shape[0])
    return df
def df_to_feature(struct_df, grid_config, random_seed=None):
    """
    Build the grid for <struct_df>, centered on its ligand.

    The ligand is taken to be the rows with chain 'LIG'; the center
    computed from their coordinates is used as the subgrid center. The
    grid itself is generated from the full <struct_df> with a rotation
    matrix derived from <grid_config> and <random_seed>.
    """
    # Use center of ligand for subgrid center
    is_ligand = struct_df.chain == 'LIG'
    ligand_coords = struct_df[is_ligand][['x', 'y', 'z']].astype(np.float32)
    grid_center = util.get_center(ligand_coords)

    rotation = subgrid_gen.gen_rot_matrix(
        grid_config, random_seed=random_seed)
    return subgrid_gen.get_grid(
        struct_df, grid_center, config=grid_config, rot_mat=rotation)
def get_data_stats(data_filename):
    """
    Get the furthest distance from the molecule's center and the number of
    atoms for each molecule in the dataset.

    The 'structures' table is read from the HDF file <data_filename>.
    Summary statistics and the percentage of molecules with
    max_dist < 7.5 are printed.

    Returns a DataFrame with columns ['mol_id', 'max_dist', 'num_atoms'],
    sorted by max_dist and then num_atoms, both descending.
    """
    data_df = pd.read_hdf(data_filename, 'structures')

    data = []
    # Group by the scalar key rather than a one-element list: pandas >= 2.0
    # yields 1-tuples as group labels for a list of length one, which would
    # store ('id',) instead of 'id' in the mol_id column.
    for mol_id, mol_df in data_df.groupby('structure'):
        pos = mol_df[['x', 'y', 'z']].astype(np.float32)
        max_dist = util.get_max_distance_from_center(
            pos, util.get_center(pos))
        num_atoms = mol_df.shape[0]
        data.append((mol_id, max_dist, num_atoms))

    df = pd.DataFrame(data, columns=['mol_id', 'max_dist', 'num_atoms'])
    df = df.sort_values(by=['max_dist', 'num_atoms'],
                        ascending=[False, False]).reset_index(drop=True)
    print(df.describe())
    # Percentage of molecules that fit within a distance of 7.5
    print(df[df.max_dist < 7.5].shape[0] * 100.0 / df.shape[0])
    return df
def get_data_stats(sharded_list):
    """
    Get the furthest distance from the structure's center and the max
    residue ID for every (ensemble, subunit) group in the sharded datasets.

    Prints summary statistics, the percentage of rows with max_dist < 50,
    and the percentage of unique targets that have at least one row with
    max_dist < 50.

    Returns a DataFrame with columns
    ['target', 'decoy', 'max_dist', 'max_res'], sorted by max_dist and
    then max_res, both descending.
    """
    coord_cols = ['x', 'y', 'z']
    rows = []
    for sharded in sharded_list:
        for _, shard_df in sharded.iter_shards():
            grouped = shard_df.groupby(['ensemble', 'subunit'])
            for (target, decoy), struct_df in grouped:
                xyz = struct_df[coord_cols].astype(np.float32)
                center = util.get_center(xyz)
                furthest = util.get_max_distance_from_center(xyz, center)
                highest_res = struct_df.residue.max()
                rows.append((target, decoy, furthest, highest_res))

    stats = pd.DataFrame(
        rows, columns=['target', 'decoy', 'max_dist', 'max_res'])
    stats = stats.sort_values(
        by=['max_dist', 'max_res'], ascending=[False, False])
    stats = stats.reset_index(drop=True)

    print(stats.describe())
    within = stats[stats.max_dist < 50]
    # Percentage of rows, then of unique targets, within a distance of 50
    print(within.shape[0] * 100.0 / stats.shape[0])
    print(within.target.unique().shape[0] * 100.0
          / float(stats.target.unique().shape[0]))
    return stats
def get_data_stats(sharded_list):
    """
    Get the furthest distance from the protein's center and max residue ID
    for every protein in the sharded datasets.

    Each row also records the index of the dataset within <sharded_list>
    it came from. Prints summary statistics, the percentage of rows with
    max_dist < 90, and the percentage of unique targets that have at least
    one row with max_dist < 90.

    Returns a DataFrame with columns
    ['sharded', 'target', 'decoy', 'max_dist', 'max_res'], sorted by
    sharded (ascending), then max_dist and max_res (descending).
    """
    coord_cols = ['x', 'y', 'z']
    columns = ['sharded', 'target', 'decoy', 'max_dist', 'max_res']

    rows = []
    for dataset_idx, sharded in enumerate(sharded_list):
        for _, shard_df in sharded.iter_shards():
            grouped = shard_df.groupby(['ensemble', 'subunit'])
            for (target, decoy), struct_df in grouped:
                xyz = struct_df[coord_cols].astype(np.float32)
                center = util.get_center(xyz)
                furthest = util.get_max_distance_from_center(xyz, center)
                highest_res = struct_df.residue.max()
                rows.append(
                    (dataset_idx, target, decoy, furthest, highest_res))

    stats = pd.DataFrame(rows, columns=columns)
    stats = stats.sort_values(
        by=['sharded', 'max_dist', 'max_res'],
        ascending=[True, False, False])
    stats = stats.reset_index(drop=True)

    print(stats.describe())
    within = stats[stats.max_dist < 90]
    # Percentage of rows, then of unique targets, within a distance of 90
    print(within.shape[0] * 100.0 / stats.shape[0])
    print(within.target.unique().shape[0] * 100.0
          / float(stats.target.unique().shape[0]))
    return stats
def df_to_feature(struct_df, grid_config, center_around_Cs, random_seed=None):
    """
    Build the grid for <struct_df>, centered on its ligand.

    The ligand is taken to be the rows with chain 'L'. Only a pruned subset
    of atoms contributes to the center: carbon atoms if <center_around_Cs>
    is set, otherwise atoms whose element is a key of
    <grid_config.element_mapping>. The grid itself is generated from the
    full, unpruned <struct_df> with a rotation matrix derived from
    <grid_config> and <random_seed>.
    """
    # Consider only atoms that have mapping for computing center.
    # If <center_around_Cs> is set, consider only carbon atoms.
    if center_around_Cs:
        pruned_struct_df = struct_df[struct_df.element == 'C']
    else:
        pruned_struct_df = struct_df[struct_df.element.isin(
            grid_config.element_mapping.keys())]

    # Use center of ligand for subgrid center
    ligand_pos = pruned_struct_df[pruned_struct_df.chain == 'L'][[
        'x', 'y', 'z'
    ]].astype(np.float32)
    ligand_center = util.get_center(ligand_pos)

    rot_mat = subgrid_gen.gen_rot_matrix(grid_config, random_seed=random_seed)
    grid = subgrid_gen.get_grid(struct_df, ligand_center, config=grid_config,
                                rot_mat=rot_mat)
    return grid