def _get_all_chain_pairs(complex, df, nb_fn, filename, full):
    """
    Get all possible chain pairs from provided dataframe.

    The dataframe is split into one group per (chain, model) combination and
    every unordered combination of two groups is passed to ``nb_fn``.  A Pair
    is built for each combination for which ``nb_fn`` reports at least one
    neighboring residue.

    Args:
        complex: object whose ``name`` attribute labels every produced Pair.
        df: dataframe with (at least) ``chain`` and ``model`` columns.
        nb_fn: callable ``nb_fn(df0, df1) -> (res0, res1)`` returning the
            neighboring residues found on each side.
        filename: source recorded as both ``src0`` and ``src1`` of each Pair.
        full: forwarded unchanged to ``_get_positions``.

    Returns:
        Tuple ``(pairs, num_chains)``: the list of Pair objects (ids numbered
        consecutively from 0 in discovery order) and the number of
        (chain, model) groups found in ``df``.
    """
    # We reset the index here so each chain's dataframe can be treated
    # independently.
    groups = [(key, chain_df.reset_index(drop=True))
              for key, chain_df in df.groupby(['chain', 'model'])]
    num_chains = len(groups)

    pairs = []
    for i, (_, df0) in enumerate(groups):
        # Only look at groups after the current one so that each unordered
        # combination is visited exactly once.
        for _, df1 in groups[i + 1:]:
            res0, res1 = nb_fn(df0, df1)
            # Use len() rather than truthiness: nb_fn's return type is not
            # visible here and may be a DataFrame/array, whose truth value is
            # ambiguous.
            if len(res0) == 0:
                # No neighbors between these 2 chains.
                continue
            pos0 = struct.get_ca_pos_from_residues(df0, res0)
            pos1 = struct.get_ca_pos_from_residues(df1, res1)
            pos_idx, neg_idx = _get_positions(df0, pos0, df1, pos1, full)
            srcs = {'src0': filename, 'src1': filename}
            # The id of a pair is its position in the output list.
            pairs.append(Pair(complex=complex.name, df0=df0, df1=df1,
                              pos_idx=pos_idx, neg_idx=neg_idx, srcs=srcs,
                              id=len(pairs)))
    return pairs, num_chains
def _get_db5_pairs(complex, unbound, nb_fn, full):
    """
    Get pairs for docking benchmark 5 type complex.

    For this type of complex, we assume that each file is its own entity,
    and that there is essentially one pair for each complex, with one side
    being all the chains of the ligand, and the other all the chains of the
    receptor.

    Args:
        complex: object providing ``bound_filenames`` and
            ``unbound_filenames`` (each a 2-tuple of pickle paths, ligand
            first) and ``name``.
        unbound: if true, neighbors are still computed on the bound
            structures but the returned dataframes, positions and sources
            come from the unbound files.
        nb_fn: callable ``nb_fn(ldf, rdf) -> (lres, rres)`` returning the
            neighboring residues of each side.  The returned objects have
            their ``pdb_name`` column rewritten in place when ``unbound``
            is true.
        full: forwarded unchanged to ``_get_positions``.

    Returns:
        Tuple ``([pair], 2)``: a single-element list holding the Pair for
        this complex, and the constant 2.
    """
    (lb, rb) = complex.bound_filenames
    (lu, ru) = complex.unbound_filenames
    lb_df = pd.read_pickle(lb)
    rb_df = pd.read_pickle(rb)
    # Always use bound to get neighbors...
    lres, rres = nb_fn(lb_df, rb_df)
    if unbound:
        # ...but if unbound, we then use the actual atoms from unbound.
        ldf, rdf = pd.read_pickle(lu), pd.read_pickle(ru)

        # Convert residues' pdb_names to unbound.  The name arrays are built
        # once up front (``.values`` replaces ``as_matrix()``, which was
        # removed in pandas 1.0) instead of once per mapped element.
        l_names = ldf['pdb_name'].values
        r_names = rdf['pdb_name'].values
        lres['pdb_name'] = lres['pdb_name'].map(
            lambda x: ca.find_of_type(x, l_names, None, False, style='db5'))
        rres['pdb_name'] = rres['pdb_name'].map(
            lambda x: ca.find_of_type(x, r_names, None, False, style='db5'))

        # Remove residues that we cannot map from bound structure to unbound.
        key_cols = ['pdb_name', 'model', 'chain', 'residue']
        rres_index = rres[key_cols]
        lres_index = lres[key_cols]
        rdf_index = rdf[key_cols]
        ldf_index = ldf[key_cols]
        # A neighbor row is "gone" when no row of the unbound dataframe
        # matches it on all four key columns.
        rgone = [i for i, x in rres_index.iterrows()
                 if not (np.array(x) == rdf_index).all(1).any()]
        lgone = [i for i, x in lres_index.iterrows()
                 if not (np.array(x) == ldf_index).all(1).any()]
        # The same index labels are dropped from BOTH sides, so that the two
        # neighbor tables stay row-aligned.
        gone = list(set(lgone).union(set(rgone)))

        if len(gone) > 0:
            logging.warning(
                "Dropping {:}/{:} residues from {:} that didn't map "
                "to unbound from bound."
                .format(len(gone), len(lres), complex.name))
            lres = lres.drop(gone)
            rres = rres.drop(gone)

        lsrc, rsrc = lu, ru
    else:
        ldf, rdf = lb_df, rb_df
        lsrc, rsrc = lb, rb

    lpos = struct.get_ca_pos_from_residues(ldf, lres)
    rpos = struct.get_ca_pos_from_residues(rdf, rres)
    pos_idx, neg_idx = _get_positions(ldf, lpos, rdf, rpos, full)
    srcs = {'src0': lsrc, 'src1': rsrc}
    pair = Pair(complex=complex.name, df0=ldf, df1=rdf, pos_idx=pos_idx,
                neg_idx=neg_idx, srcs=srcs, id=0)
    return [pair], 2
def _get_evcoupling_pairs(complex, unbound, nb_fn, full):
    """
    Get pairs for EVCoupling type complex.

    Exactly one Pair is built per complex: the first bound file supplies one
    side and the second bound file supplies the other.  Neighbors are always
    computed on the bound structures.

    Args:
        complex: object providing ``bound_filenames`` (a 2-tuple of pickle
            paths) and ``name``.
        unbound: accepted for signature parity with the other pair getters;
            it is not read by this function.
        nb_fn: callable ``nb_fn(df0, df1) -> (res0, res1)`` returning the
            neighboring residues of each side.
        full: forwarded unchanged to ``_get_residue_positions``.

    Returns:
        Tuple ``([pair], 2)``: a single-element list holding the Pair (with
        an empty ``sequences`` dict) and the constant 2.
    """
    first_file, second_file = complex.bound_filenames
    first_df = pd.read_pickle(first_file)
    second_df = pd.read_pickle(second_file)

    # Always use bound to get neighbors.
    first_res, second_res = nb_fn(first_df, second_df)

    # NOTE(review): the sibling pair getters call
    # struct.get_ca_pos_from_residues and _get_positions; confirm that the
    # unqualified get_ca_pos_from_residues and _get_residue_positions used
    # below are really in scope in this module.
    first_pos = get_ca_pos_from_residues(first_df, first_res)
    second_pos = get_ca_pos_from_residues(second_df, second_res)
    pos_idx, neg_idx = _get_residue_positions(
        first_df, first_pos, second_df, second_pos, full)

    pair = Pair(
        complex=complex.name,
        df0=first_df,
        df1=second_df,
        pos_idx=pos_idx,
        neg_idx=neg_idx,
        srcs={'src0': first_file, 'src1': second_file},
        id=0,
        sequences={},
    )
    return [pair], 2
def _insert_dssp_columns(ca_df, dssp_dict):
    """Insert 'ss_value' (position 5) and 'rsa_value' (position 6) columns into ca_df in place."""
    residue_keys = list(zip(ca_df['chain'], ca_df['residue'], ca_df['resname']))
    # One value per row is appended.  (Extending the list with ``+=`` would
    # split a string value into its characters and only line up with the
    # rows when every value happens to be exactly one character long.)
    ss_values = [
        get_dssp_value_for_residue(dssp_dict, 'SS', chain, int(residue), resname)
        for chain, residue, resname in residue_keys
    ]
    rsa_values = [
        get_dssp_value_for_residue(dssp_dict, 'RSA', chain, int(residue), resname)
        for chain, residue, resname in residue_keys
    ]
    ca_df.insert(5, 'ss_value', ss_values, allow_duplicates=False)
    ca_df.insert(6, 'rsa_value', rsa_values, allow_duplicates=False)


def postprocess_pruned_pair(raw_pdb_filename, original_pair, neighbor_def, cutoff):
    """
    Construct a new Pair consisting of the carbon alpha (CA) atoms of
    structures with DSSP-derivable features and append DSSP secondary
    structure (SS) features to each protein structure dataframe as well.

    Args:
        raw_pdb_filename: PDB file handed to ``get_dssp_dict_for_pdb_file``.
        original_pair: Pair whose ``df0``/``df1`` are filtered to CA atoms
            and whose ``complex``, ``srcs`` and ``id`` are carried over.
        neighbor_def: forwarded to ``nb.build_get_neighbors``.
        cutoff: forwarded to ``nb.build_get_neighbors``.

    Returns:
        Tuple ``(pair, df0, df1)``: the rebuilt Pair and its two CA-only
        dataframes, each carrying added ``ss_value`` and ``rsa_value``
        columns.
    """
    # Extract secondary structure (SS) and accessible surface area (ASA)
    # values for the PDB file using DSSP.
    dssp_dict = get_dssp_dict_for_pdb_file(raw_pdb_filename)

    # Keep only the CA atoms of each side.  Boolean-mask indexing returns a
    # new dataframe, so the columns inserted below do not touch
    # original_pair's own dataframes.
    df0 = original_pair.df0[original_pair.df0['atom_name'] == 'CA']
    df1 = original_pair.df1[original_pair.df1['atom_name'] == 'CA']

    # Add SS and RSA values to the atoms in both dataframes of the pair.
    _insert_dssp_columns(df0, dssp_dict)
    _insert_dssp_columns(df1, dssp_dict)

    # Calculate the region of a given protein interface by deriving
    # neighboring atoms in a protein complex along with their respective
    # coordinates.
    get_neighbors = nb.build_get_neighbors(neighbor_def, cutoff)
    res0, res1 = get_neighbors(df0, df1)
    pos0 = struct.get_ca_pos_from_residues(df0, res0)
    pos1 = struct.get_ca_pos_from_residues(df1, res1)
    pos_idx, neg_idx = pa._get_positions(df0, pos0, df1, pos1, False)

    # Reconstruct a Pair representing a complex of interacting proteins
    pair = Pair(complex=original_pair.complex, df0=df0, df1=df1,
                pos_idx=pos_idx, neg_idx=neg_idx, srcs=original_pair.srcs,
                id=original_pair.id)
    return pair, df0, df1