示例#1
0
def _get_all_chain_pairs(complex, df, nb_fn, filename, full):
    """
    Get all possible chain pairs from provided dataframe.

    The dataframe is split into one group per (chain, model) combination and
    every unordered pair of groups is handed to ``nb_fn``.  Combinations for
    which ``nb_fn`` reports no neighbors are skipped; every other combination
    produces one ``Pair``.

    Args:
        complex: Object whose ``name`` attribute is stored on each Pair.
        df: Dataframe with at least ``chain`` and ``model`` columns.
        nb_fn: Callable taking two group dataframes and returning a
            ``(res0, res1)`` tuple of neighboring residues.
        filename: Recorded as the source of both sides of every pair.
        full: Forwarded unchanged to ``_get_positions``.

    Returns:
        Tuple ``(pairs, num_chains)`` where ``pairs`` is the list of Pairs
        (ids numbered consecutively from 0 in creation order) and
        ``num_chains`` is the number of (chain, model) groups found in ``df``.
    """
    # We reset the index here so each chain's dataframe can be treated
    # independently.
    chain_dfs = [group.reset_index(drop=True)
                 for _, group in df.groupby(['chain', 'model'])]
    num_chains = len(chain_dfs)

    pairs = []
    for i, df0 in enumerate(chain_dfs):
        for df1 in chain_dfs[i + 1:]:
            res0, res1 = nb_fn(df0, df1)
            # Explicit length check: res0 may be a dataframe, whose
            # truthiness is ambiguous.
            if len(res0) == 0:
                # No neighbors between these 2 chains.
                continue
            pos0 = struct.get_ca_pos_from_residues(df0, res0)
            pos1 = struct.get_ca_pos_from_residues(df1, res1)
            pos_idx, neg_idx = _get_positions(df0, pos0, df1, pos1, full)
            srcs = {'src0': filename, 'src1': filename}
            # The id is the running count of pairs kept so far.
            pair = Pair(complex=complex.name, df0=df0, df1=df1,
                        pos_idx=pos_idx, neg_idx=neg_idx, srcs=srcs,
                        id=len(pairs))
            pairs.append(pair)
    return pairs, num_chains
示例#2
0
def _get_db5_pairs(complex, unbound, nb_fn, full):
    """
    Get pairs for docking benchmark 5 type complex.

    For this type of complex, we assume that each file is its own entity,
    and that there is essentially one pair for each complex, with one side
    being all the chains of the ligand, and the other all the chains of the
    receptor.

    Args:
        complex: Object providing ``name``, ``bound_filenames`` and
            ``unbound_filenames`` (each a 2-tuple of pickle paths).
        unbound: If truthy, neighbors are still computed on the bound
            structures but the returned pair uses the unbound dataframes.
        nb_fn: Callable taking the two bound dataframes and returning a
            ``(lres, rres)`` tuple of neighboring residues.
        full: Forwarded unchanged to ``_get_positions``.

    Returns:
        Tuple ``([pair], 2)``: a single-element list holding the Pair, and
        the constant 2.
    """
    (lb, rb) = complex.bound_filenames
    (lu, ru) = complex.unbound_filenames
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    lb_df = pd.read_pickle(lb)
    rb_df = pd.read_pickle(rb)
    # Always use bound to get neighbors...
    lres, rres = nb_fn(lb_df, rb_df)
    if unbound:
        # ...but if unbound, we then use the actual atoms from unbound.
        ldf, rdf = pd.read_pickle(lu), pd.read_pickle(ru)

        # Convert residues' pdb_names to unbound.  The name arrays are
        # extracted once rather than per mapped element; ``.values`` is used
        # because ``as_matrix()`` no longer exists in pandas >= 1.0.
        l_names = ldf['pdb_name'].values
        r_names = rdf['pdb_name'].values
        lres['pdb_name'] = lres['pdb_name'].map(
            lambda x: ca.find_of_type(
                x, l_names, None, False, style='db5'))
        rres['pdb_name'] = rres['pdb_name'].map(
            lambda x: ca.find_of_type(
                x, r_names, None, False, style='db5'))

        # Remove residues that we cannot map from bound structure to unbound.
        key_cols = ['pdb_name', 'model', 'chain', 'residue']
        rres_index = rres[key_cols]
        lres_index = lres[key_cols]
        rdf_index = rdf[key_cols]
        ldf_index = ldf[key_cols]
        # A residue is "gone" when no row of the unbound dataframe matches
        # it on every key column.
        rgone = [i for i, x in rres_index.iterrows()
                 if not (np.array(x) == rdf_index).all(1).any()]
        lgone = [i for i, x in lres_index.iterrows()
                 if not (np.array(x) == ldf_index).all(1).any()]
        # Drop the same index labels from both sides so they stay aligned.
        gone = list(set(lgone) | set(rgone))
        if gone:
            logging.warning(
                "Dropping {:}/{:} residues from {:} that didn't map "
                "to unbound from bound."
                .format(len(gone), len(lres), complex.name))
            lres = lres.drop(gone)
            rres = rres.drop(gone)

        lsrc, rsrc = lu, ru
    else:
        ldf, rdf = lb_df, rb_df
        lsrc, rsrc = lb, rb
    lpos = struct.get_ca_pos_from_residues(ldf, lres)
    rpos = struct.get_ca_pos_from_residues(rdf, rres)
    pos_idx, neg_idx = _get_positions(ldf, lpos, rdf, rpos, full)
    srcs = {'src0': lsrc, 'src1': rsrc}
    pair = Pair(complex=complex.name, df0=ldf, df1=rdf, pos_idx=pos_idx,
                neg_idx=neg_idx, srcs=srcs, id=0)
    return [pair], 2
示例#3
0
def _get_evcoupling_pairs(complex, unbound, nb_fn, full):
    """
    Get pairs for EVCoupling type complex.

    Loads the two bound dataframes of the complex, computes neighboring
    residues between them with ``nb_fn`` and wraps everything into a single
    Pair.

    Args:
        complex: Object providing ``name`` and ``bound_filenames`` (a
            2-tuple of pickle paths).
        unbound: Accepted for signature compatibility; not used here, the
            bound structures are always used.
        nb_fn: Callable taking the two dataframes and returning a
            ``(lres, rres)`` tuple of neighboring residues.
        full: Forwarded unchanged to ``_get_residue_positions``.

    Returns:
        Tuple ``([pair], 2)``: a single-element list holding the Pair, and
        the constant 2.
    """
    left_path, right_path = complex.bound_filenames
    left_df = pd.read_pickle(left_path)
    right_df = pd.read_pickle(right_path)

    # Neighbors are always derived from the bound structures.
    left_res, right_res = nb_fn(left_df, right_df)
    left_pos = get_ca_pos_from_residues(left_df, left_res)
    right_pos = get_ca_pos_from_residues(right_df, right_res)
    pos_idx, neg_idx = _get_residue_positions(
        left_df, left_pos, right_df, right_pos, full)

    sources = {'src0': left_path, 'src1': right_path}
    pair = Pair(
        complex=complex.name,
        df0=left_df,
        df1=right_df,
        pos_idx=pos_idx,
        neg_idx=neg_idx,
        srcs=sources,
        id=0,
        sequences={},
    )
    return [pair], 2
示例#4
0
def _with_dssp_features(df, dssp_dict):
    """Return the CA rows of ``df`` with ``ss_value``/``rsa_value`` inserted."""
    ca_atoms = df[df['atom_name'] == 'CA']
    residues = list(zip(ca_atoms['chain'], ca_atoms['residue'],
                        ca_atoms['resname']))
    # One value per CA row.  Values are appended as whole items (never
    # extended element-wise), so a multi-character or non-iterable value
    # cannot desynchronise the column length from the row count.
    ss_values = [
        get_dssp_value_for_residue(dssp_dict, 'SS', chain, int(residue),
                                   resname)
        for chain, residue, resname in residues]
    rsa_values = [
        get_dssp_value_for_residue(dssp_dict, 'RSA', chain, int(residue),
                                   resname)
        for chain, residue, resname in residues]
    ca_atoms.insert(5, 'ss_value', ss_values, False)
    ca_atoms.insert(6, 'rsa_value', rsa_values, False)
    return ca_atoms


def postprocess_pruned_pair(raw_pdb_filename, original_pair, neighbor_def,
                            cutoff):
    """
    Construct a new Pair consisting of the
    carbon alpha (CA) atoms of structures
    with DSSP-derivable features and append
    DSSP secondary structure (SS) features
    to each protein structure dataframe as well.

    Args:
        raw_pdb_filename: Path passed to ``get_dssp_dict_for_pdb_file``.
        original_pair: Pair whose ``df0``/``df1`` dataframes, ``complex``,
            ``srcs`` and ``id`` are reused for the new Pair.
        neighbor_def: Forwarded to ``nb.build_get_neighbors``.
        cutoff: Forwarded to ``nb.build_get_neighbors``.

    Returns:
        Tuple ``(pair, df0, df1)``: the rebuilt Pair and the two CA-only
        dataframes (with ``ss_value`` and ``rsa_value`` columns) it holds.
    """
    # Extract secondary structure (SS) and accessible surface area (ASA)
    # values for the PDB file using DSSP.
    dssp_dict = get_dssp_dict_for_pdb_file(raw_pdb_filename)

    # Add SS and RSA values to the CA atoms of each dataframe in the pair.
    df0 = _with_dssp_features(original_pair.df0, dssp_dict)
    df1 = _with_dssp_features(original_pair.df1, dssp_dict)

    # Calculate the region of a given protein interface by deriving
    # neighboring atoms in a protein complex along with their respective
    # coordinates.
    get_neighbors = nb.build_get_neighbors(neighbor_def, cutoff)
    res0, res1 = get_neighbors(df0, df1)
    pos0 = struct.get_ca_pos_from_residues(df0, res0)
    pos1 = struct.get_ca_pos_from_residues(df1, res1)
    pos_idx, neg_idx = pa._get_positions(df0, pos0, df1, pos1, False)

    # Reconstruct a Pair representing a complex of interacting proteins
    pair = Pair(complex=original_pair.complex,
                df0=df0,
                df1=df1,
                pos_idx=pos_idx,
                neg_idx=neg_idx,
                srcs=original_pair.srcs,
                id=original_pair.id)
    return pair, df0, df1