ku_dna_file = biotite.temp_file("ku_dna.cif") ku_file = biotite.temp_file("ku.cif") # Download and parse structure files file = rcsb.fetch("1JEY", "mmtf", biotite.temp_dir()) ku_dna = strucio.load_structure(file) file = rcsb.fetch("1JEQ", "mmtf", biotite.temp_dir()) ku = strucio.load_structure(file) # Remove DNA and water ku_dna = ku_dna[(ku_dna.chain_id == "A") | (ku_dna.chain_id == "B")] ku_dna = ku_dna[~struc.filter_solvent(ku_dna)] ku = ku[~struc.filter_solvent(ku)] # The structures have a differing amount of atoms missing # at the the start and end of the structure # -> Find common structure ku_dna_common = ku_dna[struc.filter_intersection(ku_dna, ku)] ku_common = ku[struc.filter_intersection(ku, ku_dna)] # Superimpose ku_superimposed, transformation = struc.superimpose( ku_dna_common, ku_common, (ku_common.atom_name == "CA")) # We do not want the cropped structures # -> apply superimposition on structures before intersection filtering ku_superimposed = struc.superimpose_apply(ku, transformation) # Write PDBx files as input for PyMOL cif_file = pdbx.PDBxFile() pdbx.set_structure(cif_file, ku_dna, data_block="ku_dna") cif_file.write(ku_dna_file) cif_file = pdbx.PDBxFile() pdbx.set_structure(cif_file, ku_superimposed, data_block="ku") cif_file.write(ku_file) # Visualization with PyMOL...
def extractInterface(dimer, c1, c2, vdwC, minC, nearC, mem): """ This function extracts interfaces from provided dimers, by chain ids. """ if os.path.exists("cache.hdf5"): os.remove("cache.hdf5") vdw_dict = {'ALA': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.165, 'C': 1.87, 'O': 1.55}, 'ARG': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.235, 'CD': 2.235, 'NE': 1.83, 'HE': 0.8, 'CZ': 1.87, 'NH1': 1.83, 'HH11': 0.6, 'HH12': 0.6, 'NH2': 1.83, 'HH21': 0.6, 'HH22': 0.6, 'C': 1.87, 'O': 1.55}, 'ARGN': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.235, 'CD': 2.235, 'NE': 1.83, 'HE': 0.8, 'CZ': 1.87, 'NH1': 1.83, 'HH11': 0.6, 'HH12': 0.6, 'NH2': 1.83, 'HH21': 0.6, 'C': 1.87, 'O': 1.55}, 'ASN': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 1.87, 'OD1': 1.55, 'ND2': 1.83, 'HD21': 0.8, 'HD22': 0.8, 'C': 1.87, 'O': 1.55, 'AD1': 1.55, 'AD2': 1.83}, 'ASP': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 1.87, 'OD1': 1.66, 'OD2': 1.66, 'C': 1.87, 'O': 1.55}, 'ASPH': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 1.87, 'OD1': 1.52, 'OD2': 1.55, 'HD': 0.8, 'C': 1.87, 'O': 1.55}, 'CYS': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'SG': 1.89, 'C': 1.87, 'O': 1.55}, 'GLN': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.235, 'CD': 1.87, 'OE1': 1.55, 'NE2': 1.83, 'HE21': 0.8, 'HE22': 0.8, 'C': 1.87, 'O': 1.55, 'AE1': 1.55, 'AE2': 1.83}, 'GLU': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.235, 'CD': 1.87, 'OE1': 1.66, 'OE2': 1.66, 'C': 1.87, 'O': 1.55}, 'GLUH': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.235, 'CD': 1.87, 'OE1': 1.52, 'OE2': 1.55, 'HE': 0.8, 'C': 1.87, 'O': 1.55}, 'GLY': {'N': 1.83, 'H': 0.8, 'CA': 2.235, 'C': 1.87, 'O': 1.55}, 'HIS': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.04, 'ND1': 1.72, 'HD1': 0.8, 'CD2': 2.1, 'NE2': 1.72, 'CE1': 2.1, 'C': 1.87, 'O': 1.55, 'AE1': 2.1, 'AE2': 1.72}, 'ILE': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.265, 'CG2': 2.165, 'CG1': 2.235, 'CD1': 2.165, 'C': 1.87, 'O': 1.55, 'CD': 2.165}, 'LEU': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.265, 'CD1': 2.165, 'CD2': 2.165, 'C': 1.87, 'O': 1.55}, 'LYS': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.235, 'CD': 2.235, 'CE': 2.235, 'NZ': 1.65, 'HZ': 0.6, 'HZ1': 0.6, 'HZ2': 0.6, 'HZ3': 0.6, 'C': 1.87, 'O': 1.55}, 'LYSN': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.235, 'CD': 2.235, 'CE': 2.235, 'NZ': 1.65, 'HZ1': 0.8, 'HZ2': 0.8, 'C': 1.87, 'O': 1.55}, 'MET': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.235, 'SD': 1.97, 'CE': 2.165, 'C': 1.87, 'O': 1.55}, 'PHE': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.04, 'CD1': 1.99, 'CD2': 1.99, 'CE1': 1.99, 'CE2': 1.99, 'CZ': 1.99, 'C': 1.87, 'O': 1.55}, 'PRO': {'N': 1.83, 'CD': 2.235, 'CA': 2.265, 'CB': 2.235, 'CG': 2.235, 'C': 1.87, 'O': 1.55}, 'SER': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'OG': 1.55, 'HG': 0.76, 'C': 1.87, 'O': 1.55}, 'THR': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.265, 'OG1': 1.55, 'HG1': 0.76, 'CG2': 2.165, 'C': 1.87, 'O': 1.55}, 'TRP': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.04, 'CD2': 2.04, 'CE2': 2.04, 'CE3': 1.99, 'CD1': 2.1, 'NE1': 1.72, 'HE1': 0.8, 'CZ2': 1.99, 'CZ3': 1.99, 'CH2': 1.99, 'C': 1.87, 'O': 1.55}, 'TYR': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.04, 'CD1': 1.99, 'CE1': 1.99, 'CD2': 1.99, 'CE2': 1.99, 'CZ': 2.04, 'OH': 1.55, 'HH': 0.76, 'C': 1.87, 'O': 1.55}, 'VAL': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.265, 'CG1': 2.165, 'CG2': 2.165, 'C': 1.87, 'O': 1.55}, 'HSC': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.04, 'CD2': 2.1, 'ND1': 1.72, 'HD1': 0.8, 'CE1': 2.1, 'NE2': 1.72, 'HE2': 0.8, 'C': 1.87, 'O': 1.55}, 'HSD': {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.235, 'CG': 2.04, 'ND1': 1.72, 'CE1': 2.1, 'CD2': 2.1, 'NE2': 1.72, 'HE2': 0.8, 'C': 1.87, 'O': 1.55}, 'ACE': {'C': 1.87, 'O': 1.55, 'CH3': 2.165}} default_dict = {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.265, 'C': 1.87, 'O': 1.55, 'CG': 2.265, 'CD': 2.235, 'NE': 1.83, 'HE': 0.8, 'C1': 1.87, 'C2': 1.87, 'C3': 1.87, 'CZ': 2.04, 'NH1': 1.83, 'H1': 0.8, 'H2': 0.8, 'H3': 0.8, 'H11': 0.8, 'H12': 0.8, 'H31': 0.8, 'H32': 0.8, 'HA' : 0.8, 'HA2' : 0.8, 'HA3' : 0.8, 'HB' : 0.8, 'HB1' : 0.8, 'HB2' : 0.8, 'HB3' : 0.8, 'HH2': 0.6, 'HH11': 0.6, 'HH12': 0.6, 'NH2': 1.83, 'HH21': 0.6, 'HH22': 0.6, 'OD1': 1.66, 'ND2': 1.83, 'HD21': 0.8, 'HD22': 0.8, 'AD1': 1.55, 'AD2': 1.83, 'OD2': 1.66, 'HD': 0.8, 'SG': 1.89, 'OE1': 1.66, 'NE2': 1.83, 'HE2': 0.8, 'HE3': 0.8, 'HE21': 0.8, 'HE22': 0.8, 'AE1': 2.1, 'AE2': 1.83, 'OE2': 1.66, 'ND1': 1.72, 'HD1': 0.8, 'HD2': 0.8, 'HD3': 0.8, 'HD11': 0.8, 'HD12': 0.8, 'HD13': 0.8, 'HD23': 0.8, 'CD2': 2.165, 'CE1': 2.1, 'CG2': 2.165, 'CG1': 2.235, 'CD1': 2.165, 'CE': 2.235, 'NZ': 1.65, 'HZ': 0.6, 'HZ1': 0.8, 'HZ2': 0.8, 'HZ3': 0.6, 'SD': 1.97, 'CE2': 2.04, 'OG': 1.55, 'HG': 0.76, 'OG1': 1.55, 'HG1': 0.76, 'HG2': 0.76, 'HG3': 0.76, 'HG11': 0.76, 'HG12': 0.76, 'HG13': 0.76, 'HG21': 0.76, 'HG22': 0.76, 'HG23': 0.76, 'CE3': 1.99, 'NE1': 1.72, 'HE1': 0.8, 'HO1': 0.8, 'HO2': 0.8, 'HO3': 0.8, 'CZ2': 1.99, 'CZ3': 1.99, 'CH2': 1.99, 'OH': 1.55, 'OXT': 1.55, 'O1': 1.55, 'O2': 1.55, 'O3': 1.55, 'HH': 0.76, 'HE2': 0.8, 'CH3': 2.165} no_digit_dict = {'N': 1.83, 'H': 0.8, 'CA': 2.265, 'CB': 2.265, 'C': 1.87, 'O': 1.55, 'CG': 2.265, 'CD': 2.235, 'NE': 1.83, 'HE': 0.8, 'CZ': 2.04, 'NH': 1.83, 'HA': 0.8, 'HB': 0.8, 'HH': 0.76, 'OD': 1.66, 'ND': 1.83, 'HD': 0.8, 'AD': 1.83, 'SG': 1.89, 'OE': 1.66, 'AE': 2.1, 'CE': 2.235, 'NZ': 1.65, 'HZ': 0.8, 'SD': 1.97, 'OG': 1.55, 'HG': 0.76, 'HO': 0.8, 'CH': 2.165, 'OH': 1.55, 'OXT': 1.55, 'D': 0.8, 'DZ': 0.8, 'HXT': 0.8, 'HAB': 0.8} chain_1 = dimer[dimer.chain_id == c1] chain_2 = dimer[dimer.chain_id == c2] chain_1_coord = chain_1.coord chain_2_coord = chain_2.coord try: chain_1_vdw = [float(vdw_dict[chain_1[i].res_name][chain_1[i].atom_name]) for i in range(len(chain_1))] except: chain_1_vdw = [] for i in range(len(chain_1)): if chain_1[i].res_name in vdw_dict and chain_1[i].atom_name in vdw_dict[chain_1[i].res_name]: chain_1_vdw.append(float(vdw_dict[chain_1[i].res_name][chain_1[i].atom_name])) elif chain_1[i].atom_name in default_dict: chain_1_vdw.append(float(default_dict[chain_1[i].atom_name])) else: chain_1_vdw.append(float(no_digit_dict[chain_1[''.join([j for j in chain_1[i].atom_name if not j.isdigit()])].atom_name])) try: chain_2_vdw = [float(vdw_dict[chain_2[i].res_name][chain_2[i].atom_name]) for i in range(len(chain_2))] except: chain_2_vdw = [] for i in range(len(chain_2)): if chain_2[i].res_name in vdw_dict and chain_2[i].atom_name in vdw_dict[chain_2[i].res_name]: chain_2_vdw.append(float(vdw_dict[chain_2[i].res_name][chain_2[i].atom_name])) elif chain_2[i].atom_name in default_dict: chain_2_vdw.append(float(default_dict[chain_2[i].atom_name])) else: chain_2_vdw.append(float(no_digit_dict[chain_2[''.join([j for j in chain_1[i].atom_name if not j.isdigit()])].atom_name])) if len(chain_1) * len(chain_2) * 8 > mem * (1024**3)/3: hdf5_file = True hdf5_store = h5py.File("cache.hdf5", "a") critdist_array = hdf5_store.create_dataset("critdist_array", (len(chain_1), len(chain_2))) n = len(chain_2) chunks = 10 chunk_size = [[i * int(n / chunks), (i + 1) * int(n / chunks)] if i + 1 != chunks else [i * int(n / chunks), n] for i in range(chunks)] for start, end in chunk_size: critdist_array[:, start:end] = np.array(chain_2_vdw[start:end]) + np.array(chain_1_vdw).reshape((-1, 1)) + vdwC realdist_array = hdf5_store.create_dataset('realdist_array', data=cdist(chain_1_coord, chain_2_coord, 'euclidean')) contacting_bool = np.zeros(realdist_array.shape, dtype=bool) n = len(realdist_array) chunks = 500 chunk_size = [[i * int(n / chunks), (i + 1) * int(n / chunks)] if i + 1 != chunks else [i * int(n / chunks), n] for i in range(chunks)] for start, end in chunk_size: contacting_bool[start:end, :] = realdist_array[start:end] <= critdist_array[start:end] contacting_1 = np.unique(chain_1[np.any(contacting_bool, axis=1)].res_id) contacting_2 = np.unique(chain_2[np.any(contacting_bool, axis=0)].res_id) else: hdf5_file = False realdist_array = cdist(chain_1_coord, chain_2_coord, 'euclidean') critdist_array = np.array(chain_2_vdw) + np.array(chain_1_vdw).reshape((-1, 1)) + vdwC contacting_1 = np.unique(chain_1[np.any(realdist_array <= critdist_array, axis=1)].res_id) contacting_2 = np.unique(chain_2[np.any(realdist_array <= critdist_array, axis=0)].res_id) if len(contacting_1) < minC or len(contacting_2) < minC: if hdf5_file is True: hdf5_store.close() os.remove("cache.hdf5") return None contacting = dimer[np.logical_or(np.logical_and(dimer.chain_id == c1, np.isin(dimer.res_id, contacting_1)), np.logical_and(dimer.chain_id == c2, np.isin(dimer.res_id, contacting_2)))] contacting_ca = contacting[contacting.atom_name == 'CA'] dimer_ca = dimer[dimer.atom_name == 'CA'] pos_nearby_ca = dimer_ca[~struc.filter_intersection(dimer_ca, contacting_ca)] nearby_array = np.zeros((len(pos_nearby_ca), len(contacting_ca))) for i in range(len(pos_nearby_ca)): nearby_array[i] = struc.distance(pos_nearby_ca[i], contacting_ca) nearby_ca = pos_nearby_ca[np.any(nearby_array <= nearC, axis = 1)] nearby_1 = nearby_ca[nearby_ca.chain_id == c1].res_id nearby_2 = nearby_ca[nearby_ca.chain_id == c2].res_id nearby = dimer[np.logical_or(np.logical_and(dimer.chain_id == c1, np.isin(dimer.res_id, nearby_1)), np.logical_and(dimer.chain_id == c2, np.isin(dimer.res_id, nearby_2)))] interface_1 = sorted(list(contacting_1) + list(nearby_1)) interface_2 = sorted(list(contacting_2) + list(nearby_2)) interface = dimer[np.logical_or(np.logical_and(dimer.chain_id == c1, np.isin(dimer.res_id, interface_1)), np.logical_and(dimer.chain_id == c2, np.isin(dimer.res_id, interface_2)))] if hdf5_file is True: hdf5_store.close() os.remove("cache.hdf5") return ", ".join(map(str, contacting_1)), ", ".join(map(str, contacting_2)), ", ".join(map(str, nearby_1)), ", ".join(map(str, nearby_2)), interface
def test_intersection_filter(sample_array): assert len(sample_array[:200][struc.filter_intersection( sample_array[:200], sample_array[100:])]) == 100