def extract_feature(self):
    """Attach the PSAIA-derived protrusion feature to every unbound residue.

    For each complex in ``self._database`` the unbound ligand and receptor
    are processed.  The per-protein matrix is cached on disk as
    ``<dir><protein.name>.npy``.  When the cache file is missing, PSAIA is
    run on the protein's PDB file and the values returned for each residue
    are passed through ``self._normalize_features`` and written into a
    ``(len(protein.residues), 28)`` array.  Rows stay zero when PSAIA
    returned nothing or did not report that residue.  Columns 21 onwards of
    the cached matrix are stored on each residue under
    ``Features.PROTRUSION_INDEX``.
    """
    print_info_nn(" >>> Adding protrusion index for database {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    counter = 0
    if not os.path.exists(self.__get_dir_name()):
        os.mkdir(self.__get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            protrusion_file = self.__get_dir_name() + protein.name
            if not os.path.exists(protrusion_file + ".npy"):
                # Progress output: 15 names per line, then a line break.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                result_dict = run_psaia(pdb_file)
                protrusion_array = np.zeros((len(protein.residues), 5 + 5 + 5 + 6 + 6 + 1))
                if result_dict is None:
                    # An all-zero matrix is still cached (as before), but the
                    # failure is no longer silent.
                    print_error('PSAIA returned no result for {0}; caching zeros'.format(protein.name))
                else:
                    for index, residue in enumerate(protein.biopython_residues):
                        key = self.get_residue_id(residue.get_full_id())
                        if key in result_dict:
                            values = result_dict[key]
                            protrusion_array[index, :] = self._normalize_features(*values)
                        else:
                            print('key not found in PSAIA processing!')
                np.save(protrusion_file, protrusion_array)
            # Always read back from the cache so fresh and cached runs agree.
            protrusion_array = np.load(protrusion_file + ".npy")
            for index, residue in enumerate(protein.residues):
                residue.add_feature(Features.PROTRUSION_INDEX, protrusion_array[index, 21:])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def __compute_profiles(self, db='nr', niter=3):
    """Run PSI-BLAST where needed and attach profile features to residues.

    For the unbound ligand and receptor of every complex, the ``.mat``
    ASCII PSSM file under the database's PSSM directory is produced with
    the configured PSI-BLAST executable if it does not exist yet.  The file
    is then parsed, windowed versions of both matrices are derived, and
    column ``i`` of each (after ``self._normalize``) is stored on residue
    ``i``.

    :param db: value passed to PSI-BLAST's ``-db`` option.
    :param niter: value passed to PSI-BLAST's ``-num_iterations`` option.
    """
    print_info_nn(" >>> Adding the profile features for dataset {0} ...".format(self._database.name))
    began = datetime.now()
    command_template = (
        "cd {4} \n "
        "{5} "
        "-query {0} -db {1} -out {2}.psi.txt -num_iterations {3} -out_ascii_pssm {2}.mat"
    )
    complexes = self._database.complexes
    for complex_name in complexes.keys():
        unbound = complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            fasta_path = self._database.directory + unbound_sequence_directory + protein.name + ".fasta"
            output_stem = self._database.directory + pssm_directory + protein.name
            matrix_path = output_stem + ".mat"
            if not os.path.exists(matrix_path):
                print_info("... processing protein {0} ... ".format(protein.name))
                command = command_template.format(
                    fasta_path, db, output_stem, niter, psiblast_db_folder, psiblast_executable)
                print_info(command)
                status = os.system(command)
                if status != 0:
                    print_error('Failed with error code {0}'.format(status))
                else:
                    print_info('Successful!')
            # The third value returned by the parser is not used here.
            pssm, psfm, _ = ProfileExtractor.__parse_pssm_file(matrix_path)
            windowed_pssm = ProfileExtractor.__get_wpsm(pssm)
            windowed_psfm = ProfileExtractor.__get_wpsm(psfm)
            for column, residue in enumerate(protein.residues):
                residue.add_feature(Features.POSITION_SPECIFIC_SCORING_MATRIX,
                                    self._normalize(pssm[:, column]))
                residue.add_feature(Features.POSITION_SPECIFIC_FREQUENCY_MATRIX,
                                    self._normalize(psfm[:, column]))
                residue.add_feature(Features.WINDOWED_POSITION_SPECIFIC_SCORING_MATRIX,
                                    self._normalize(windowed_pssm[:, column]))
                residue.add_feature(Features.WINDOWED_POSITION_SPECIFIC_FREQUENCY_MATRIX,
                                    self._normalize(windowed_psfm[:, column]))
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the D2 category-based shape distribution to every unbound residue.

    ``seed(self.seed)`` is called first (presumably so that any sampling in
    ``self._compute_distribution`` is reproducible).  For the unbound ligand
    and receptor of each complex the distributions are cached as
    ``<dir><protein.name>.npy``.  A missing cache is rebuilt by searching,
    for every residue, the residues within ``self.radius`` of its center
    and handing them to ``self._compute_distribution``.  Row ``i`` of the
    cached matrix is stored on residue ``i`` under
    ``Features.D2_CATEGORY_SHAPE_DISTRIBUTION``.
    """
    seed(self.seed)
    progress = 0
    print_info_nn(" >>> Adding D2 category based shape distribution for database {0} ... ".format(self._database.name))
    began = datetime.now()
    cache_dir = self._get_dir_name()
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    complexes = self._database.complexes
    for complex_name in complexes.keys():
        unbound = complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_stem = cache_dir + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: 15 names per line, then a line break.
                progress += 1
                if progress > 15:
                    progress = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                searcher = NeighborSearch(protein.atoms)
                computed = np.zeros((len(protein.residues), self.number_of_bins))
                for row, residue in enumerate(protein.residues):
                    close_residues = searcher.search(residue.center, self.radius, "R")
                    computed[row, :] = self._compute_distribution(close_residues)
                np.save(cache_stem, computed)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.D2_CATEGORY_SHAPE_DISTRIBUTION, cached[row, :])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the D1 surface-atom shape distribution to every unbound residue.

    ``seed(self.seed)`` is called first.  For the unbound ligand and
    receptor of each complex the distributions are cached as
    ``<dir><protein.name>.npy``.  A missing cache is rebuilt by extracting
    the surface atoms and normals from the protein's PDB file and calling
    ``self.get_distributions`` with each residue center.  Each cached row
    has ``2 * (self.number_of_bins + 1)`` entries and is stored under
    ``Features.D1_SURFACE_SHAPE_DISTRIBUTION``.
    """
    seed(self.seed)
    print_info_nn(" >>> Adding D1 surface atoms shape distribution for {0} ... ".format(self._database.name))
    began = datetime.now()
    cache_dir = self._get_dir_name()
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    complexes = self._database.complexes
    for complex_name in complexes.keys():
        unbound = complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_stem = cache_dir + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                print_info("{0}".format(protein.name))
                pdb_path = self._database.directory + pdb_directory + protein.name + '.pdb'
                surface, normals = get_surface_atoms(pdb_path)
                width = 2 * (self.number_of_bins + 1)
                computed = np.zeros((len(protein.residues), width))
                for row, residue in enumerate(protein.residues):
                    computed[row, :] = self.get_distributions(residue.center, surface, normals)
                np.save(cache_stem, computed)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, cached[row, :])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add STRIDE secondary structure, torsion angles and surface areas.

    For the unbound ligand and receptor of each complex an 11-column matrix
    is cached as ``<dir><protein.name>.npy``: columns 0-6 hold a one-hot
    secondary-structure code (indexed by position in ``ss_abbreviations``),
    followed by phi, psi, ASA and relative ASA.  Residues that STRIDE does
    not report keep an all-zero row.  The cached columns are then stored on
    each residue as separate features.

    :raises ValueError: if STRIDE reports a secondary-structure code that is
        not in ``ss_abbreviations``.
    """
    code_to_slot = dict(zip(ss_abbreviations, range(len(ss_abbreviations))))
    print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
    began = datetime.now()
    progress = 0
    cache_dir = self.__get_dir_name()
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    complexes = self._database.complexes
    for complex_name in complexes.keys():
        unbound = complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_path = cache_dir + protein.name + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: 15 names per line, then a line break.
                progress += 1
                if progress > 15:
                    progress = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                pdb_path = self._database.directory + pdb_directory + protein.name + ".pdb"
                residue_count = len(protein.residues)
                stride_records = stride_dict_from_pdb_file(pdb_path)
                table = np.zeros((residue_count, 11))
                for row, bio_residue in enumerate(protein.biopython_residues):
                    key = self.get_residue_id(bio_residue.get_full_id())
                    if key not in stride_records:
                        continue
                    (_, code, phi, psi, asa, rasa) = stride_records[key]
                    if code not in code_to_slot:
                        raise ValueError("unknown secondary structure! Add to dictionary!")
                    one_hot = np.zeros(len(code_to_slot))
                    one_hot[code_to_slot[code]] = 1
                    table[row, :7] = one_hot
                    # Scalar columns follow the one-hot block in fixed order.
                    for column, value in enumerate((phi, psi, asa, rasa), 7):
                        table[row, column] = value
                np.save(cache_path, table)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.SECONDARY_STRUCTURE, cached[row, :7])
                residue.add_feature(Features.PHI, cached[row, 7])
                residue.add_feature(Features.PSI, cached[row, 8])
                residue.add_feature(Features.ACCESSIBLE_SURFACE_AREA, cached[row, 9])
                residue.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, cached[row, 10])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the half sphere exposure feature to every unbound residue.

    For the unbound ligand and receptor of each complex the feature matrix
    is cached as ``<dir><protein.name>.npy``.  A missing cache is rebuilt
    per residue from its side-chain vector (``self.get_side_chain_vector``)
    and its ``Features.RESIDUE_NEIGHBOURHOOD`` indices.  Each amino-acid
    neighbour with a CA atom is counted in the "up" half when the angle
    between the side-chain direction and the vector to the neighbour's CA
    is below pi/2, otherwise in the "down" half.  Per-amino-acid counts are
    divided by ``1 + <number of neighbours in that half>`` and the two
    halves are concatenated.  Residues without a side-chain vector get NaN.
    """
    progress = 0
    began = datetime.now()
    aa_count = len(standard_aa_names)
    print_info_nn(" >>> Adding Half Surface Exposure ... ".format(self._database.name))
    cache_dir = self._get_dir_name()
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    complexes = self._database.complexes
    for complex_name in complexes.keys():
        unbound = complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_stem = cache_dir + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: 15 names per line, then a line break.
                progress += 1
                if progress > 15:
                    progress = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                residue_count = len(protein.biopython_residues)
                up_total = np.zeros(residue_count)
                down_total = np.zeros(residue_count)
                up_by_type = np.zeros((aa_count, residue_count))
                down_by_type = np.zeros((aa_count, residue_count))
                for col, bio_residue in enumerate(protein.biopython_residues):
                    side_chain = self.get_side_chain_vector(bio_residue)
                    if side_chain is None:
                        up_total[col] = np.nan
                        down_total[col] = np.nan
                        up_by_type[:, col] = np.nan
                        down_by_type[:, col] = np.nan
                        continue
                    # The residue itself is counted once in both halves.
                    own_type = self._residue_index_table[bio_residue.get_resname()]
                    up_by_type[own_type, col] += 1
                    down_by_type[own_type, col] += 1
                    neighbour_ids = protein.residues[col].get_feature(Features.RESIDUE_NEIGHBOURHOOD)
                    for neighbour_id in neighbour_ids:
                        if neighbour_id == -1:
                            # -1 ends the list of valid neighbour indices.
                            break
                        other = protein.biopython_residues[int(neighbour_id)]
                        if not (is_aa(other) and other.has_id('CA')):
                            continue
                        other_ca = other['CA'].get_vector()
                        other_type = self._residue_index_table[other.get_resname()]
                        if side_chain[1].angle((other_ca - side_chain[0])) < np.pi / 2.0:
                            totals, by_type = up_total, up_by_type
                        else:
                            totals, by_type = down_total, down_by_type
                        totals[col] += 1
                        by_type[other_type, col] += 1
                up_by_type = (up_by_type / (1.0 + up_total)).T
                down_by_type = (down_by_type / (1.0 + down_total)).T
                np.save(cache_stem, np.hstack((up_by_type, down_by_type)))
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.HALF_SPHERE_EXPOSURE, cached[row, :])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def __extract_examples(self):
    """Collect the positive and negative residue-pair examples of DBD4.

    In a protein complex with receptor R and ligand L, a pair of residues
    (r' on L, r on R) is a positive example when, in the bound form, the
    minimum distance between their atoms is below
    ``self.interaction_threshold``; every other pair is a negative example.
    Only pairs whose bound residues both have an unbound counterpart are
    considered, because the bound and unbound formations may contain
    slightly different residues.

    Results are written to ``self.examples`` (positives of a complex
    followed by its negatives), ``self.complexes_example_range`` (start,
    end-of-positives, end-of-negatives per complex) and
    ``self.example_complex`` (pair key -> complex name).
    """
    print_info("Finding the positive and negative examples in DBD4 ... {0}".format(self.positives_size))
    start_time = datetime.now()
    counter = 1
    start_index = 0
    neg_no = 0
    pos_no = 0
    for complex_name in self.complexes.keys():
        print_info_nn("{0}/{1}... processing complex {2}".format(counter, len(self.complexes), complex_name))
        protein_complex = self.complexes[complex_name]
        bound_ligand_bio_residues = protein_complex.bound_formation.ligand.biopython_residues
        bound_receptor_bio_residues = protein_complex.bound_formation.receptor.biopython_residues
        bound_ligand_residues = protein_complex.bound_formation.ligand.residues
        bound_receptor_residues = protein_complex.bound_formation.receptor.residues
        ligand_b2u = protein_complex.ligand_bound_to_unbound
        receptor_b2u = protein_complex.receptor_bound_to_unbound

        # Receptor coordinates do not depend on the ligand residue, so they
        # are gathered once per complex instead of once per residue pair.
        receptor_coords = [
            [atom.get_coord() for atom in residue.get_list()]
            for residue in bound_receptor_bio_residues
        ]

        pos = []
        neg = []
        for i, bound_ligand_residue in enumerate(bound_ligand_bio_residues):
            ligand_key = bound_ligand_residues[i]
            if ligand_key not in ligand_b2u:
                # No unbound counterpart: no pair with this residue is usable.
                continue
            unbound_ligand_res_index = self.__get_residue_index(ligand_b2u[ligand_key])
            l_atoms = [atom.get_coord() for atom in bound_ligand_residue.get_list()]
            for j, r_atoms in enumerate(receptor_coords):
                receptor_key = bound_receptor_residues[j]
                if receptor_key not in receptor_b2u:
                    continue
                unbound_receptor_res_index = self.__get_residue_index(receptor_b2u[receptor_key])
                # The distance matrix is only built for pairs that are kept.
                if cdist(l_atoms, r_atoms).min() < self.interaction_threshold:
                    pos.append((unbound_ligand_res_index, unbound_receptor_res_index, +1))
                else:
                    neg.append((unbound_ligand_res_index, unbound_receptor_res_index, -1))

        # extend() copies the references itself; no explicit copy is needed.
        self.examples.extend(pos)
        self.examples.extend(neg)
        pos_no += len(pos)
        neg_no += len(neg)
        self.complexes_example_range[complex_name] = (
            start_index, start_index + len(pos), start_index + len(neg) + len(pos))
        print_info(" ( {0:03d}/{1:05d} ) -{2}".format(
            len(pos), len(neg), self.complexes_example_range[complex_name]))
        start_index += len(pos) + len(neg)
        counter += 1
        for e in pos + neg:
            self.example_complex["{0}_{1}".format(e[0], e[1])] = complex_name
    print_info("Finding examples in DBD4 took " + str((datetime.now() - start_time).seconds) + " seconds. ")
    print_info("The total number of examples found: " + str(pos_no + neg_no))
def extract_feature(self):
    """Add the D1 surface shape distribution to every unbound residue.

    ``seed(self.seed)`` is called first.  For the unbound ligand and
    receptor of each complex the distributions are cached as
    ``<dir><protein.name>.npy``.  A missing cache is rebuilt per residue
    from the residue itself plus every residue within ``self.radius`` of
    its center whose ``Features.RELATIVE_ACCESSIBLE_SURFACE_AREA`` is at
    least ``self.rASA_threshold``.  That set, together with the query
    residue's center, is passed to ``self._compute_distribution``.  Each
    cached row has ``self.number_of_bins + 1`` entries and is stored under
    ``Features.D1_SURFACE_SHAPE_DISTRIBUTION``.
    """
    seed(self.seed)
    print_info_nn(" >>> Adding D1 surface shape distribution for database {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    counter = 0
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            shape_dist_file = self._get_dir_name() + protein.name
            if not os.path.exists(shape_dist_file + ".npy"):
                # Progress output: 15 names per line, then a line break.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                atoms = protein.atoms
                neighbour_search = NeighborSearch(atoms)
                distributions = np.zeros((len(protein.residues), self.number_of_bins + 1))
                for i in range(len(protein.residues)):
                    residue = protein.residues[i]
                    nearby_residues = [protein.biopython_residues[i]]
                    temp_nearby_residues = neighbour_search.search(residue.center, self.radius, "R")
                    for nearby_residue in temp_nearby_residues:
                        # One scan instead of `in` followed by `.index()`.
                        try:
                            residues_index = protein.biopython_residues.index(nearby_residue)
                        except ValueError:
                            continue
                        # A separate name keeps `residue` bound to the query
                        # residue, so the center used below is the query's
                        # and not that of the last neighbour visited.
                        neighbour = protein.residues[residues_index]
                        if neighbour.get_feature(
                                Features.RELATIVE_ACCESSIBLE_SURFACE_AREA) >= self.rASA_threshold:
                            nearby_residues.append(nearby_residue)
                    distributions[i, :] = self._compute_distribution(nearby_residues, residue.center)
                np.save(shape_dist_file, distributions)
            distributions = np.load(shape_dist_file + ".npy")
            for i in range(len(protein.residues)):
                protein.residues[i].add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, distributions[i, :])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def extract_feature(self):
    """Add DSSP surface areas and torsion angles to every unbound residue.

    For the unbound ligand and receptor of each complex a 6-column matrix
    is cached as ``<dir><protein.name>.npy``.  When the cache is missing,
    DSSP is run on the protein's PDB file and elements ``[2:]`` of each
    residue's DSSP record are written to columns 2-5; residues DSSP does
    not report get zeros.  Columns 0 and 1 are never filled and stay zero.
    Columns 2, 3, 4 and 5 are stored on each residue as ASA, relative ASA,
    phi and psi respectively.
    """
    print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    if not os.path.exists(self.__get_dir_name()):
        os.mkdir(self.__get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            dssp_file = self.__get_dir_name() + protein.name + ".npy"
            if not os.path.exists(dssp_file):
                print_info_nn("... running DSSP for protein " + protein.name)
                start_time = datetime.now()
                dssp = DSSP(protein.structure[0],
                            self._database.directory + pdb_directory + protein.name + ".pdb")
                # np.zeros rather than np.ndarray: columns 0-1 (and any row
                # the loop does not reach) would otherwise be uninitialised
                # memory that ends up in the cache file.
                dssp_array = np.zeros((len(protein.residues), 6))
                for (i, res) in enumerate(protein.biopython_residues):
                    (_, _, cid, rid) = res.get_full_id()
                    key = (cid, rid)
                    if key in dssp:
                        dssp_array[i, 2:] = (dssp[key])[2:]
                    else:
                        dssp_array[i, 2:] = [0, 0, 0, 0]
                np.save(dssp_file, dssp_array)
                print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
            dssp = np.load(dssp_file)
            for i, res in enumerate(protein.residues):
                # Column layout: (_, s, ASA, rASA, phi, psi)
                res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, dssp[i, 2])
                res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, dssp[i, 3])
                res.add_feature(Features.PHI, dssp[i, 4])
                res.add_feature(Features.PSI, dssp[i, 5])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def extract_feature(self):
    """Add STRIDE secondary structure, torsion angles and surface areas.

    For the unbound ligand and receptor of each complex an 11-column matrix
    is cached as ``<dir><protein.name>.npy``: columns 0-6 hold a one-hot
    secondary-structure code (indexed by position in ``ss_abbreviations``),
    followed by phi, psi, ASA and relative ASA.  Residues that STRIDE does
    not report keep an all-zero row.  The cached columns are then stored on
    each residue as separate features.

    :raises ValueError: if STRIDE reports a secondary-structure code that is
        not in ``ss_abbreviations``.
    """
    code_to_slot = dict(zip(ss_abbreviations, range(len(ss_abbreviations))))
    print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
    began = datetime.now()
    progress = 0
    cache_dir = self.__get_dir_name()
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    complexes = self._database.complexes
    for complex_name in complexes.keys():
        unbound = complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_path = cache_dir + protein.name + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: 15 names per line, then a line break.
                progress += 1
                if progress > 15:
                    progress = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                pdb_path = self._database.directory + pdb_directory + protein.name + ".pdb"
                residue_count = len(protein.residues)
                stride_records = stride_dict_from_pdb_file(pdb_path)
                table = np.zeros((residue_count, 11))
                for row, bio_residue in enumerate(protein.biopython_residues):
                    key = self.get_residue_id(bio_residue.get_full_id())
                    if key not in stride_records:
                        continue
                    (_, code, phi, psi, asa, rasa) = stride_records[key]
                    if code not in code_to_slot:
                        raise ValueError("unknown secondary structure! Add to dictionary!")
                    one_hot = np.zeros(len(code_to_slot))
                    one_hot[code_to_slot[code]] = 1
                    table[row, :7] = one_hot
                    # Scalar columns follow the one-hot block in fixed order.
                    for column, value in enumerate((phi, psi, asa, rasa), 7):
                        table[row, column] = value
                np.save(cache_path, table)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.SECONDARY_STRUCTURE, cached[row, :7])
                residue.add_feature(Features.PHI, cached[row, 7])
                residue.add_feature(Features.PSI, cached[row, 8])
                residue.add_feature(Features.ACCESSIBLE_SURFACE_AREA, cached[row, 9])
                residue.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, cached[row, 10])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the residue depth feature to every unbound residue.

    For the unbound ligand and receptor of each complex a 2-column matrix
    is cached as ``<dir><protein.name>.npy``.  When the cache is missing,
    ``ResidueDepth`` is evaluated on the protein's first model and PDB
    file; each residue's two values are stored, with ``None`` entries and
    residues missing from the result replaced by 0.  The cached row, passed
    through ``self._normalize``, is stored under ``Features.RESIDUE_DEPTH``.
    """
    print_info_nn(" >>> Adding residue depth for database {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    counter = 0
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            residue_depth_file = self._get_dir_name() + protein.name + ".npy"
            if not os.path.exists(residue_depth_file):
                # Progress output: 15 names per line, then a line break.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                pdb_file = self._database.directory + pdb_directory + protein.name + ".pdb"
                rd = ResidueDepth(protein.structure[0], pdb_file)
                # np.zeros rather than np.ndarray so that rows the loop does
                # not reach are zero instead of uninitialised memory.
                rd_array = np.zeros((len(protein.residues), 2))
                for (i, res) in enumerate(protein.biopython_residues):
                    (_, _, c, (h, rn, ic)) = res.get_full_id()
                    key = (c, (h, rn, ic))
                    if key in rd:
                        rdv = rd[key]
                        # print(...) with one argument behaves the same on
                        # Python 2 and 3, unlike the print statement.
                        if rdv[0] is None:
                            rdv = (0, rdv[1])
                            print("WTH?")
                        if rdv[1] is None:
                            rdv = (rdv[0], 0)
                            print("WTH?")
                        rd_array[i, :2] = rdv
                    else:
                        print_error('WTH')
                        rd_array[i, :2] = [0, 0]
                np.save(residue_depth_file, rd_array)
            surface_features = np.load(residue_depth_file)
            for i, res in enumerate(protein.residues):
                res.add_feature(Features.RESIDUE_DEPTH, self._normalize(surface_features[i, :2]))
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def extract_feature(self):
    """Add the plain D2 shape distribution to every unbound residue.

    ``seed(self.seed)`` is called first.  For the unbound ligand and
    receptor of each complex the distributions are cached as
    ``<dir><protein.name>.npy``.  A missing cache is rebuilt by searching,
    for every residue, the residues within ``self.radius`` of its center
    and handing them to ``self._compute_distribution``.  Row ``i`` of the
    cached matrix (``self.number_of_bins`` entries) is stored on residue
    ``i`` under ``Features.D2_PLAIN_SHAPE_DISTRIBUTION``.
    """
    seed(self.seed)
    progress = 0
    began = datetime.now()
    print_info_nn(" >>> Adding D2 shape distribution for database {0} ... ".format(self._database.name))
    cache_dir = self._get_dir_name()
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    complexes = self._database.complexes
    for complex_name in complexes.keys():
        unbound = complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_stem = cache_dir + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: 15 names per line, then a line break.
                progress += 1
                if progress > 15:
                    progress = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                searcher = NeighborSearch(protein.atoms)
                computed = np.zeros((len(protein.residues), self.number_of_bins))
                for row, residue in enumerate(protein.residues):
                    close_residues = searcher.search(residue.center, self.radius, "R")
                    computed[row, :] = self._compute_distribution(close_residues)
                np.save(cache_stem, computed)
            cached = np.load(cache_path)
            for row, residue in enumerate(protein.residues):
                residue.add_feature(Features.D2_PLAIN_SHAPE_DISTRIBUTION, cached[row, :])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def extract_feature(self):
    """Add the list of neighbouring residue indices to every unbound residue.

    For the unbound ligand and receptor of each complex the neighbourhood
    matrix is cached as ``<dir><protein.name>.npy``.  Residue ``j`` is a
    neighbour of residue ``i`` when the minimum distance between their
    coordinate sets is at most 7.5; a residue is therefore its own
    neighbour.  Row ``i`` lists the neighbour indices in ascending order,
    padded with -1 to the length of the longest row, and is stored under
    ``Features.RESIDUE_NEIGHBOURHOOD``.
    """
    distance_cutoff = 7.5
    counter = 0
    print_info_nn(" >>> Adding Residue Neighbourhood ... ")
    overall_time = datetime.now()
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            residue_neighbourhood_file = self._get_dir_name() + protein.name
            if not os.path.exists(residue_neighbourhood_file + ".npy"):
                # Progress output: 15 names per line, then a line break.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                # Fetch every residue's coordinates once instead of inside
                # the quadratic pair loop.
                coordinates = [residue.get_coordinates() for residue in protein.residues]
                neighbourhood = []
                max_length = 0
                for query_coordinates in coordinates:
                    neighbours = [
                        j for j, other_coordinates in enumerate(coordinates)
                        if cdist(query_coordinates, other_coordinates).min() <= distance_cutoff
                    ]
                    neighbourhood.append(neighbours)
                    max_length = max(max_length, len(neighbours))
                # -1 marks unused slots in rows shorter than the longest one.
                neighbourhood_array = -np.ones((len(protein.residues), max_length))
                for i, neighbours in enumerate(neighbourhood):
                    neighbourhood_array[i, :len(neighbours)] = neighbours
                np.save(residue_neighbourhood_file, neighbourhood_array)
            neighbourhood_array = np.load(residue_neighbourhood_file + ".npy")
            for index, residue in enumerate(protein.residues):
                residue.add_feature(Features.RESIDUE_NEIGHBOURHOOD, neighbourhood_array[index, :])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def extract_feature(self):
    """Add DSSP surface areas and torsion angles to every unbound residue.

    For the unbound ligand and receptor of each complex a 6-column matrix
    is cached as ``<dir><protein.name>.npy``.  When the cache is missing,
    DSSP is run on the protein's PDB file and elements ``[2:]`` of each
    residue's DSSP record are written to columns 2-5; residues DSSP does
    not report get zeros.  Columns 0 and 1 are never filled and stay zero.
    Columns 2, 3, 4 and 5 are stored on each residue as ASA, relative ASA,
    phi and psi respectively.
    """
    print_info_nn(" >>> Adding secondary structure for database {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    if not os.path.exists(self.__get_dir_name()):
        os.mkdir(self.__get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            dssp_file = self.__get_dir_name() + protein.name + ".npy"
            if not os.path.exists(dssp_file):
                print_info_nn("... running DSSP for protein " + protein.name)
                start_time = datetime.now()
                dssp = DSSP(protein.structure[0],
                            self._database.directory + pdb_directory + protein.name + ".pdb")
                # np.zeros rather than np.ndarray: columns 0-1 (and any row
                # the loop does not reach) would otherwise be uninitialised
                # memory that ends up in the cache file.
                dssp_array = np.zeros((len(protein.residues), 6))
                for (i, res) in enumerate(protein.biopython_residues):
                    (_, _, cid, rid) = res.get_full_id()
                    key = (cid, rid)
                    if key in dssp:
                        dssp_array[i, 2:] = (dssp[key])[2:]
                    else:
                        dssp_array[i, 2:] = [0, 0, 0, 0]
                np.save(dssp_file, dssp_array)
                print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
            dssp = np.load(dssp_file)
            for i, res in enumerate(protein.residues):
                # Column layout: (_, s, ASA, rASA, phi, psi)
                res.add_feature(Features.ACCESSIBLE_SURFACE_AREA, dssp[i, 2])
                res.add_feature(Features.RELATIVE_ACCESSIBLE_SURFACE_AREA, dssp[i, 3])
                res.add_feature(Features.PHI, dssp[i, 4])
                res.add_feature(Features.PSI, dssp[i, 5])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def extract_feature(self):
    """Add the D1 surface shape distribution to every unbound residue.

    ``seed(self.seed)`` is called first.  For the unbound ligand and
    receptor of each complex the distributions are cached as
    ``<dir><protein.name>.npy``.  A missing cache is rebuilt per residue
    from the residue itself plus every residue within ``self.radius`` of
    its center whose ``Features.RELATIVE_ACCESSIBLE_SURFACE_AREA`` is at
    least ``self.rASA_threshold``.  That set, together with the query
    residue's center, is passed to ``self._compute_distribution``.  Each
    cached row has ``self.number_of_bins + 1`` entries and is stored under
    ``Features.D1_SURFACE_SHAPE_DISTRIBUTION``.
    """
    seed(self.seed)
    print_info_nn(" >>> Adding D1 surface shape distribution for database {0} ... ".format(self._database.name))
    overall_time = datetime.now()
    counter = 0
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand,
                    protein_complex.unbound_formation.receptor]
        for protein in proteins:
            shape_dist_file = self._get_dir_name() + protein.name
            if not os.path.exists(shape_dist_file + ".npy"):
                # Progress output: 15 names per line, then a line break.
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                atoms = protein.atoms
                neighbour_search = NeighborSearch(atoms)
                distributions = np.zeros((len(protein.residues), self.number_of_bins + 1))
                for i in range(len(protein.residues)):
                    residue = protein.residues[i]
                    nearby_residues = [protein.biopython_residues[i]]
                    temp_nearby_residues = neighbour_search.search(residue.center, self.radius, "R")
                    for nearby_residue in temp_nearby_residues:
                        # One scan instead of `in` followed by `.index()`.
                        try:
                            residues_index = protein.biopython_residues.index(nearby_residue)
                        except ValueError:
                            continue
                        # A separate name keeps `residue` bound to the query
                        # residue, so the center used below is the query's
                        # and not that of the last neighbour visited.
                        neighbour = protein.residues[residues_index]
                        if neighbour.get_feature(
                                Features.RELATIVE_ACCESSIBLE_SURFACE_AREA) >= self.rASA_threshold:
                            nearby_residues.append(nearby_residue)
                    distributions[i, :] = self._compute_distribution(nearby_residues, residue.center)
                np.save(shape_dist_file, distributions)
            distributions = np.load(shape_dist_file + ".npy")
            for i in range(len(protein.residues)):
                protein.residues[i].add_feature(Features.D1_SURFACE_SHAPE_DISTRIBUTION, distributions[i, :])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def _load(self, file_name=None):
    """Restore the object model from a pickle file.

    Replaces ``self.directory``, ``self.complexes``, ``self.residues``,
    ``self.complexes_example_range``, ``self.examples`` and
    ``self.example_complex`` with the tuple stored in the file.

    :param file_name: path of the pickle file; when ``None`` the default
        location under ``self.directory`` is used.
    """
    if file_name is None:
        object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
    else:
        object_model_file_name = file_name
    # Binary mode matches the "wb" used when saving; the context manager
    # closes the file even if unpickling raises.
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(object_model_file_name, "rb") as f:
        print_info_nn("Loading the object model from {0} ... ".format(object_model_file_name))
        start_time = datetime.now()
        (self.directory, self.complexes, self.residues, self.complexes_example_range,
         self.examples, self.example_complex) = cPickle.load(f)
    gc.collect()
    print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
def _save(self, file_name=None):
    """Write the object model to a pickle file.

    Stores ``self.directory``, ``self.complexes``, ``self.residues``,
    ``self.complexes_example_range``, ``self.examples`` and
    ``self.example_complex`` as one tuple.  The default pickle directory
    under ``self.directory`` is created if it does not exist.

    :param file_name: path of the pickle file; when ``None`` the default
        location under ``self.directory`` is used.
    """
    if not os.path.exists(self.directory + pickle_directory):
        os.mkdir(self.directory + pickle_directory)
    if file_name is None:
        object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
    else:
        object_model_file_name = file_name
    # The context manager closes the file even if pickling raises.
    with open(object_model_file_name, "wb") as f:
        print_info_nn("Saving the object model into {0} ... ".format(object_model_file_name))
        start_time = datetime.now()
        cPickle.dump((self.directory, self.complexes, self.residues, self.complexes_example_range,
                      self.examples, self.example_complex), f)
    print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
def _load(self, file_name=None):
    """Restore the object model from a pickle file.

    Replaces ``self.directory``, ``self.complexes``, ``self.residues``,
    ``self.complexes_example_range``, ``self.examples`` and
    ``self.example_complex`` with the tuple stored in the file.

    :param file_name: path of the pickle file; when ``None`` the default
        location under ``self.directory`` is used.
    """
    if file_name is None:
        object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
    else:
        object_model_file_name = file_name
    # Binary mode matches the "wb" used when saving; the context manager
    # closes the file even if unpickling raises.
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(object_model_file_name, "rb") as f:
        print_info_nn("Loading the object model from {0} ... ".format(object_model_file_name))
        start_time = datetime.now()
        (self.directory, self.complexes, self.residues, self.complexes_example_range,
         self.examples, self.example_complex) = cPickle.load(f)
    gc.collect()
    print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
def _save(self, file_name=None):
    """Write the object model to a pickle file.

    Stores ``self.directory``, ``self.complexes``, ``self.residues``,
    ``self.complexes_example_range``, ``self.examples`` and
    ``self.example_complex`` as one tuple.  The default pickle directory
    under ``self.directory`` is created if it does not exist.

    :param file_name: path of the pickle file; when ``None`` the default
        location under ``self.directory`` is used.
    """
    if not os.path.exists(self.directory + pickle_directory):
        os.mkdir(self.directory + pickle_directory)
    if file_name is None:
        object_model_file_name = self.directory + pickle_directory + dbd4_object_model_file
    else:
        object_model_file_name = file_name
    # The context manager closes the file even if pickling raises.
    with open(object_model_file_name, "wb") as f:
        print_info_nn("Saving the object model into {0} ... ".format(object_model_file_name))
        start_time = datetime.now()
        cPickle.dump((self.directory, self.complexes, self.residues, self.complexes_example_range,
                      self.examples, self.example_complex), f)
    print_info("took {0} seconds.".format((datetime.now() - start_time).seconds))
def extract_feature(self):
    """Add the B-factor feature to every unbound residue.

    For the unbound ligand and receptor of each complex a vector with one
    entry per residue is cached as ``<dir><protein.name>.npy``.  When the
    cache is missing, each entry is set to the largest B-factor among the
    atoms of the corresponding Biopython residue.  Entry ``i`` is stored on
    residue ``i`` under ``Features.B_VALUE``.
    """
    progress = 0
    began = datetime.now()
    print_info_nn(" >>> Adding B Factor ... ".format(self._database.name))
    cache_dir = self._get_dir_name()
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    complexes = self._database.complexes
    for complex_name in complexes.keys():
        unbound = complexes[complex_name].unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            cache_stem = cache_dir + protein.name
            cache_path = cache_stem + ".npy"
            if not os.path.exists(cache_path):
                # Progress output: 15 names per line, then a line break.
                progress += 1
                if progress > 15:
                    progress = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))
                peaks = np.zeros(len(protein.residues))
                for slot, bio_residue in enumerate(protein.biopython_residues):
                    peaks[slot] = max(atom.get_bfactor() for atom in bio_residue)
                np.save(cache_stem, peaks)
            cached = np.load(cache_path)
            for slot, residue in enumerate(protein.residues):
                residue.add_feature(Features.B_VALUE, cached[slot])
    print_info("took {0} seconds.".format((datetime.now() - began).seconds))
def __extract_examples(self):
    """
    Find all positive and negative residue-pair examples in the DBD4 dataset.

    In a protein complex C with receptor R and ligand L, two residues r on R
    and r' on L form a positive example if, in the bound form, the smallest
    distance between any of their atoms is below
    ``self.interaction_threshold``. All other pairs (r, r') are negative
    examples. A pair is only considered when both bound residues have an
    unbound counterpart, because the bound and unbound formations may have
    slightly different residues.

    Side effects:
      * ``self.examples`` is extended, per complex, with the positive tuples
        followed by the negative tuples; each tuple is
        (unbound ligand residue index, unbound receptor residue index, +1 or -1).
      * ``self.complexes_example_range[complex_name]`` is set to
        (start, start + #positives, start + #positives + #negatives).
      * ``self.example_complex["<ligand index>_<receptor index>"]`` is set to
        the complex name for every example.
    """
    print_info(
        "Finding the positive and negative examples in DBD4 ... {0}".format(
            self.positives_size))
    start_time = datetime.now()
    start_index = 0
    neg_no = 0
    pos_no = 0
    for counter, complex_name in enumerate(self.complexes.keys(), 1):
        print_info_nn("{0}/{1}... processing complex {2}".format(
            counter, len(self.complexes), complex_name))
        protein_complex = self.complexes[complex_name]
        bound_ligand = protein_complex.bound_formation.ligand
        bound_receptor = protein_complex.bound_formation.receptor
        ligand_b2u = protein_complex.ligand_bound_to_unbound
        receptor_b2u = protein_complex.receptor_bound_to_unbound

        # The receptor side is identical for every ligand residue, so the
        # unbound counterparts and atom coordinates are gathered once per
        # complex instead of once per (ligand, receptor) pair.
        bound_receptor_residues = bound_receptor.residues
        receptor_candidates = []
        for j, receptor_bio_residue in enumerate(bound_receptor.biopython_residues):
            bound_receptor_residue = bound_receptor_residues[j]
            if bound_receptor_residue in receptor_b2u:
                receptor_candidates.append((
                    receptor_b2u[bound_receptor_residue],
                    [atom.get_coord() for atom in receptor_bio_residue.get_list()],
                ))

        pos = []
        neg = []
        bound_ligand_residues = bound_ligand.residues
        for i, ligand_bio_residue in enumerate(bound_ligand.biopython_residues):
            bound_ligand_residue = bound_ligand_residues[i]
            # Ligand residues without an unbound counterpart yield no examples.
            if bound_ligand_residue not in ligand_b2u:
                continue
            unbound_ligand_res = ligand_b2u[bound_ligand_residue]
            l_atoms = [atom.get_coord() for atom in ligand_bio_residue.get_list()]
            for unbound_receptor_res, r_atoms in receptor_candidates:
                dist_mat = cdist(l_atoms, r_atoms)
                # NOTE(review): the index lookups are kept per pair, in the
                # original order, because the helper is not visible here.
                unbound_ligand_res_index = self.__get_residue_index(
                    unbound_ligand_res)
                unbound_receptor_res_index = self.__get_residue_index(
                    unbound_receptor_res)
                if dist_mat.min() < self.interaction_threshold:
                    pos.append((unbound_ligand_res_index,
                                unbound_receptor_res_index, +1))
                else:
                    neg.append((unbound_ligand_res_index,
                                unbound_receptor_res_index, -1))

        # extend() copies the elements itself; no explicit list copy is needed.
        self.examples.extend(pos)
        self.examples.extend(neg)
        pos_no += len(pos)
        neg_no += len(neg)
        self.complexes_example_range[complex_name] = (
            start_index,
            start_index + len(pos),
            start_index + len(neg) + len(pos))
        print_info(" ( {0:03d}/{1:05d} ) -{2}".format(
            len(pos), len(neg), self.complexes_example_range[complex_name]))
        start_index += len(pos) + len(neg)
        for e in pos + neg:
            self.example_complex["{0}_{1}".format(e[0], e[1])] = complex_name
    print_info("Finding examples in DBD4 took " +
               str((datetime.now() - start_time).seconds) + " seconds. ")
    print_info("The total number of examples found: " + str(pos_no + neg_no))
def extract_feature(self):
    """
    Add the half sphere exposure feature to every unbound protein residue.

    For each residue, its neighbours (taken from the residue's
    RESIDUE_NEIGHBOURHOOD feature) are split into two half spheres using the
    pair returned by ``get_side_chain_vector``: a neighbour is counted in the
    "up" half when the angle between the second element of that pair and the
    vector from the first element to the neighbour's CA atom is below pi/2,
    and in the "down" half otherwise. Per half, an amino-acid composition
    vector is accumulated (the residue itself is counted once in both halves)
    and divided by ``1 + number of neighbours in that half``.

    The cached array has one row per residue and
    ``2 * len(standard_aa_names)`` columns (up composition followed by down
    composition). It is stored as ``<feature dir><protein name>.npy`` and
    loaded from there when the file already exists. Residues for which no
    side chain vector is available get NaN in every column.
    """
    overall_time = datetime.now()
    counter = 0
    amino_acid_count = len(standard_aa_names)
    # The message has no placeholder, so the database name is not printed.
    print_info_nn(" >>> Adding Half Surface Exposure ... ".format(
        self._database.name))
    cache_dir = self._get_dir_name()
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    for protein_complex in self._database.complexes.values():
        unbound = protein_complex.unbound_formation
        for protein in (unbound.ligand, unbound.receptor):
            hse_path = cache_dir + protein.name
            if not os.path.exists(hse_path + ".npy"):
                # Progress output: 15 names on one line, the 16th ends the line.
                counter += 1
                if counter > 15:
                    counter = 0
                    print_info("{0}".format(protein.name))
                else:
                    print_info_nn("{0}, ".format(protein.name))

                bio_residues = protein.biopython_residues
                residue_count = len(bio_residues)
                up_count = np.zeros(residue_count)
                down_count = np.zeros(residue_count)
                up_composition = np.zeros((amino_acid_count, residue_count))
                down_composition = np.zeros((amino_acid_count, residue_count))

                for position, bio_residue in enumerate(bio_residues):
                    side_chain = self.get_side_chain_vector(bio_residue)
                    if side_chain is None:
                        # No usable side chain vector: mark the residue as undefined.
                        up_count[position] = np.nan
                        down_count[position] = np.nan
                        up_composition[:, position] = np.nan
                        down_composition[:, position] = np.nan
                        continue

                    own_type = self._residue_index_table[bio_residue.get_resname()]
                    up_composition[own_type, position] += 1
                    down_composition[own_type, position] += 1

                    neighbourhood = protein.residues[position].get_feature(
                        Features.RESIDUE_NEIGHBOURHOOD)
                    for neighbour_index in neighbourhood:
                        # The first -1 stops the scan of the neighbour list.
                        if neighbour_index == -1:
                            break
                        neighbour = bio_residues[int(neighbour_index)]
                        if not (is_aa(neighbour) and neighbour.has_id('CA')):
                            continue
                        ca_vector = neighbour['CA'].get_vector()
                        neighbour_type = self._residue_index_table[
                            neighbour.get_resname()]
                        angle = side_chain[1].angle(ca_vector - side_chain[0])
                        if angle < np.pi / 2.0:
                            up_count[position] += 1
                            up_composition[neighbour_type, position] += 1
                        else:
                            down_count[position] += 1
                            down_composition[neighbour_type, position] += 1

                # Normalise per residue, then transpose to one row per residue.
                up_composition = (up_composition / (1.0 + up_count)).T
                down_composition = (down_composition / (1.0 + down_count)).T
                np.save(hse_path, np.hstack((up_composition, down_composition)))

            # Values are always read back from the cache file, fresh or not.
            hse = np.load(hse_path + ".npy")
            for position, residue in enumerate(protein.residues):
                residue.add_feature(Features.HALF_SPHERE_EXPOSURE,
                                    hse[position, :])
    print_info("took {0} seconds.".format(
        (datetime.now() - overall_time).seconds))