def read_PDB_files(pdbids, directory="", show_info=True):
    """Parse one structure per PDB id, downloading files that are missing.

    For every id the directory ``Path.cwd() / directory`` is searched for
    ``pdb<id>.ent`` and then ``<id>.pdb`` (id lower-cased). If neither file
    exists, the entry is fetched in PDB format into that same directory via
    ``retrieve_pdb_file``. The file is then parsed with
    ``PDBParser(QUIET=True)``.

    Args:
        pdbids: iterable of PDB id strings.
        directory: directory holding the PDB files, joined onto the current
            working directory. It is created (with parents) if missing.
        show_info: currently not read by this function; kept so that
            existing callers passing it keep working.

    Returns:
        dict mapping each id, exactly as given, to its parsed structure.
    """
    path = Path.cwd() / directory
    path.mkdir(parents=True, exist_ok=True)

    pdb_structures = {}
    for pdb_id in pdbids:
        lowered = pdb_id.lower()
        candidates = (path / f"pdb{lowered}.ent", path / f"{lowered}.pdb")
        local_file = next((c for c in candidates if c.exists()), None)

        if local_file is not None:
            f_name = str(local_file)
        else:
            # Hand the downloader the resolved directory rather than the raw
            # argument: with the default directory="" the raw value is an
            # empty path, which is not a usable target directory, whereas
            # `path` is the directory that was just created and searched.
            f_name = pdbl().retrieve_pdb_file(
                pdb_code=pdb_id, pdir=str(path), file_format="pdb")

        pdb_structures[pdb_id] = PDBParser(QUIET=True).get_structure(
            pdb_id, f_name)

    return pdb_structures
def build_matrix(
        path: str,
        filename: str,
        truncate_log: Union[tqdm.tqdm, None] = None) -> BuildMatrixDict:
    """Build the input matrix for one protein.

    The first model of the structure is walked chain by chain, residue by
    residue. Every residue occupies one column. For a standard amino acid
    the column holds its code, the rounded mean x/y/z position of its atoms
    and the properties looked up in ``amino_acid``; for any other residue
    the column stays 0. Filling stops as soon as a standard amino acid
    falls beyond the last column (protein longer than 4000 residues).

    Args:
        path: path of the pdb file.
        filename: name of the file (without extension).
        truncate_log: tqdm logger; if given, its description is set when
            the protein had to be truncated.

    Returns:
        Build matrix dictionary. Each of the ten matrix rows is stored as a
        pyarrow array holding a single list of 4000 values, under the key
        ``col_name[i]`` (``col_name`` is defined outside this function;
        presumably its entries match the ten placeholder keys below, in row
        order - verify against its definition).
    """
    PROTEIN_SEQ_MAX_LEN = 4000
    # One row per feature, one column per residue; unused columns stay 0.
    protein_matrix = [[0] * PROTEIN_SEQ_MAX_LEN for _ in range(10)]

    protein_structure = PDBParser().get_structure(filename, path)
    # Only the first model of the structure is used.
    first_model = list(protein_structure.get_models())[0]
    protein_chains = list(first_model.get_chains())

    col = 0
    try:
        for chain in protein_chains:
            for residue in chain.get_residues():
                resname = residue.get_resname()
                if Polypeptide.is_aa(resname, standard=True):
                    vectors = [
                        atom.get_vector() for atom in residue.get_atoms()]
                    # calculate position of residue
                    x = round(mean([vec[0] for vec in vectors]))
                    y = round(mean([vec[1] for vec in vectors]))
                    z = round(mean([vec[2] for vec in vectors]))
                    # One letter code. three_to_index/index_to_one is used
                    # instead of three_to_one because the latter is not
                    # available in recent Biopython releases.
                    code = Polypeptide.index_to_one(
                        Polypeptide.three_to_index(resname))
                    aa = amino_acid[code]
                    features = (
                        aa["code"],
                        x,
                        y,
                        z,
                        aa["hydropathy"],
                        aa["hydropathy_index"],
                        aa["acidity_basicity"],
                        aa["mass"],
                        aa["isoelectric_point"],
                        aa["charge"],
                    )
                    # The first write raises IndexError once col is past the
                    # last column; that is how truncation is detected.
                    for row, value in zip(protein_matrix, features):
                        row[col] = value
                # Even if the current residue is not an amino acid the
                # column is advanced; 0 is kept at that position.
                col += 1
    except IndexError:
        if truncate_log is not None:
            truncate_log.set_description_str(
                f"Protein {filename} is truncated.")

    # Prepare dict so it can be loaded into a vaex dataframe
    dic: BuildMatrixDict = {
        "seq": [[]],
        "x_pos": [[]],
        "y_pos": [[]],
        "z_pos": [[]],
        "hydropathy": [[]],
        "hydropathy_index": [[]],
        "acidity_basicity": [[]],
        "mass": [[]],
        "isoelectric_point": [[]],
        "charge": [[]],
    }
    for i, row in enumerate(protein_matrix):
        dic[col_name[i]] = pyarrow.array([row])
    return dic