def parse_structure(self, pdb_struct):
    """Extract per-chain sequences and build the residue lookup tables.

    Bio.PDB exposes a structure as the cascade
    model -> chain -> residue -> atom; ``pdb_struct`` is iterated as a
    collection of chains.

    For every chain this fills:

    * ``self.residues[chain.id]``: residue number -> ``MappedResidue``
      built from that number and the one-letter amino-acid code;
    * ``self.pdb_seq[chain.id]``: the one-letter codes joined in ascending
      residue-number order;
    * ``pos_in_aln`` on every ``MappedResidue``: its 1-based rank in that
      ordering.

    Residues whose name is not in ``self.residue_list`` are skipped.
    ``HID`` is stored as histidine; previously its one-letter code was
    computed and then discarded, so those residues were lost.
    """
    for chain in pdb_struct:
        chain_residues = {}
        self.residues[chain.id] = chain_residues
        self.pdb_seq[chain.id] = Seq('')

        for res in chain:
            # In Bio.PDB a residue id is the tuple
            # (hetatm flag, residue number, insertion code).
            if res.resname == "HID":
                one_letter = polypeptide.three_to_one('HIS')
            elif res.resname in self.residue_list:
                one_letter = polypeptide.three_to_one(res.resname)
            else:
                continue
            res_num = res.id[1]
            chain_residues[res_num] = MappedResidue(res_num, one_letter)

        ordered_numbers = sorted(chain_residues)
        self.pdb_seq[chain.id] = ''.join(
            [chain_residues[num].name for num in ordered_numbers])

        for pos, num in enumerate(ordered_numbers, start=1):
            chain_residues[num].pos_in_aln = pos
def parse_pdb(self):
    """Parse the reference PDB and collect each chain's one-letter sequence.

    The structure is read from ``self.pdb_file`` when it is set, otherwise
    from ``self.pdb_filename``.  Bio.PDB exposes a structure as the cascade
    model -> chain -> residue -> atom; only the first model is used.

    ``self.pdb_seq[chain.id]`` is filled with the sequence of every chain.
    ``HID`` is translated as histidine; residues whose name
    ``polypeptide.three_to_one`` does not know are skipped.

    Returns:
        The first model of the parsed structure, or ``None`` when neither
        ``self.pdb_file`` nor ``self.pdb_filename`` is set.
    """
    # A file handle takes precedence over a file name.
    source = self.pdb_file or self.pdb_filename
    if not source:
        return None

    pdb_struct = PDBParser(PERMISSIVE=True, QUIET=True).get_structure(
        'ref', source)[0]

    for chain in pdb_struct:
        self.pdb_seq[chain.id] = Seq('')
        for res in chain:
            # In Bio.PDB a residue id is the tuple
            # (hetatm flag, residue number, insertion code).
            resname = 'HIS' if res.resname == "HID" else res.resname
            try:
                self.pdb_seq[chain.id] += polypeptide.three_to_one(resname)
            except KeyError:
                # Not an amino acid known to three_to_one (water, ligand,
                # ...): leave it out of the sequence.  Any other error is a
                # real problem and is allowed to propagate.
                continue

    return pdb_struct
def select_ref_atoms(self, fragment, ref_pdbio_struct, use_similar=False):
    """Return the CA, N and O atoms of the reference residue matching a fragment.

    Residues of ``ref_pdbio_struct`` are scanned chain by chain for one whose
    generic number (``self.get_generic_number``) equals the label of the
    fragment's display generic number.

    With ``use_similar`` false the first such residue is used.  With
    ``use_similar`` true it is used only if some entry of
    ``self.similarity_rules`` lists both amino acids under the
    ``"target_residue"`` key and the fragment's interaction type slug under
    the ``"interaction_type"`` key of ``self.similarity_dict``.

    Any exception raised while inspecting a residue makes that residue be
    skipped.

    Returns:
        ``[CA, N, O]`` atoms of the selected residue, or ``[]`` if none
        qualifies.
    """
    for chain in ref_pdbio_struct:
        for res in chain:
            try:
                gn = self.get_generic_number(res)
                if gn != fragment.rotamer.residue.display_generic_number.label:
                    continue

                logger.info("Ref {}:{}\tFragment {}:{}".format(
                    polypeptide.three_to_one(res.resname),
                    self.get_generic_number(res),
                    fragment.rotamer.residue.amino_acid,
                    fragment.rotamer.residue.display_generic_number.label))

                if not use_similar:
                    return [res[atom_name] for atom_name in ('CA', 'N', 'O')]

                # Lazily test the rules; stop at the first one that fits.
                rule_fits = any(
                    polypeptide.three_to_one(res.resname)
                    in rule[self.similarity_dict["target_residue"]]
                    and fragment.rotamer.residue.amino_acid
                    in rule[self.similarity_dict["target_residue"]]
                    and fragment.interaction_type.slug
                    in rule[self.similarity_dict["interaction_type"]]
                    for rule in self.similarity_rules)
                if rule_fits:
                    return [res[atom_name] for atom_name in ('CA', 'N', 'O')]
            except Exception:
                continue
    return []
def get_adjacency_matrix(pdb_id, pdb_file, chain_id='A', cutoff=8.0):
    """Build the residue contact (adjacency) matrix of one chain.

    Only residues of the first model that carry a beta carbon (``CB``) are
    considered.  Two different residues are in contact when the distance
    between their beta carbons is at most ``cutoff``; the diagonal is 0.

    Args:
        pdb_id: identifier handed to ``PDBParser.get_structure``.
        pdb_file: PDB file handed to ``PDBParser.get_structure``.
        chain_id: id of the chain to analyse (default ``'A'``).
        cutoff: largest CB-CB distance, inclusive, that counts as a contact
            (default 8).

    Returns:
        pandas.DataFrame: square matrix of 1.0 / 0.0 values whose rows and
        columns are labelled with the residues' one-letter codes.
    """
    parser = PDBParser()
    structure = parser.get_structure(pdb_id, pdb_file)

    # Amino acids are recognised by the presence of a beta carbon.
    amino_acids = [res for res in structure[0][chain_id] if 'CB' in res]

    n_res = len(amino_acids)
    adj_values = np.zeros(shape=(n_res, n_res))
    # The distance is symmetric, so each unordered pair is measured once and
    # mirrored; the diagonal keeps its initial 0.
    for i, r1 in enumerate(amino_acids):
        for j in range(i + 1, n_res):
            distance = r1['CB'] - amino_acids[j]['CB']
            if distance <= cutoff:
                adj_values[i, j] = 1.0
                adj_values[j, i] = 1.0

    labels = [polypep.three_to_one(r.get_resname()) for r in amino_acids]
    return pd.DataFrame(index=labels, columns=labels, data=adj_values)
def get_distance_matrix(pdb_id, pdb_file, chain_id='A'):
    """Build the beta-carbon distance matrix of one chain.

    Only residues of the first model that carry a beta carbon (``CB``) are
    considered.  Entry ``[i][j]`` is the distance between the beta carbons of
    residues ``i`` and ``j``; the diagonal is 0.  No cutoff is applied.

    Args:
        pdb_id: identifier handed to ``PDBParser.get_structure``.
        pdb_file: PDB file handed to ``PDBParser.get_structure``.
        chain_id: id of the chain to analyse (default ``'A'``).

    Returns:
        pandas.DataFrame: square distance matrix whose rows and columns are
        labelled with the residues' one-letter codes.
    """
    parser = PDBParser()
    structure = parser.get_structure(pdb_id, pdb_file)

    # Amino acids are recognised by the presence of a beta carbon.
    amino_acids = [res for res in structure[0][chain_id] if 'CB' in res]

    n_res = len(amino_acids)
    dist_values = np.zeros(shape=(n_res, n_res))
    # The distance is symmetric, so each unordered pair is measured once and
    # mirrored; the diagonal keeps its initial 0.
    for i, r1 in enumerate(amino_acids):
        for j in range(i + 1, n_res):
            dist = r1['CB'] - amino_acids[j]['CB']
            dist_values[i, j] = dist
            dist_values[j, i] = dist

    labels = [polypep.three_to_one(r.get_resname()) for r in amino_acids]
    return pd.DataFrame(index=labels, columns=labels, data=dist_values)
def parse_pdb(self):
    """Parse the reference PDB and collect each chain's one-letter sequence.

    ``self.pdb_file`` is used as the source when set, ``self.pdb_filename``
    otherwise.  Only the first model of the structure is read (Bio.PDB
    cascade: model -> chain -> residue -> atom).

    For every chain, ``self.pdb_seq[chain.id]`` receives the sequence.
    ``HID`` counts as histidine, and residues that
    ``polypeptide.three_to_one`` cannot translate are left out.

    Returns:
        The first model of the parsed structure, or ``None`` if no source
        is configured.
    """
    # Checking for a file handle or a file name to parse.
    if self.pdb_file:
        source = self.pdb_file
    elif self.pdb_filename:
        source = self.pdb_filename
    else:
        return None

    pdb_struct = PDBParser(PERMISSIVE=True).get_structure('ref', source)[0]

    for chain in pdb_struct:
        self.pdb_seq[chain.id] = Seq('')
        for res in chain:
            # In Bio.PDB a residue id is the tuple
            # (hetatm flag, residue number, insertion code).
            if res.resname == "HID":
                self.pdb_seq[chain.id] += polypeptide.three_to_one('HIS')
                continue
            try:
                letter = polypeptide.three_to_one(res.resname)
            except KeyError:
                # Unknown residue name (water, ligand, ...): skip it.
                # Other exceptions are no longer silently discarded.
                continue
            self.pdb_seq[chain.id] += letter

    return pdb_struct
def MutationsDict(file, positions=None):
    """Map every mutable residue of a PDB file to its possible mutations.

    Positions without an amino-acid residue in the PDB file are ignored.

    Parameters:
        file (string): pdb file to get mutations from
        positions: list of ``(chain, first, last)`` tuples selecting, per
            chain, the inclusive residue-number range to mutate to all other
            amino acids.  If None (or empty), every position of every chain
            is mutated.

    Returns:
        dict keyed by ``<aa><chain><position>``; each value lists
        ``<aa><chain><position><mutated_aa>`` for every amino acid other
        than the original one.
    """
    # One-letter amino-acid alphabet used to enumerate substitutions.
    alphabet = list(Bio.PDB.Polypeptide.aa1)

    # Model of the original pdb file.
    model = it.Pmolecule(file).model

    mutations = {}

    def _substitutions(prefix, code):
        return [prefix + aa for aa in alphabet if aa != code]

    if not positions:
        for chain in model.get_chains():
            for residue in chain:
                if not pp.is_aa(residue):
                    continue
                code = pp.three_to_one(residue.get_resname())
                position = residue.id[1]
                chain_id = chain.id
                prefix = code + chain_id + str(position)
                mutations[prefix] = _substitutions(prefix, code)
        return mutations

    for chain_id, first, last in positions:
        # Chain whose id matches the requested one.
        chain = next(
            candidate for candidate in model.get_chains()
            if candidate.id == chain_id)
        for residue in chain:
            if not pp.is_aa(residue):
                continue
            code = pp.three_to_one(residue.get_resname())
            position = residue.id[1]
            prefix = code + chain_id + str(position)
            # Only keep positions between first and last (inclusive).
            if position in range(first, last + 1):
                mutations[prefix] = _substitutions(prefix, code)

    return mutations
def get_missing_sidechains(pdb_dataset, output_scwrl):
    """Get residues that are missing atoms.

    For every structure file of ``pdb_dataset`` a string of one-letter codes
    is built: upper case for a residue whose atom count differs from
    ``expected[resname]``, lower case otherwise.  Residues whose name is not
    in ``expected`` are skipped with a warning.  The string is written to
    ``output_scwrl``, which is opened in write mode for each structure.
    """
    for pdb_filename in db.get_structures_filenames(pdb_dataset):
        biopy_structure = db.parse_biopython_structure(pdb_filename)
        pdb_name = db.get_pdb_name(pdb_filename)
        n_missing = 0
        codes = []
        logging.info("Processing {:}".format(pdb_name))

        all_chains = (chain for model in biopy_structure for chain in model)
        for chain in all_chains:
            for i, residue in enumerate(chain):
                res_name = residue.resname
                if res_name not in expected:
                    logging.warning("Non-standard residue found: {:}. "
                                    "Skipping.".format(res_name))
                    continue

                res_code = poly.three_to_one(res_name)
                res_id = residue.id[1]
                atoms = Bio.PDB.Selection.unfold_entities(residue, 'A')
                curr_count = len(atoms)

                if curr_count == expected[res_name]:
                    codes.append(res_code.lower())
                    continue

                logging.debug(
                    "Missing residue {:} at position {:} (with id {:})"
                    " which has {:} instead of the expected {:} atoms."
                    .format(res_name, i, res_id, curr_count,
                            expected[res_name]))
                n_missing += 1
                codes.append(res_code.upper())

        logging.debug("Missing {:} residue total".format(n_missing))
        with open(output_scwrl, 'w') as f:
            f.write("".join(codes))
def modeller_get_chain_seqs(target_protein, target_chain, version):
    """Read one chain of a Modeller target PDB and return its sequence.

    The file ``v<version>_pdb<target_protein>.ent`` is looked up in the
    directory ``PATHS.modeller/<target_protein><target_chain>``.

    Only residues that pass ``is_aa`` and have a blank hetero flag are kept.
    ``UNK`` and ``ASX`` are written as ``'-'``, ``SEC`` as ``'U'``; every
    other residue is converted with ``Polypeptide.three_to_one``.

    Returns:
        tuple: ``(chain_lst, chain)``, the list of one-letter codes and the
        Bio.PDB chain.  On every failure (file missing, parse error, chain
        absent) the result is ``(None, None)``; previously two of those paths
        returned a bare ``None``, which broke callers unpacking the result.
    """
    target_path = path.join(PATHS.modeller, target_protein + target_chain)
    target_pdb_fname = 'v%s_pdb' % version + target_protein + '.ent'
    pdb_file_path = path.join(target_path, target_pdb_fname)
    if not path.isfile(pdb_file_path):
        LOGGER.warning('File %s not found' % pdb_file_path)
        return None, None

    parser = PDBParser(PERMISSIVE=1, QUIET=True)
    structure_id = path.basename(target_pdb_fname).split('.')[0]
    try:
        structure = parser.get_structure(structure_id, pdb_file_path)
    except Exception:
        # Best effort: report the unreadable file and give up on it, without
        # also swallowing KeyboardInterrupt / SystemExit.
        print(
            "ERROR: failed parser.get_structure(structure_id, pdb_fname) for "
            + target_pdb_fname)
        return None, None

    model = structure[0]
    try:
        chain = model[target_chain]
    except KeyError:
        return None, None

    # Residue names that are not translated by three_to_one.
    special_codes = {'UNK': '-', 'ASX': '-', 'SEC': 'U'}

    chain_lst = []
    for res in chain.get_residues():
        if not (is_aa(res) and res.get_id()[0] == ' '):
            continue
        if res.resname in special_codes:
            chain_lst.append(special_codes[res.resname])
        else:
            chain_lst.append(Polypeptide.three_to_one(res.resname))
    return chain_lst, chain
def get_sequence(self, chain_id):
    """Return the one-letter amino-acid sequence of a chain.

    Input:
        self: object holding a Biopython.PDB structure in ``self.structure``
        chain_id: String (usually in ['A', 'B', 'C', ...]; the number of
            chains depends on the specific protein and structure)
    Return:
        The sequence (single-letter alphabet) of chain ``chain_id`` of the
        first model, as a string.  Reading stops at the first residue whose
        name cannot be translated (probably the start of the HOH records).
    """
    from Bio.PDB import Polypeptide

    chain = self.structure.child_list[0].child_dict[chain_id]
    letters = []
    for residue in chain.child_list:
        try:
            letters.append(Polypeptide.three_to_one(residue.resname))
        except KeyError:
            # First untranslatable residue: discard the rest of the chain.
            break
    return ''.join(letters)
def residue_seq_to_one(seq):
    """Map residues from three-letter to one-letter amino-acid codes.

    Each item of ``seq`` is looked up by its ``name`` attribute; names outside
    ``Polypeptide.standard_aa_names`` are encoded as ``'U'``.

    Returns:
        list of one-letter codes, in the order of ``seq``.
    """
    def _to_letter(residue):
        if residue.name in Polypeptide.standard_aa_names:
            return Polypeptide.three_to_one(residue.name)
        return 'U'

    return [_to_letter(residue) for residue in seq]
def select_ref_atoms(self, fragment, ref_pdbio_struct, use_similar=False):
    """Find the reference residue for a fragment and return its backbone atoms.

    The first residue of ``ref_pdbio_struct`` whose generic number
    (``self.get_generic_number``) equals the label of the fragment's display
    generic number is a candidate.  When ``use_similar`` is false the
    candidate is accepted as is; when it is true the candidate is accepted
    only if one of ``self.similarity_rules`` contains both amino acids under
    ``self.similarity_dict["target_residue"]`` and the fragment's interaction
    type slug under ``self.similarity_dict["interaction_type"]``.

    A residue that raises any exception while being examined is skipped.

    Returns:
        list: the ``CA``, ``N`` and ``O`` atoms of the accepted residue, or
        an empty list when there is none.
    """
    for chain in ref_pdbio_struct:
        for res in chain:
            try:
                gn = self.get_generic_number(res)
                wanted = fragment.rotamer.residue.display_generic_number.label
                if gn == wanted:
                    message = "Ref {}:{}\tFragment {}:{}".format(
                        polypeptide.three_to_one(res.resname),
                        self.get_generic_number(res),
                        fragment.rotamer.residue.amino_acid,
                        fragment.rotamer.residue.display_generic_number.label)
                    logger.info(message)

                    accepted = not use_similar
                    if use_similar:
                        for rule in self.similarity_rules:
                            if polypeptide.three_to_one(res.resname) not in rule[
                                    self.similarity_dict["target_residue"]]:
                                continue
                            if fragment.rotamer.residue.amino_acid not in rule[
                                    self.similarity_dict["target_residue"]]:
                                continue
                            if fragment.interaction_type.slug in rule[
                                    self.similarity_dict["interaction_type"]]:
                                accepted = True
                                break
                    if accepted:
                        return [res['CA'], res['N'], res['O']]
            except Exception:
                continue
    return []
def get_chain_sequence(self, chain):
    """Return the one-letter sequence string of the given chain.

    Residues are taken from ``self.residues[chain]``; those whose name is not
    in ``self.residue_list`` are left out, and ``HID`` is translated as
    ``HIS``.
    """
    letters = []
    for residue in self.residues[chain]:
        if residue.resname not in self.residue_list:
            continue
        canonical_name = residue.resname.replace('HID', 'HIS')
        letters.append(polypeptide.three_to_one(canonical_name))
    return "".join(letters)
def get_peptide_sequence(self, residues):
    """Return the one-letter sequence string of a list of Bio.PDB.Residue objects.

    Residues whose name is not in ``self.residue_list`` are left out, and
    ``HID`` is translated as ``HIS``.
    """
    accepted = (res for res in residues if res.resname in self.residue_list)
    letters = [
        polypeptide.three_to_one(res.resname.replace('HID', 'HIS'))
        for res in accepted
    ]
    return "".join(letters)
def constructor(self, recalculate):
    """Write the chain's sequence as FASTA and return the file opened for reading.

    The chain is obtained from the object manager through a
    ``pdb_chain_wrapper`` built from ``self.params``.  Every residue is
    translated with ``Polypeptide.three_to_one`` and the record is written to
    ``self.get_file_location()``.

    Returns:
        An open read-mode file object on the FASTA file; this function does
        not close it.
    """
    chain_obj = global_stuff.the_obj_manager.get_variable(
        pdb_chain_wrapper(self.params), recalculate)

    # Write the seq file at location + name.
    letters = [Polypeptide.three_to_one(res.resname) for res in chain_obj]
    seq_record = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(''.join(letters)))
    SeqIO.write(seq_record, self.get_file_location(), 'fasta')

    return open(self.get_file_location(), 'r')
def getClearPeptideSeq(self):
    """Return the peptide's one-letter sequence.

    Returns:
        The sequence as a string, or ``0`` (after reporting through
        ``self.printerr``) when the peptide is empty.
    """
    if not self.__peptide:
        self.printerr('getClearPeptideSeq(): PEPTIDE (' + self.__name +') IS EMPTY\n')
        return 0

    return ''.join(
        Polypeptide.three_to_one(residue.get_resname())
        for residue in list(self.__peptide))
def parse_structure(self):
    """Record the residue numbers and one-letter codes of the structure.

    Only the 20 standard amino acids are considered.  Each residue number is
    appended to ``self.residues`` once and its one-letter code stored in
    ``self.d_sequence``; a number seen before is not counted again (mutated
    residues, e.g. 1ORC).
    """
    for residue in self.structure.get_residues():
        if not PDB.is_aa(residue, standard=True):
            continue
        res_num = residue.id[1]
        if res_num in self.residues:
            # Don't double count a residue number already recorded.
            continue
        self.residues.append(res_num)
        resname = Residue.Residue.get_resname(residue)
        self.d_sequence[res_num] = Polypeptide.three_to_one(resname)
def calcDistMatrices(self, key1, key2):
    """Calculate and store the distance matrix for a pair of regions.

    ``key1`` and ``key2`` are region keys resolved through
    ``self.getRegion``.  The matrix holds ``residuesMinDist`` for every pair
    (rows: residues of ``key1``, columns: residues of ``key2``), labelled with
    one-letter codes, and is stored under ``(key1, key2)``.

    Returns:
        1 on success; 0 (after reporting through ``self.printerr``) when
        either region is empty.
    """
    res_list_1 = self.getRegion(key1)
    res_list_2 = self.getRegion(key2)
    if not (res_list_1 and res_list_2):
        self.printerr('calcDistMatrices(): RESIDUE LIST IS EMPTY\n')
        return 0

    values = [
        [residuesMinDist(res1, res2) for res2 in res_list_2]
        for res1 in res_list_1
    ]
    rows = [Polypeptide.three_to_one(res.get_resname()) for res in res_list_1]
    cols = [Polypeptide.three_to_one(res.get_resname()) for res in res_list_2]

    mat = pd.DataFrame(values, index = rows, columns = cols)
    self.__d_matrices.update({(key1, key2): mat})
    return 1
def get_sequence(self, chain_id):
    '''
        Input:
            self: object holding a Biopython.PDB structure
            chain_id: String (usually in ['A', 'B', 'C', ...]; the number of
                chains depends on the specific protein and structure)
        Return:
            The amino acid sequence (single-letter alphabet) of chain
            ``chain_id`` as a string, built from the residues returned by
            ``self.get_amino_residues(chain_id)``.
    '''
    letters = [
        Polypeptide.three_to_one(residue.get_resname())
        for residue in self.get_amino_residues(chain_id)
    ]
    return ''.join(letters)
def pdb2seq(pdbname):
    """Return the one-letter sequence read from the ATOM records of a PDB file.

    One letter is emitted each time the residue identity changes between
    consecutive ATOM lines.  The identity is the chain id, residue sequence
    number and insertion code (PDB columns 22-27).  Earlier versions compared
    only three of the four residue-number columns, starting from 0, so a first
    residue numbered 0, residues differing only by insertion code and a chain
    restarting at the previous residue number were dropped.

    Args:
        pdbname: path of the PDB file.

    Returns:
        str: the concatenated one-letter codes.
    """
    import Bio.PDB.Polypeptide as bio

    letters = []
    prev_key = None
    with open(pdbname, "r") as pdb:
        for line in pdb:
            if not line.startswith("ATOM"):
                continue
            # Columns 22-27: chain id, residue number, insertion code.
            key = line[21:27]
            if key != prev_key:
                letters.append(bio.three_to_one(line[17:20]))
                prev_key = key
    return "".join(letters)
def create_structure_rotamer(PDB_residue, residue_object, structure):
    """Build a ``Rotamer`` for one residue of a structure.

    The residue is serialised to PDB text with ``PDBIO`` and stored through
    ``PdbData.objects.get_or_create``.  ``missing_atoms`` is true when
    ``atom_num_dict`` expects more atoms for this amino acid than
    ``PDB_residue.get_unpacked_list()`` contains.

    Returns:
        The ``Rotamer`` instance as constructed here; this function does not
        call ``save`` on it.
    """
    buffer = StringIO()
    writer = PDBIO()
    writer.set_structure(PDB_residue)
    writer.save(buffer)

    pdbdata, _created = PdbData.objects.get_or_create(pdb=buffer.getvalue())

    one_letter = Polypeptide.three_to_one(PDB_residue.get_resname())
    expected_atoms = atom_num_dict[one_letter]
    present_atoms = len(PDB_residue.get_unpacked_list())
    missing_atoms = expected_atoms > present_atoms

    return Rotamer(missing_atoms=missing_atoms, pdbdata=pdbdata,
                   residue=residue_object, structure=structure)
def write_FASTAs(PDB_ID, chains):
    """Write one FASTA file per polypeptide chain and return the ids written.

    ``chains`` maps a chain id to a sequence of
    ``(resname, resseq, icode)`` tuples.  A chain is written when it is
    non-empty and its first residue name passes ``Polypeptide.is_aa``.  The
    file is named ``<PDB_ID>_<chain_ID>.fasta``; residue names unknown to
    ``three_to_one`` are written as ``'X'``.

    Returns:
        list of ``<PDB_ID>_<chain_ID>`` identifiers, one per file written.
    """
    polypeptide_IDs = []
    for chain_ID, residues in chains.items():
        if not residues or not Polypeptide.is_aa(residues[0][0]):
            continue

        polypeptide_ID = '{}_{}'.format(PDB_ID, chain_ID)
        polypeptide_IDs.append(polypeptide_ID)

        letters = []
        for resname, _resseq, _icode in residues:
            try:
                letter = Polypeptide.three_to_one(resname)
            except KeyError:
                letter = 'X'
            letters.append(letter)

        with open('{}.fasta'.format(polypeptide_ID), mode='w') as fasta:
            fasta.write('>{}\n'.format(polypeptide_ID))
            fasta.write('{}\n'.format(''.join(letters)))
    return polypeptide_IDs
def get_sequence(self, chain_id):
    '''
        Input:
            self: object holding a Biopython.PDB structure in
                ``self.structure``
            chain_id: String (usually in ['A', 'B', 'C', ...]; the number of
                chains depends on the specific protein and structure)
        Return:
            The amino acid sequence (single-letter alphabet) of chain
            ``chain_id`` of the first model, as a string.  Residues that do
            not pass ``Polypeptide.is_aa`` are left out.
    '''
    chain = self.structure.child_list[0].child_dict[chain_id]
    amino_acids = (res for res in chain if Polypeptide.is_aa(res))
    return "".join(
        Polypeptide.three_to_one(res.get_resname()) for res in amino_acids)
def get_sequence(self, chain_id):
    '''
        Input:
            self: object holding a Biopython.PDB structure in
                ``self.structure``
            chain_id: String (usually in ['A', 'B', 'C', ...]; the number of
                chains depends on the specific protein and structure)
        Return:
            The amino acid sequence (single-letter alphabet) of the first
            chain whose id equals ``chain_id``, as a string, or None when the
            structure has no such chain.

        Waters (``HOH``) are skipped.  Other residues whose name
        ``PP.three_to_one`` does not know (ligands, ions, ...) are skipped as
        well; they used to abort the call with a KeyError.
    '''
    for chain in self.structure.get_chains():
        if chain.id != chain_id:
            continue
        letters = []
        for res in chain.get_unpacked_list():
            if res.resname == 'HOH':
                continue
            try:
                letters.append(PP.three_to_one(res.resname))
            except KeyError:
                # Not an amino acid: it does not belong in the sequence.
                continue
        return "".join(letters)
    return None
def residue_list_to_string(residues, with_ids=False):
    """Convert list of residues to string.

    Non-canonical residue names are first normalised in place on the
    residue objects: ``HID`` -> ``HIS``, ``CYX`` -> ``CYS``, ``ASX`` -> ``ASP``
    and ``GLX`` -> ``GLU``.  ``GLX`` (glutamate or glutamine) used to be
    mapped to ``GLY``, i.e. glycine.  ``SEC`` and ``PYL`` residues are left
    out of the result.

    Args:
        residues: list of objects with ``resname`` and ``residue``
            attributes.
        with_ids: when true, also return the ``residue`` attribute of every
            residue kept.

    Returns:
        The one-letter sequence string, or ``(sequence, ids)`` when
        ``with_ids`` is true.
    """
    canonical = {'HID': 'HIS', 'CYX': 'CYS', 'ASX': 'ASP', 'GLX': 'GLU'}
    excluded = ('SEC', 'PYL')

    for residue in residues:
        if residue.resname in canonical:
            residue.resname = canonical[residue.resname]

    kept = [residue for residue in residues
            if residue.resname not in excluded]
    seq = [poly.three_to_one(residue.resname) for residue in kept]
    ids = [residue.residue for residue in kept]

    if with_ids:
        return "".join(seq), ids
    return "".join(seq)
def get_sequence(self, chain_id):
    '''
        Input:
            self: object holding a Biopython.PDB structure in
                ``self.structure``
            chain_id: String (usually in ['A', 'B', 'C', ...]; the number of
                chains depends on the specific protein and structure)
        Return:
            The amino acid sequence (single-letter alphabet) of chain
            ``chain_id`` as a string.

        Residues whose name ``pp.three_to_one`` does not know are skipped.
        Only that lookup failure (KeyError) is tolerated; the former bare
        ``except`` also hid every other error.
    '''
    # NOTE(review): every model of the structure is visited, so a
    # multi-model structure yields the chain's sequence once per model.
    # Confirm this is intended before restricting it to the first model.
    letters = []
    for model in self.structure:
        chain = model[chain_id]
        for residue in chain.get_residues():
            try:
                letters.append(pp.three_to_one(residue.get_resname()))
            except KeyError:
                continue
    return ''.join(letters)
def extract_seqs(structure, defmodel):
    '''
    Uses Biopython to count the number of chains and to extract each chain's
    sequence as a list of sequences.

    Only models whose ``id`` equals ``defmodel`` are read.  Standard amino
    acids are written with their one-letter code, anything else as ``'X'``.

    Returns:
        tuple: ``(nchains, seqs, chain_ids)``.  When no model matches
        ``defmodel`` the result is ``(0, [], [])``; this used to raise
        UnboundLocalError.

    Called by: clean_and_sort()
    '''
    nchains = 0
    seqs = []
    chain_ids = []
    for model in structure:
        if model.id != defmodel:
            continue
        seqs = []
        chain_ids = []
        for chain in model:
            nchains += 1
            seqlist = []
            for residue in chain:
                resname = residue.get_resname()
                if bpp_poly.is_aa(resname, standard=True):
                    seqlist.append(bpp_poly.three_to_one(resname))
                else:
                    seqlist.append('X')
            seqs.append("".join(seqlist))
            chain_ids.append(chain.id)
    return nchains, seqs, chain_ids
def get_structure_seqrecords(model):
    """Get a list of SeqRecords, one per chain, with a model's sequences.

    Each record's id is the chain id and its ``structure_resnums`` letter
    annotation holds one entry per sequence letter: the residue number, or
    infinity for a gap-filling ``X``.

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B",
          both residues are written out. Example: 9LPR
        - Non-standard residues (e.g. HETATMs) are skipped; the gap they
          leave in the numbering is written as "X" (unknown amino acid) when
          the next standard residue is reached.

    Args:
        model: Biopython Model object of a Structure

    Returns:
        list: List of SeqRecords

    """

    structure_seq_records = []

    # Loop over each chain of the PDB
    for chain in model:

        # Residue number the previous letter corresponds to; starts at 0, so
        # a chain whose first residue number is above 1 gets leading X's.
        tracker = 0
        chain_seq = ''
        chain_resnums = []

        # Loop over the residues
        for res in chain.get_residues():
            # Bio.PDB residue id: (hetero flag, residue number, insertion code)
            res_id = res.id
            res_num = res_id[1]
            res_icode = res_id[2]

            # Only standard residues are written directly. A non-standard
            # residue (ie. selenomethionine) is skipped here and shows up as
            # an X when the next standard residue fills the numbering gap.
            if Polypeptide.is_aa(res, standard=True):
                end_tracker = res_num
                res_aa_one = Polypeptide.three_to_one(res.get_resname())

                # Residue number is not the direct successor of the tracker
                if end_tracker != (tracker + 1):
                    if res_icode != ' ':
                        # Residue with an insertion code: written without
                        # any gap filling.
                        chain_seq += res_aa_one
                        chain_resnums.append(res_num)
                        # NOTE(review): the tracker is moved one past this
                        # residue's number here - confirm that is intended.
                        tracker = end_tracker + 1
                        continue
                    else:
                        # Fill the numbering gap with X's; a non-positive
                        # multiplier adds nothing.
                        multiplier = (end_tracker - tracker - 1)
                        chain_seq += 'X' * multiplier
                        # Residue numbers for unresolved or nonstandard residues are Infinite
                        chain_resnums.extend([float("Inf")] * multiplier)

                chain_seq += res_aa_one
                chain_resnums.append(res_num)
                tracker = end_tracker

            else:
                continue

        chain_seq_record = SeqRecord(Seq(chain_seq, IUPAC.protein), id=chain.get_id())
        chain_seq_record.letter_annotations['structure_resnums'] = chain_resnums
        structure_seq_records.append(chain_seq_record)

    return structure_seq_records
def build_matrix(
        path: str,
        filename: str,
        truncate_log: Union[tqdm.tqdm, None] = None) -> BuildMatrixDict:
    """Build the input matrix for one protein.

    The matrix has 10 rows and 4000 columns, one column per residue of the
    first model (all chains, in order).  For a standard amino acid the column
    holds its code, the rounded mean x/y/z of its atoms and six properties
    from ``amino_acid``; for any other residue the column stays 0.  Once the
    4000 columns are used up and another standard amino acid is met, filling
    stops and ``truncate_log`` (if given) is told the protein is truncated.

    Args:
        path: path of the pdb file.
        filename: name of the file (without extension).
        truncate_log: tqdm logger

    Returns:
        Build matrix dictionary
    """
    PROTEIN_SEQ_MAX_LEN = 4000
    protein_matrix = [[0] * PROTEIN_SEQ_MAX_LEN for _ in range(10)]

    protein_structure = PDBParser().get_structure(filename, path)
    first_model = list(protein_structure.get_models())[0]
    protein_chains = list(first_model.get_chains())

    # Rows 4..9, in order, come from these amino_acid properties.
    property_keys = (
        "hydropathy",
        "hydropathy_index",
        "acidity_basicity",
        "mass",
        "isoelectric_point",
        "charge",
    )

    col = 0
    try:
        for chain in protein_chains:
            for residue in list(chain.get_residues()):
                if Polypeptide.is_aa(residue.get_resname(), standard=True):
                    vectors = [atom.get_vector()
                               for atom in list(residue.get_atoms())]

                    # Position of the residue: rounded mean of its atoms.
                    x = round(mean([vec[0] for vec in vectors]))
                    y = round(mean([vec[1] for vec in vectors]))
                    z = round(mean([vec[2] for vec in vectors]))

                    # One letter code.
                    code = Polypeptide.three_to_one(residue.get_resname())
                    aa = amino_acid[code]

                    # Writing past the last column raises the IndexError
                    # handled below.
                    protein_matrix[0][col] = aa["code"]
                    protein_matrix[1][col] = x
                    protein_matrix[2][col] = y
                    protein_matrix[3][col] = z
                    for row, key in enumerate(property_keys, start=4):
                        protein_matrix[row][col] = aa[key]

                # The column advances even when the residue is not an amino
                # acid; 0 is kept at that position.
                col = col + 1
    except IndexError:
        if truncate_log is not None:
            truncate_log.set_description_str(
                f"Protein {filename} is truncated.")

    # Prepare dict so it can be loaded into a vaex dataframe.
    dic: BuildMatrixDict = {
        "seq": [[]],
        "x_pos": [[]],
        "y_pos": [[]],
        "z_pos": [[]],
        "hydropathy": [[]],
        "hydropathy_index": [[]],
        "acidity_basicity": [[]],
        "mass": [[]],
        "isoelectric_point": [[]],
        "charge": [[]],
    }

    for i, row_values in enumerate(protein_matrix):
        dic[col_name[i]] = pyarrow.array([list(row_values)])

    return dic
def get_structure_seqs(pdb_file, file_type):
    """Get a dictionary of a PDB file's sequences.

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B",
          both residues are written out. Example: 9LPR
        - Non-standard residues (e.g. HETATMs) are skipped; the gap they
          leave in the numbering is written as "X" (unknown amino acid) when
          the next standard residue is reached.

    Args:
        pdb_file: Path to PDB file
        file_type: not used by this function

    Returns:
        dict: Dictionary of:
        {chain_id: sequence}

    """
    # TODO: check capitalization of chain IDs in mmcif files. Example: 5afi -
    # chain "l" is present but Biopython seems to capitalize it to chain L.

    # Only the first model is read.
    my_structure = StructureIO(pdb_file)
    model = my_structure.first_model

    structure_seqs = {}

    for chain in model:
        pieces = []
        last_num = 0

        for res in chain.get_residues():
            # Non-standard residues (ie. selenomethionine) are skipped; the
            # gap is filled with X's at the next standard residue.
            if not Polypeptide.is_aa(res, standard=True):
                continue

            res_id = res.get_full_id()[3]
            res_num = res_id[1]
            i_code = res_id[2]
            aa = Polypeptide.three_to_one(res.get_resname())

            if res_num != (last_num + 1):
                if i_code != ' ':
                    # Residue with an insertion code: no gap filling.
                    pieces.append(aa)
                    last_num = res_num + 1
                    continue
                # Fill the numbering gap with X's.
                pieces.append('X' * (res_num - last_num - 1))

            pieces.append(aa)
            last_num = res_num

        structure_seqs[chain.get_id()] = ''.join(pieces)

    return structure_seqs
def create_residues(self, args):
    """Create Residue records, with generic numbers, from residue dump files.

    Each name in ``args`` is a file inside ``self.dump_source_dir``.  Every
    line holds ten comma-separated, optionally double-quoted fields: id,
    residue number, three-letter residue name, Oliveira number, GPCRdb
    number, BW number, two further numbers, protein entry name and segment
    slug.

    For every line whose protein conformation exists and whose residue is
    not yet in the database, a Residue is saved, its protein segment is
    looked up and, when the line carries generic numbers, the default,
    sequence-based and structure-based ResidueGenericNumber records are
    fetched or created and attached.

    Conversion tables are read from ``mapping_<scheme>.txt`` files in
    ``self.generic_numbers_source_dir``.

    Args:
        args: iterable of dump file names.
    """
    # Numbering schemes: 'structure' schemes name the 'sequence' scheme
    # their labels are combined with.
    schemes = {
        'gpcrdb': {'type': False},
        'gpcrdba': {
            'type': 'structure',
            'seq_based': 'bw',
        },
        'gpcrdbb': {
            'type': 'structure',
            'seq_based': 'woot',
        },
        'gpcrdbc': {
            'type': 'structure',
            'seq_based': 'pin',
        },
        'gpcrdbf': {
            'type': 'structure',
            'seq_based': 'wang',
        },
        'bw': {'type': 'sequence'},
        'woot': {'type': 'sequence'},
        'pin': {'type': 'sequence'},
        'wang': {'type': 'sequence'},
    }

    # Attach the database object and, when a mapping file exists, the
    # conversion table (first column -> second column) to every scheme.
    for scheme_name, scheme in schemes.items():
        schemes[scheme_name]['obj'] = ResidueNumberingScheme.objects.get(slug=scheme_name)
        mapping_file = os.sep.join([self.generic_numbers_source_dir, 'mapping_' + scheme_name + '.txt'])
        if os.path.isfile(mapping_file):
            with open(mapping_file, "r", encoding='UTF-8') as scheme_table_file:
                schemes[scheme_name]['table'] = {}
                for row in scheme_table_file:
                    split_row = shlex.split(row)
                    schemes[scheme_name]['table'][split_row[0]] = split_row[1]

    # Entry names already known to be absent, so their lines are skipped
    # without another query.
    missing_proteins = []
    self.logger.info('CREATING RESIDUES')
    for arg in args:
        if os.path.exists(os.sep.join([self.dump_source_dir, arg])):
            # NOTE(review): this handle is never closed in this function.
            residue_data_fh = open(os.sep.join([self.dump_source_dir, arg]), 'r')
            self.logger.info('Parsing residue data from {}'.format(arg))
        else:
            print("Failed to open file {!s}".format(os.sep.join([self.dump_source_dir, arg])))
            self.logger.error("Failed to open file {!s}".format(os.sep.join([self.dump_source_dir, arg])))
            continue

        for line in residue_data_fh:
            # Whitespace is stripped first, then the surrounding quotes.
            id,res_num,res_name,oli,gpcrdb,bw,bw2,bs,prot_name,sec_str_name = [x.strip().strip('"') for x in line.split(',')] #double strip due to some weird bug...

            if prot_name in missing_proteins:
                continue

            # Check that the protein exists in the db
            try:
                pconf = ProteinConformation.objects.get(protein__entry_name=prot_name,
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
            except ProteinConformation.DoesNotExist as e:
                missing_proteins.append(prot_name)
                continue

            # Skip the line if the residue already exists in the db
            try:
                Residue.objects.get(protein_conformation=pconf.id, sequence_number=res_num)
                continue
            except Residue.DoesNotExist as e:
                pass

            r = Residue()
            r.protein_conformation = pconf
            r.sequence_number = int(res_num)
            r.amino_acid = polypeptide.three_to_one(res_name.upper())

            # Not used further in this function.
            generic_numbers = []

            # First save, before the generic numbers are attached below.
            try:
                r.save()
                self.logger.info('Created residue {:n}{!s} for protein {!s}'.format(r.sequence_number,
                    r.amino_acid, pconf.protein.entry_name))
            except Exception as msg:
                print(msg)
                self.logger.error('Failed to create residue {:n}{!s} for protein {!s}'.format(
                    r.sequence_number, r.amino_acid, pconf.protein.entry_name))
                continue

            # residue segment
            dump_segment = sec_str_name
            try:
                r.protein_segment = ProteinSegment.objects.get(slug=dump_segment)
            except:
                # NOTE(review): bare except; any failure is only logged and
                # processing of the residue continues.
                self.logger.error('Failed to fetch protein segment {}'.format(dump_segment))

            # generic number
            if (str(oli) != '0' and gpcrdb != 'None' and bw != 'None'):
                # separate bulge number (1241 - > 124 + 1)
                bulge_prime = ''
                dump_oliveira = str(oli)
                if len(dump_oliveira) == 4:
                    bulge_prime = dump_oliveira[3]
                    dump_oliveira = dump_oliveira[:3]
                dump_gpcrdb = gpcrdb[:4]
                dump_seq_based = bw

                # default gpcrdb number: fetched, or created when missing
                def_gpcrdb = False
                if dump_oliveira in schemes[settings.DEFAULT_NUMBERING_SCHEME]['table']:
                    default_label = (schemes[settings.DEFAULT_NUMBERING_SCHEME]['table'][dump_oliveira]
                        + bulge_prime)
                    try:
                        def_gpcrdb = ResidueGenericNumber.objects.get(label=default_label,
                            scheme=schemes[settings.DEFAULT_NUMBERING_SCHEME]['obj'])
                    except ResidueGenericNumber.DoesNotExist as e:
                        def_gpcrdb = ResidueGenericNumber()
                        def_gpcrdb.label = default_label
                        def_gpcrdb.scheme = schemes[settings.DEFAULT_NUMBERING_SCHEME]['obj']
                        def_gpcrdb.protein_segment = r.protein_segment
                        def_gpcrdb.save()
                        self.logger.info('Created generic number {:s} in numbering scheme {:s}'
                            .format(default_label,
                            schemes[settings.DEFAULT_NUMBERING_SCHEME]['obj'].short_name))

                # if default number was found/added successfully, process the alternative numbers
                if def_gpcrdb:
                    # add default generic number to residue record
                    r.generic_number = def_gpcrdb

                    # dict of sequence-based numbers, for use in structure-based numbers (5.46x461)
                    seq_based_labels = {}

                    # sequence-based schemes first (the sequence-based numbers are needed for the
                    # structure based schemes)
                    for scheme_name, scheme in schemes.items():
                        if scheme['type'] == 'sequence':
                            # is this number in the scheme defined for this protein?
                            if scheme_name == schemes[pconf.protein.residue_numbering_scheme.slug]['seq_based']:
                                seq_based_label = dump_seq_based
                            # if not convert the number to the correct scheme
                            else:
                                slug = pconf.protein.residue_numbering_scheme.slug
                                # NOTE(review): when no table entry matches,
                                # seq_based_label keeps its previous value
                                # (or is unbound on first use).
                                for d, c in schemes[schemes[slug]['seq_based']]['table'].items():
                                    if c == dump_seq_based:
                                        seq_based_label = scheme['table'][d]
                                        break

                            # fetch/insert the number
                            try:
                                seq_based = ResidueGenericNumber.objects.get(label=seq_based_label,
                                    scheme=scheme['obj'])
                            except ResidueGenericNumber.DoesNotExist as e:
                                seq_based = ResidueGenericNumber()
                                seq_based.label = seq_based_label
                                seq_based.scheme = scheme['obj']
                                seq_based.protein_segment = r.protein_segment
                                seq_based.save()
                            r.alternative_generic_numbers.add(seq_based)

                            # add added number to the dict for later use
                            seq_based_labels[scheme_name] = seq_based_label

                    # structure-based numbers
                    for scheme_name, scheme in schemes.items():
                        if scheme['type'] == 'structure':
                            # is this number in the scheme defined for this protein?
                            if scheme_name == pconf.protein.residue_numbering_scheme.slug:
                                struct_based_label = dump_gpcrdb + bulge_prime
                            # if not convert the number to the correct scheme
                            else:
                                # NOTE(review): when no table entry matches,
                                # struct_based_label keeps its previous
                                # value (or is unbound on first use).
                                for d, c in schemes[pconf.protein.residue_numbering_scheme.slug]['table'].items():
                                    if c == dump_gpcrdb:
                                        struct_based_label = scheme['table'][d] + bulge_prime
                                        break

                            # add the sequence-based label (5x461 -> 5.46x461)
                            split_struct_based_label = struct_based_label.split('x')
                            struct_based_label = (seq_based_labels[scheme['seq_based']] + 'x'
                                + split_struct_based_label[1])

                            # fetch/insert the number
                            try:
                                struct_based = ResidueGenericNumber.objects.get(
                                    label=struct_based_label, scheme=scheme['obj'])
                            except ResidueGenericNumber.DoesNotExist as e:
                                struct_based = ResidueGenericNumber()
                                struct_based.label = struct_based_label
                                struct_based.scheme = scheme['obj']
                                struct_based.protein_segment = r.protein_segment
                                struct_based.save()

                            # add to residue as a display number or alternative number?
                            if scheme_name == pconf.protein.residue_numbering_scheme.slug:
                                r.display_generic_number = struct_based
                            else:
                                r.alternative_generic_numbers.add(struct_based)

            # Second save: stores the attributes assigned to r after the
            # first save.
            # NOTE(review): the nesting level of this final save could not
            # be confirmed - verify whether it should run for every residue
            # or only for those with generic numbers.
            try:
                r.save()
                self.logger.info('Added generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                    res_name, pconf.protein.entry_name))
            except Exception as msg:
                print(msg)
                self.logger.error(
                    'Failed to create generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                        res_name, pconf.protein.entry_name))

    self.logger.info('COMPLETED CREATING RESIDUES')
def upload(request):
    """Run the modelling pipeline for an uploaded sequence/template pair.

    On POST the view stores the uploaded target sequence (``proteina``) and
    template structure (``documento``), writes the sequence of the requested
    template chain to a FASTA file, concatenates it with the target sequence,
    aligns both with ``clustalw2``, rewrites the resulting PIR alignment with
    ``structure:`` / ``sequence:`` header records, launches the ``.lnk``
    shortcuts under ``media`` and registers the produced files. Any other
    request method only renders the upload page.

    Args:
        request: Django ``HttpRequest``. A POST must carry the files
            ``proteina`` and ``documento`` and the form fields ``cadeiaS``
            (chain id of the sequence to model) and ``cadeiaT`` (chain id of
            the template).

    Returns:
        The rendered ``pipeline/upload.html`` response. After a POST the
        context contains ``resultado`` and ``diretorios``.
    """
    # Pipeline process finished - only picking the best generated model is
    # still missing. Rodrigo 27/07/2020.
    if request.method == 'POST':
        import shutil  # only needed by this branch

        proteina = request.FILES['proteina']
        template = request.FILES['documento']
        # Chain of the sequence that is going to be modelled
        cadeiaS = request.POST.get('cadeiaS').upper()
        # Chain of the template (used to be hard-coded to "A")
        cadeia = request.POST.get('cadeiaT').upper()

        fs = FileSystemStorage()
        # NOTE(review): Storage.save() returns the name actually used on disk,
        # which can differ from the uploaded name; the rest of the pipeline
        # (including the helpers below) keeps using the uploaded name - confirm.
        fs.save(proteina.name, proteina)
        fs.save(template.name, template)

        caminho_seq = "media\\" + proteina.name
        caminho_template = "media\\" + template.name
        caminho_fasta = caminho_template + ".fasta"

        # First and highest residue number seen on the template chain. ``None``
        # marks "not seen yet" so that a residue numbered 0 (or a chain with
        # only negative numbers) is recorded correctly.
        comeco = None
        fim = None
        with open(caminho_fasta, "w") as w, open(caminho_template) as pdb:
            w.write(">" + template.name + "\n")
            for linha in pdb:
                # One letter per C-alpha ATOM record of the requested chain.
                if (linha[0:4] == "ATOM" and linha[21] == cadeia
                        and linha[13:15] == 'CA'):
                    numero = int(linha[22:26])
                    if comeco is None:
                        comeco = numero
                    if fim is None or numero > fim:
                        fim = numero
                    w.write(Polypeptide.three_to_one(linha[17:20]))
            w.write("\n")
        if comeco is None:
            comeco = 0
        if fim is None:
            fim = 0

        # Template FASTA followed by the target sequence. Done in Python
        # instead of ``type a > b`` so the uploaded file names never reach a
        # shell command line.
        with open("media\\alinha.fasta", "wb") as combinado:
            for origem in (caminho_fasta, caminho_seq):
                with open(origem, "rb") as parte:
                    shutil.copyfileobj(parte, combinado)

        # run clustal-w (constant command line, no user input involved)
        os.system("clustalw2 -infile=media\\alinha.fasta -output=pir")

        # Length of the target sequence, ignoring FASTA header lines.
        with open(caminho_seq) as seq:
            seq_final = "".join(
                linha.strip() for linha in seq if linha[0] != ">")
        tamanho_seq = len(seq_final)
        print("tamanho da seq = "+str(tamanho_seq))

        tipo = 0  # 0 = PDB; 1 = SEQ
        with open("media\\alinha.pir") as aln, \
                open("media\\new_alinha.pir", "w") as new_aln:
            for linha in aln:
                if linha[0] == ">":
                    if tipo == 0:
                        # First header: the template structure record.
                        new_aln.write(">P1;"+template.name+"\n")
                        new_aln.write("structure:"+template.name+":"+str(comeco)+":"+cadeia+":"+str(fim)+":"+cadeia+"::::\n")
                        tipo = tipo+1
                    elif tipo == 1:
                        # Later headers: the sequence that will be modelled.
                        new_aln.write(">P1;"+proteina.name+"\n")
                        new_aln.write("sequence:"+proteina.name+":"+str(1)+":"+str(cadeiaS)+":"+str(tamanho_seq)+":"+str(cadeiaS)+"::::")
                else:
                    new_aln.write(linha)

        # ********************************************************************
        # modeller
        # ********************************************************************
        criaScript(proteina, template)
        os.system("media\\Modeller.lnk")
        # Update the dynamic .bat file so that it matches the folder
        atualizaArquivoBAT(proteina.name)
        max_id = inserirDiretorio(proteina.name)
        os.system("media\\clear.lnk")
        inserirArquivos(proteina.name, max_id)

        d = Banco()
        diretorios = d.getDiretorios()
        print(diretorios[0][0])
        # TODO: still unfinished here (see the note at the top of the view).
        return render(request, 'pipeline/upload.html',
                      {'resultado': '1', 'diretorios': diretorios[0:]})

    # Non-POST: the directory list is fetched but not passed to the template.
    d = Banco()
    diretorios = d.getDiretorios()
    return render(request, 'pipeline/upload.html')
def handle(self, *args, **options):
    """Build alpha-subunit records for signalling-protein complexes.

    Reads the options ``purge``, ``only_signprot_structures``, ``s`` and
    ``debug``.

    * ``purge``: delete the previously built ``*_a`` Residue,
      ProteinConformation and Protein rows of the "Alpha" family as well as
      all SignprotStructure / SignprotStructureExtraProteins rows.
    * unless ``only_signprot_structures``: for every SignprotComplex
      (optionally restricted to the PDB codes in ``s``) create the ``<pdb>_a``
      Protein and ProteinConformation, map every residue of the alpha chain
      onto a residue of the reference protein and store a Residue plus a
      rotamer for each mapped position.
    * unless ``s`` is given: build SignprotStructure objects for the PDB
      entries of G-protein alpha subunits that are not part of a complex.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed copy of this method; please confirm the placement
    of the 7JVQ/6PB0 overrides and of the 6WHA ``elif`` branch.
    """
    startTime = datetime.datetime.now()
    self.options = options
    if self.options["purge"]:
        Residue.objects.filter(
            protein_conformation__protein__entry_name__endswith="_a",
            protein_conformation__protein__family__parent__parent__name=
            "Alpha").delete()
        ProteinConformation.objects.filter(
            protein__entry_name__endswith="_a",
            protein__family__parent__parent__name="Alpha").delete()
        Protein.objects.filter(
            entry_name__endswith="_a",
            family__parent__parent__name="Alpha").delete()
        SignprotStructureExtraProteins.objects.all().delete()
        SignprotStructure.objects.all().delete()
    if not options["only_signprot_structures"]:
        # Building protein and protconf objects for g protein structure in complex
        if options["s"]:
            scs = SignprotComplex.objects.filter(
                structure__pdb_code__index__in=[
                    i.upper() for i in options["s"]
                ])
        else:
            scs = SignprotComplex.objects.all()
        for sc in scs:
            self.logger.info(
                "Protein, ProteinConformation and Residue build for alpha subunit of {} is building"
                .format(sc))
            # Any failure while processing one complex is logged by the
            # handler at the end of this try block and the loop continues.
            try:
                # Alpha subunit
                # Get-or-create of the "<pdb code>_a" protein; the bare
                # except treats every lookup error as "does not exist".
                try:
                    alpha_protein = Protein.objects.get(
                        entry_name=sc.structure.pdb_code.index.lower() + "_a")
                except:
                    alpha_protein = Protein()
                    alpha_protein.entry_name = sc.structure.pdb_code.index.lower(
                    ) + "_a"
                    alpha_protein.accession = None
                    alpha_protein.name = sc.structure.pdb_code.index.lower(
                    ) + "_a"
                    alpha_protein.sequence = sc.protein.sequence
                    alpha_protein.family = sc.protein.family
                    alpha_protein.parent = sc.protein
                    alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                    alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                        slug="mod")
                    alpha_protein.source = ProteinSource.objects.get(
                        name="OTHER")
                    alpha_protein.species = sc.protein.species
                    alpha_protein.save()
                # Same get-or-create pattern for the conformation.
                try:
                    alpha_protconf = ProteinConformation.objects.get(
                        protein__entry_name=sc.structure.pdb_code.index.
                        lower() + "_a")
                except:
                    alpha_protconf = ProteinConformation()
                    alpha_protconf.protein = alpha_protein
                    alpha_protconf.state = ProteinState.objects.get(
                        slug="active")
                    alpha_protconf.save()
                pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                s = pdbp.get_structure("struct",
                                       StringIO(sc.structure.pdb_data.pdb))
                chain = s[0][sc.alpha]
                # Residue numbers of the alpha chain that have a CA atom and a
                # blank hetero flag.
                nums = []
                for res in chain:
                    if "CA" in res and res.id[0] == " ":
                        nums.append(res.get_id()[1])
                resis = Residue.objects.filter(
                    protein_conformation__protein=sc.protein)
                # num_i and temp_seq2 are not read again in this method.
                num_i = 0
                temp_seq2 = ""
                # pdb_num_dict: PDB residue number -> [structure residue,
                # reference residue].
                pdb_num_dict = OrderedDict()
                # Create first alignment based on sequence numbers
                for n in nums:
                    # 6OIJ: residues numbered below 30 are offset by 6.
                    if sc.structure.pdb_code.index == "6OIJ" and n < 30:
                        nr = n + 6
                    else:
                        nr = n
                    pdb_num_dict[n] = [
                        chain[n], resis.get(sequence_number=nr)
                    ]
                # Find mismatches
                mismatches = []
                for n, res in pdb_num_dict.items():
                    if AA[res[0].get_resname()] != res[1].amino_acid:
                        mismatches.append(res)
                pdb_lines = sc.structure.pdb_data.pdb.split("\n")
                seqadv = []
                for l in pdb_lines:
                    if l.startswith("SEQADV"):
                        seqadv.append(l)
                # mutations: residue number -> [group 1, group 5] of the
                # SEQADV match; shifted_mutations additionally stores
                # int(group 6) as a third element.
                mutations, shifted_mutations = OrderedDict(), OrderedDict()
                # Search for annotated engineered mutations in pdb SEQADV
                # (the loop variable rebinds ``s``; the parsed structure is no
                # longer needed because ``chain`` was extracted above).
                for s in seqadv:
                    line_search = re.search(
                        "SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)",
                        s)
                    if line_search != None:
                        # Only records of the alpha chain are considered.
                        if line_search.group(2) == sc.alpha:
                            if line_search.group(
                                    4).strip() == sc.protein.accession:
                                # Groups 3 and 6 equal -> plain mutation,
                                # otherwise a mutation with shifted numbering.
                                if line_search.group(
                                        3) == line_search.group(6):
                                    mutations[int(
                                        line_search.group(3))] = [
                                            line_search.group(1),
                                            line_search.group(5)
                                        ]
                                else:
                                    shifted_mutations[int(
                                        line_search.group(3))] = [
                                            line_search.group(1),
                                            line_search.group(5),
                                            int(line_search.group(6))
                                        ]
                            else:
                                # Exception for 6G79
                                if line_search.group(
                                        3
                                ) != line_search.group(
                                        6
                                ) and "CONFLICT" in line_search.group(7):
                                    mutations[int(
                                        line_search.group(3))] = [
                                            line_search.group(1),
                                            line_search.group(5)
                                        ]
                                # Exception for 5G53
                                if line_search.group(
                                        4).strip() != sc.protein.accession:
                                    mutations[int(
                                        line_search.group(3))] = [
                                            line_search.group(1),
                                            line_search.group(5)
                                        ]
                remaining_mismatches = []
                # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                for m in mismatches:
                    num = m[0].get_id()[1]
                    if num in mutations:
                        if m[0].get_resname() != mutations[num][0] and m[
                                1].amino_acid != AA[mutations[num][1]]:
                            remaining_mismatches.append(m)
                    elif num in shifted_mutations:
                        remaining_mismatches.append(m)
                    else:
                        remaining_mismatches.append(m)
                if options["debug"]:
                    print(sc)
                    print(mutations)
                    print(shifted_mutations)
                    print(mismatches)
                    print("======")
                    print(remaining_mismatches)
                    pprint.pprint(pdb_num_dict)
                # PDB codes for which the pairwise-alignment correction below
                # is skipped.
                no_seqnum_shift = [
                    '6OY9', '6OYA', '6LPB', '6WHA', '7D77', '6XOX', '7L1U',
                    '7L1V'
                ]
                # Check if HN is mutated to GNAI1 for the scFv16 stabilizer
                if sc.protein.entry_name != 'gnai1_human' and len(
                        remaining_mismatches) > 0:
                    target_HN = resis.filter(protein_segment__slug='HN')
                    gnai1_HN = Residue.objects.filter(
                        protein_conformation__protein__entry_name=
                        'gnai1_human',
                        protein_segment__slug='HN')
                    # Structure sequence up to the last HN residue number of
                    # the reference protein.
                    pdb_HN_seq = ''
                    for num, val in pdb_num_dict.items():
                        if num <= target_HN.reverse()[0].sequence_number:
                            pdb_HN_seq += Polypeptide.three_to_one(
                                val[0].get_resname())
                    if options['debug']:
                        print('Checking if HN is gnai1_human')
                        print(pdb_HN_seq)
                        print(''.join(
                            gnai1_HN.values_list('amino_acid', flat=True)))
                    gnai1_HN_seq = ''.join(
                        gnai1_HN.values_list('amino_acid', flat=True))
                    pw2 = pairwise2.align.localms(gnai1_HN_seq, pdb_HN_seq, 3,
                                                  -4, -3, -1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                    # Percent identity over the non-gap positions of the
                    # structure sequence.
                    length, match = 0, 0
                    for r, t in zip(ref_seq, temp_seq):
                        if options['debug']:
                            print(r, t)
                        if t != '-':
                            if r == t:
                                match += 1
                            length += 1
                    # NOTE(review): raises ZeroDivisionError when length is 0;
                    # the outer except then marks this complex as failed.
                    identity = match / length * 100
                    if options['debug']:
                        print(identity)
                    if identity > 85:
                        if sc.structure.pdb_code.index not in ['7DFL']:
                            no_seqnum_shift.append(
                                sc.structure.pdb_code.index)
                        if options['debug']:
                            print(
                                'INFO: HN has {}% with gnai1_human HN, skipping seqnum shift correction'
                                .format(round(identity)))
                # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                if len(
                        remaining_mismatches
                ) > 0 and sc.structure.pdb_code.index not in no_seqnum_shift:
                    ppb = PPBuilder()
                    seq = ""
                    for pp in ppb.build_peptides(chain, aa_only=False):
                        seq += str(pp.get_sequence())
                    # Stricter scoring parameters for three specific entries.
                    if sc.structure.pdb_code.index in [
                            '7JVQ', '7L1U', '7L1V'
                    ]:
                        pw2 = pairwise2.align.localms(
                            sc.protein.sequence, seq, 3, -4, -3, -1)
                    else:
                        pw2 = pairwise2.align.localms(
                            sc.protein.sequence, seq, 2, -1, -.5, -.1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                    # Custom fix for A->G mutation at pos 18
                    if sc.structure.pdb_code.index == '7JJO':
                        ref_seq = ref_seq[:18] + ref_seq[19:]
                        temp_seq = temp_seq[:17] + temp_seq[18:]
                    # Custom alignment fixes
                    elif sc.structure.pdb_code.index == '7DFL':
                        ref_seq = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                        temp_seq = '--------CTLSAEDKAAVERSKMIDRNLREDGEKARRELKLLLLGTGESGKSTFIKQMRIIHG--------------------------------------------------------------------------------------------------------------------------TGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQV----DNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKILYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                    elif sc.structure.pdb_code.index == '7JOZ':
                        temp_seq = temp_seq[:67] + (
                            '-' * 14) + 'FNGDS' + temp_seq[86:]
                    elif sc.structure.pdb_code.index == '7AUE':
                        ref_seq = ref_seq[:31].replace('-', '') + ref_seq[31:]
                        temp_seq = (
                            9 * '-') + temp_seq[2:5] + temp_seq[5:54].replace(
                                '-', '') + temp_seq[54:]
                    # Walk the alignment column by column. j indexes the
                    # reference residues (resis), k the structure residue
                    # numbers (nums). Gap columns are stored under the integer
                    # column index i instead of a residue object.
                    wt_pdb_dict = OrderedDict()
                    pdb_wt_dict = OrderedDict()
                    j, k = 0, 0
                    for i, ref, temp in zip(range(0, len(ref_seq)), ref_seq,
                                            temp_seq):
                        if options["debug"]:
                            print(i, ref, temp)  # alignment check
                        if ref != "-" and temp != "-":
                            wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]]
                            pdb_wt_dict[pdb_num_dict[nums[k]]
                                        [0]] = resis[j]
                            j += 1
                            k += 1
                        elif ref == "-":
                            wt_pdb_dict[i] = pdb_num_dict[nums[k]]
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                            k += 1
                        elif temp == "-":
                            wt_pdb_dict[resis[j]] = i
                            pdb_wt_dict[i] = resis[j]
                            j += 1
                    # Custom fix for 7JJO isoform difference
                    if sc.structure.pdb_code.index in [
                            '7JJO', '7JOZ', '7AUE'
                    ]:
                        # Rebuild the mapping from the alignment, keyed by the
                        # reference sequence number; only entries whose value
                        # is a list (i.e. a structure residue pair) are kept.
                        pdb_num_dict = OrderedDict()
                        for wt_res, st_res in wt_pdb_dict.items():
                            if type(st_res) == type([]):
                                pdb_num_dict[wt_res.sequence_number] = [
                                    st_res[0], wt_res
                                ]
                    else:
                        for i, r in enumerate(remaining_mismatches):
                            # Adjust for shifted residue when residue is a match
                            # NOTE(review): for i == 0 the index i - 1 refers
                            # to the last mismatch (negative indexing).
                            if r[0].get_id()[1] - remaining_mismatches[
                                    i - 1][0].get_id()[1] > 1:
                                pdb_num_dict[r[0].get_id()[1] -
                                             1][1] = pdb_wt_dict[chain[
                                                 r[0].get_id()[1] - 1]]
                            # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                            if r[0].get_id()[1] in shifted_mutations:
                                pdb_num_dict[
                                    r[0].get_id()[1]][1] = resis.get(
                                        sequence_number=shifted_mutations[
                                            r[0].get_id()[1]][2])
                            # Adjust for shift
                            else:
                                pdb_num_dict[r[0].get_id()
                                             [1]][1] = pdb_wt_dict[r[0]]
                        # Hard-coded per-structure overrides of single
                        # positions.
                        if sc.structure.pdb_code.index == '7JVQ':
                            pdb_num_dict[198][1] = Residue.objects.get(
                                protein_conformation__protein=sc.protein,
                                sequence_number=346)
                            pdb_num_dict[235][1] = Residue.objects.get(
                                protein_conformation__protein=sc.protein,
                                sequence_number=383)
                        elif sc.structure.pdb_code.index == '6PB0':
                            pdb_num_dict[205][1] = Residue.objects.get(
                                protein_conformation__protein=sc.protein,
                                sequence_number=205)
                ### Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                elif sc.structure.pdb_code.index == "6WHA":
                    ref_seq = "MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIM--YSHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV"
                    temp_seq = "----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV"
                    pdb_num_dict = OrderedDict()
                    # temp_resis is not read again in this method.
                    temp_resis = [res for res in chain]
                    temp_i = 0
                    # Generic-number labels that have already been assigned.
                    mapped_cgns = []
                    for i, aa in enumerate(temp_seq):
                        if aa != "-":
                            # Reference sequence number = 1-based column
                            # index minus the number of gaps in ref_seq up to
                            # and including this column.
                            ref_split_on_gaps = ref_seq[:i + 1].split("-")
                            ref_seqnum = i - (len(ref_split_on_gaps) - 1) + 1
                            res = resis.get(sequence_number=ref_seqnum)
                            # If this label is already taken, advance with
                            # get_next_presumed_cgn until a free one is found.
                            if res.display_generic_number.label in mapped_cgns:
                                next_presumed_cgn = self.get_next_presumed_cgn(
                                    res)
                                if next_presumed_cgn:
                                    res = next_presumed_cgn
                                    while res and res.display_generic_number.label in mapped_cgns:
                                        res = self.get_next_presumed_cgn(
                                            res)
                                else:
                                    # res is left unchanged here, so the same
                                    # label is appended a second time below.
                                    print(
                                        "Error: {} CGN does not exist. Incorrect mapping of {} in {}"
                                        .format(next_presumed_cgn,
                                                chain[nums[temp_i]],
                                                sc.structure))
                            mapped_cgns.append(
                                res.display_generic_number.label)
                            pdb_num_dict[nums[temp_i]] = [
                                chain[nums[temp_i]], res
                            ]
                            temp_i += 1
                # Persist one Residue per mapped position and collect the
                # rotamers for a single bulk insert.
                bulked_rotamers = []
                for key, val in pdb_num_dict.items():
                    # print(key, val) # sanity check
                    # An int in val[1] is an alignment column index, i.e. no
                    # reference residue was found for this position.
                    if not isinstance(val[1], int):
                        res_obj = Residue()
                        res_obj.sequence_number = val[0].get_id()[1]
                        res_obj.amino_acid = AA[val[0].get_resname()]
                        res_obj.display_generic_number = val[
                            1].display_generic_number
                        res_obj.generic_number = val[1].generic_number
                        res_obj.protein_conformation = alpha_protconf
                        res_obj.protein_segment = val[1].protein_segment
                        res_obj.save()
                        rot = self.create_structure_rotamer(
                            val[0], res_obj, sc.structure)
                        bulked_rotamers.append(rot)
                    else:
                        self.logger.info(
                            "Skipped {} as no annotation was present, while building for alpha subunit of {}"
                            .format(val[1], sc))
                if options["debug"]:
                    pprint.pprint(pdb_num_dict)
                Rotamer.objects.bulk_create(bulked_rotamers)
                self.logger.info(
                    "Protein, ProteinConformation and Residue build for alpha subunit of {} is finished"
                    .format(sc))
            except Exception as msg:
                # The exception text is only printed in debug mode; the log
                # entry does not include it.
                if options["debug"]:
                    print("Error: ", sc, msg)
                self.logger.info(
                    "Protein, ProteinConformation and Residue build for alpha subunit of {} has failed"
                    .format(sc))
    if not options["s"]:
        ### Build SignprotStructure objects from non-complex signprots
        g_prot_alphas = Protein.objects.filter(
            family__slug__startswith="100_001",
            accession__isnull=False)  #.filter(entry_name="gnai1_human")
        complex_structures = SignprotComplex.objects.all().values_list(
            "structure__pdb_code__index", flat=True)
        for a in g_prot_alphas:
            pdb_list = get_pdb_ids(a.accession)
            for pdb in pdb_list:
                # Skip PDB entries that already belong to a complex.
                if pdb not in complex_structures:
                    try:
                        data = self.fetch_gprot_data(pdb, a)
                        if data:
                            self.build_g_prot_struct(a, pdb, data)
                    except Exception as msg:
                        self.logger.error(
                            "SignprotStructure of {} {} failed\n{}: {}".
                            format(a.entry_name, pdb, type(msg), msg))
    if options["debug"]:
        # Total run time of the command.
        print(datetime.datetime.now() - startTime)
return ppl


# Script entry point: write per-residue angles of one chain to "<pdb>.angles".
# Usage: <script> <pdb id> <chain id>; the structure is read from "<pdb id>.pdb".
if __name__ == '__main__':
    pdb = argv[1]
    cha = argv[2]
    output = open(argv[1]+".angles","w")
    for model in Bio.PDB.PDBParser().get_structure(pdb, pdb+".pdb"):
        chain = model[cha]
        # One header line per model.
        output.write("##### PDB "+pdb+" Chain "+chain.get_id()+"\n" )
        polypeptides = Bio.PDB.PPBuilder().build_peptides(chain)
        for poly_index, poly in enumerate(polypeptides) :
            # chain_angle is filled below but not read again in this script.
            chain_angle = []
            for j, angles_list in enumerate(get_angle_list(poly)):
                torsion_angles = []
                resseq = poly[j].id[1]
                for i in range(len(angles_list)):
                    # 57.2957795 ~= 180/pi, i.e. the values are presumably
                    # radians being converted to degrees - TODO confirm.
                    # Entries that cannot be converted to float are replaced
                    # by the "-999" marker.
                    try:
                        torsion_angles.append(float(angles_list[i])*57.2957795)
                    except:
                        torsion_angles.append("-999")
                # Line layout: residue number, one-letter code, then the
                # first two angles (tab separated).
                output.write(str(resseq)+"\t"+Polypeptide.three_to_one(poly[j].resname))
                for angle in torsion_angles[0:2]:
                    if angle == "-999":
                        output.write("\t-999")
                    else:
                        output.write("\t{0:.2f}".format(angle))
                chain_angle.append(torsion_angles)
                output.write("\n")
    output.close()
def get_chain_sequence(self, chain):
    """Return the one-letter sequence of the recognised residues in *chain*.

    Residues whose ``resname`` is not in ``self.residue_list`` are skipped.
    For the remaining ones any ``HID`` in the name is replaced by ``HIS``
    before the three-letter code is converted to its one-letter form.
    """
    letters = []
    for residue in chain:
        # The membership test uses the unmodified residue name.
        if residue.resname not in self.residue_list:
            continue
        three_letter = residue.resname.replace('HID', 'HIS')
        letters.append(polypeptide.three_to_one(three_letter))
    return "".join(letters)