def side_chain_placement(ag_to_place, current_reference_ag, rotamer_manager):
  """
  Rebuild the side chain of ag_to_place from current_reference_ag.

  Works with poly_gly truncated hierarchy.
  Also used in fix_rama_outliers.

  ag_to_place is renamed after the reference atom group. Unless the
  residue is glycine, a detached copy of the reference atom group is
  superposed onto ag_to_place using the N, CA and C atoms, moved to the
  coordinates returned by rotamer_manager.nearest_rotamer_sites_cart(),
  and every atom other than N, CA, C, O is copied (with uij erased) into
  ag_to_place.

  Raises Sorry when fewer than three N/CA/C atoms are found in
  current_reference_ag.
  """
  ref_resname = current_reference_ag.resname.upper()
  one_letter = one_three.get(ref_resname, None)
  # Residues without a one-letter code keep their own name
  # (seems to work with unusual residues...).
  ag_to_place.resname = three_one.get(one_letter, ref_resname)
  if one_letter == 'G':
    return

  def _backbone_sites(atom_group):
    # Coordinates of the C, CA and N atoms, in atom order.
    sites = flex.vec3_double()
    for atom in atom_group.atoms():
      if atom.name.strip() in ("C", "CA", "N"):
        sites.append(atom.xyz)
    return sites

  # align residue from ideal_res_dict to just placed ALA (ag_to_place)
  # or from pdb_hierarchy_template
  fixed_sites = _backbone_sites(ag_to_place)
  moving_sites = _backbone_sites(current_reference_ag)
  assert len(fixed_sites) == 3
  if len(moving_sites) < 3:
    error_msg = "C, CA or N atoms are absent in secondary structure element." +\
        "\nPlease add them to the model and try again."
    raise Sorry(error_msg)
  assert len(moving_sites) == 3

  fit = superpose.least_squares_fit(
      reference_sites=fixed_sites,
      other_sites=moving_sites)
  ideal_correct_ag = current_reference_ag.detached_copy()
  superposed_xyz = \
      fit.r.elems * ideal_correct_ag.atoms().extract_xyz() + fit.t.elems
  ideal_correct_ag.atoms().set_xyz(superposed_xyz)
  ideal_correct_ag.atoms().set_xyz(
      rotamer_manager.nearest_rotamer_sites_cart(ideal_correct_ag))

  n_atoms = len(ideal_correct_ag.atoms())
  if n_atoms <= 4:
    # This means something wrong with input model, e.g. only 3 atoms in
    # the residue and they happened to be N, CA, C
    return
  ag_to_place.pre_allocate_atoms(number_of_additional_atoms=n_atoms - 4)
  for atom in ideal_correct_ag.atoms():
    if atom.name.strip() in ("N", "CA", "C", "O"):
      continue
    side_chain_atom = atom.detached_copy()
    side_chain_atom.uij_erase()
    ag_to_place.append_atom(atom=side_chain_atom)
def correct_sequence(pdb_hierarchy,
    sequences,
    truncate_to_cbeta=False,
    out=sys.stdout):
  """
  Modify the sequence for the pdb hierarchy to match that of the aligned
  sequence.  This will remove incompatible atoms; the sidechains will still
  need to be extended separately.  For proteins only - mismatches in nucleic
  acids will only result in a warning.

  :param pdb_hierarchy: iotbx.pdb.hierarchy.root object
  :param sequences: list of iotbx.bioinformatics.sequence objects
  :param truncate_to_cbeta: chop off entire sidechain to C-beta (default: leave
    common atoms in place)
  :param out: output filehandle (default = stdout)
  :returns: number of atom_group objects renamed
  """
  from mmtbx.monomer_library import idealized_aa
  import mmtbx.validation.sequence
  from iotbx.pdb.amino_acid_codes import three_letter_given_one_letter
  seq_validation = mmtbx.validation.sequence.validation(
    pdb_hierarchy=pdb_hierarchy,
    sequences=sequences,
    log=out)
  for chain_seq in seq_validation.chains:
    if (chain_seq.chain_type == mmtbx.validation.sequence.NUCLEIC_ACID):
      if (len(chain_seq.mismatch) > 0):
        # The format string takes the mismatch count AND the chain id;
        # passing only the chain id raised TypeError.
        print(" WARNING: will skip %d mismatches in nucleic acid chain '%s'" % \
          (len(chain_seq.mismatch), chain_seq.chain_id), file=out)
  # Atom names allowed for each idealized residue (entries whose key
  # contains "_h" are skipped).
  res_dict = idealized_aa.residue_dict()
  expected_names = {}
  for resname in res_dict.keys():
    if ("_h" not in resname):
      ideal_res = res_dict[resname]
      expected_names[resname] = set([a.name for a in ideal_res.atoms()])
  n_changed = 0
  for chain in pdb_hierarchy.only_model().chains():
    if (not chain.is_protein()):
      continue
    for chain_seq in seq_validation.chains:
      if (chain.id != chain_seq.chain_id) or (len(chain_seq.mismatch) == 0):
        continue
      for residue_group in chain.residue_groups():
        resid = residue_group.resid()
        if (resid not in chain_seq.mismatch):
          continue
        idx = chain_seq.mismatch.index(resid)
        new_code = chain_seq.actual_code[idx]
        new_resname = three_letter_given_one_letter.get(new_code)
        if (new_resname is None):
          continue
        expected_atoms = expected_names[new_resname.lower()]
        if (truncate_to_cbeta):
          expected_atoms = expected_names["ala"]
        atom_groups = residue_group.atom_groups()
        # Remember the name before renaming so the log line shows the
        # real "old --> new" transition.
        old_resname = None
        if (len(atom_groups) > 0):
          old_resname = atom_groups[0].resname
        # Count removed atoms over all atom groups of this residue.
        n_removed = 0
        for atom_group in atom_groups:
          n_changed += 1
          atom_group.resname = new_resname
          for atom in atom_group.atoms():
            if (atom.name not in expected_atoms):
              atom_group.remove_atom(atom)
              n_removed += 1
        print(" chain '%s' %s %s --> %s (%d atoms removed)" % \
          (chain.id, resid, old_resname, new_resname, n_removed), file=out)
  pdb_hierarchy.atoms().reset_i_seq()
  return n_changed
def get_aa_parent(code):
  """
  Map a residue code to a three-letter code via modified_aa_names.lookup.

  code is upper-cased for the lookup.  When the lookup has no (truthy)
  entry, code is returned unchanged; otherwise the looked-up one-letter
  value is translated with three_letter_given_one_letter, giving None if
  that table has no entry for it.
  """
  parent_letter = modified_aa_names.lookup.get(code.upper(), False)
  if parent_letter:
    return three_letter_given_one_letter.get(parent_letter, None)
  return code
# Regression test for sequence_as_cif_block().  For each embedded model
# (mmCIF text for 4ehz, 3zdi, 4gln, 1ezu, 2hok; PDB text for 3tpy, 3tgr,
# 2im9) it builds an mmtbx.model.manager from iotbx.pdb.input, attaches the
# sequence(s) with set_sequences(), and asserts on items of the CIF block
# returned by model._sequence_validation.sequence_as_cif_block()
# (_entity_poly.*, _entity_poly_seq.*, _entity.id).  The 3tpy case instead
# checks _atom_site.label_seq_id from model.get_hierarchy().as_cif_block().
# NOTE(review): the embedded coordinate text below has had its original
# line breaks/column spacing collapsed; restore from version control before
# relying on the fixed-column PDB records.
def exercise_pdb_hierarchy_sequence_as_cif_block(): pdb_atom_site_loop_header = """\ data_mmcif loop_ _atom_site.group_PDB _atom_site.id _atom_site.type_symbol _atom_site.label_atom_id _atom_site.label_alt_id _atom_site.label_comp_id _atom_site.label_asym_id _atom_site.label_entity_id _atom_site.label_seq_id _atom_site.pdbx_PDB_ins_code _atom_site.Cartn_x _atom_site.Cartn_y _atom_site.Cartn_z _atom_site.occupancy _atom_site.B_iso_or_equiv _atom_site.Cartn_x_esd _atom_site.Cartn_y_esd _atom_site.Cartn_z_esd _atom_site.occupancy_esd _atom_site.B_iso_or_equiv_esd _atom_site.pdbx_formal_charge _atom_site.auth_seq_id _atom_site.auth_comp_id _atom_site.auth_asym_id _atom_site.auth_atom_id _atom_site.pdbx_PDB_model_num """ # simple example with multiple copies of chain input_4ehz = """\ ATOM 2 C CA . GLU A 1 6 ? -35.647 65.380 -11.775 1.00 65.78 ? ? ? ? ? ? 858 GLU A CA 1 ATOM 11 C CA . LYS A 1 7 ? -34.996 68.963 -10.712 1.00 89.52 ? ? ? ? ? ? 859 LYS A CA 1 ATOM 20 C CA . LYS A 1 8 ? -31.415 68.325 -9.529 1.00 98.54 ? ? ? ? ? ? 860 LYS A CA 1 ATOM 29 C CA . PRO A 1 9 ? -29.858 70.569 -6.813 1.00 103.45 ? ? ? ? ? ? 861 PRO A CA 1 ATOM 36 C CA . ALA A 1 10 ? -26.545 72.463 -7.079 1.00 98.87 ? ? ? ? ? ? 862 ALA A CA 1 ATOM 41 C CA . THR A 1 11 ? -23.410 70.412 -7.767 1.00 90.75 ? ? ? ? ? ? 863 THR A CA 1 ATOM 48 C CA . GLU A 1 12 ? -21.306 71.534 -4.804 1.00 75.15 ? ? ? ? ? ? 864 GLU A CA 1 ATOM 57 C CA . VAL A 1 13 ? -17.543 70.954 -4.809 1.00 49.52 ? ? ? ? ? ? 865 VAL A CA 1 ATOM 64 C CA . ASP A 1 14 ? -16.048 68.671 -2.185 1.00 26.98 ? ? ? ? ? ? 866 ASP A CA 1 ATOM 72 C CA . PRO A 1 15 ? -12.276 69.450 -2.061 1.00 27.34 ? ? ? ? ? ? 867 PRO A CA 1 ATOM 79 C CA . THR A 1 16 ? -11.669 65.942 -0.699 1.00 23.73 ? ? ? ? ? ? 868 THR A CA 1 ATOM 86 C CA . HIS A 1 17 ? -13.266 64.157 -3.671 1.00 23.80 ? ? ? ? ? ? 869 HIS A CA 1 ATOM 96 C CA . PHE A 1 18 ? -10.664 63.252 -6.277 1.00 14.88 ? ? ? ? ? ? 870 PHE A CA 1 ATOM 107 C CA . GLU A 1 19 ? -12.022 62.182 -9.666 1.00 23.47 ? ? 
? ? ? ? 871 GLU A CA 1 ATOM 116 C CA . LYS A 1 20 ? -10.351 59.111 -11.117 1.00 17.57 ? ? ? ? ? ? 872 LYS A CA 1 ATOM 125 C CA . ARG A 1 21 ? -10.204 60.546 -14.661 1.00 19.09 ? ? ? ? ? ? 873 ARG A CA 1 ATOM 136 C CA . PHE A 1 22 ? -7.912 63.384 -13.545 1.00 22.03 ? ? ? ? ? ? 874 PHE A CA 1 ATOM 147 C CA . LEU A 1 23 ? -5.613 61.332 -11.271 1.00 18.20 ? ? ? ? ? ? 875 LEU A CA 1 ATOM 155 C CA . LYS A 1 24 ? -2.583 60.745 -13.513 1.00 26.05 ? ? ? ? ? ? 876 LYS A CA 1 ATOM 2365 C CA . VAL B 1 13 ? 38.084 -8.470 -5.157 1.00 57.98 ? ? ? ? ? ? 865 VAL B CA 1 ATOM 2372 C CA . ASP B 1 14 ? 36.468 -6.229 -2.536 1.00 51.96 ? ? ? ? ? ? 866 ASP B CA 1 ATOM 2380 C CA . PRO B 1 15 ? 32.749 -7.130 -2.340 1.00 48.96 ? ? ? ? ? ? 867 PRO B CA 1 ATOM 2387 C CA . THR B 1 16 ? 31.935 -3.705 -0.847 1.00 26.72 ? ? ? ? ? ? 868 THR B CA 1 ATOM 2394 C CA . HIS B 1 17 ? 33.519 -1.814 -3.754 1.00 33.15 ? ? ? ? ? ? 869 HIS B CA 1 ATOM 2404 C CA . PHE B 1 18 ? 31.094 -0.811 -6.488 1.00 26.55 ? ? ? ? ? ? 870 PHE B CA 1 ATOM 2415 C CA . GLU B 1 19 ? 32.359 0.467 -9.861 1.00 38.45 ? ? ? ? ? ? 871 GLU B CA 1 ATOM 2424 C CA . LYS B 1 20 ? 30.409 3.510 -11.036 1.00 33.69 ? ? ? ? ? ? 872 LYS B CA 1 ATOM 2433 C CA . ARG B 1 21 ? 30.400 2.430 -14.663 1.00 36.58 ? ? ? ? ? ? 873 ARG B CA 1 ATOM 2444 C CA . PHE B 1 22 ? 28.294 -0.647 -13.791 1.00 38.39 ? ? ? ? ? ? 874 PHE B CA 1 ATOM 2455 C CA . LEU B 1 23 ? 25.763 1.275 -11.703 1.00 32.87 ? ? ? ? ? ? 875 LEU B CA 1 ATOM 2463 C CA . LYS B 1 24 ? 22.588 1.723 -13.713 1.00 30.22 ? ? ? ? ? ? 
876 LYS B CA 1 """ import iotbx.bioinformatics from iotbx.pdb.amino_acid_codes import three_letter_given_one_letter from cctbx.array_family import flex sequence_4ehz = iotbx.bioinformatics.sequence( "GDIVSEKKPATEVDPTHFEKRFLK") #RIRDLGEGHF" pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header + input_4ehz).splitlines(), source_info=None) model = mmtbx.model.manager(pdb_in) model.set_sequences([sequence_4ehz]) cif_block = model._sequence_validation.sequence_as_cif_block() sequence = ';' + sequence_4ehz.sequence + '\n;' assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == sequence assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][ 0] == sequence assert cif_block['_entity_poly.pdbx_strand_id'] == 'A,B' assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']), list(range(1, 25))) assert cif_block['_entity_poly_seq.entity_id'].all_eq('1') assert list(cif_block['_entity_poly_seq.mon_id']) == [ three_letter_given_one_letter.get(i) for i in sequence_4ehz.sequence ] # # example with modified amino acid - PTR input_3zdi = """\ ATOM 1422 C CA . ASN A 1 179 ? -11.025 -26.833 -3.747 1.00 86.68 ? ? ? ? ? ? 213 ASN A CA 1 ATOM 1430 C CA . VAL A 1 180 ? -7.831 -26.493 -1.696 1.00 82.40 ? ? ? ? ? ? 214 VAL A CA 1 ATOM 1437 C CA . SER A 1 181 ? -8.142 -28.602 1.444 1.00 89.69 ? ? ? ? ? ? 215 SER A CA 1 ATOM 1443 C CA . PTR A 1 182 ? -5.406 -26.622 3.177 1.00 88.05 ? ? ? ? ? ? 216 PTR A CA 1 ATOM 1459 C CA . ILE A 1 183 ? -7.514 -23.621 4.117 1.00 83.90 ? ? ? ? ? ? 217 ILE A CA 1 ATOM 1467 C CA . CYS A 1 184 ? -8.907 -21.533 7.009 1.00 86.39 ? ? ? ? ? ? 218 CYS A CA 1 ATOM 1473 C CA . SER A 1 185 ? -6.795 -21.356 10.148 1.00 91.03 ? ? ? ? ? ? 
219 SER A CA 1 """ sequence_3zdi = iotbx.bioinformatics.sequence("NVSYICSR") pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header + input_3zdi).splitlines(), source_info=None) model = mmtbx.model.manager(pdb_in) model.set_sequences([sequence_3zdi]) cif_block = model._sequence_validation.sequence_as_cif_block() assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == \ ';NVS(PTR)ICSR\n;' assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == \ ';' + sequence_3zdi.sequence + '\n;' assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']), list(range(1, 9))) assert list(cif_block['_entity_poly_seq.mon_id']) == [ 'ASN', 'VAL', 'SER', 'PTR', 'ILE', 'CYS', 'SER', 'ARG' ] # input_4gln = """\ ATOM 2 C CA . DTH A 1 1 ? -2.916 5.861 2.629 1.00 16.39 ? ? ? ? ? ? 1 DTH D CA 1 ATOM 9 C CA . DTY A 1 2 ? 0.533 4.844 3.866 1.00 10.74 ? ? ? ? ? ? 2 DTY D CA 1 ATOM 21 C CA . DLY A 1 3 ? 3.161 3.111 1.736 1.00 8.24 ? ? ? ? ? ? 3 DLY D CA 1 ATOM 30 C CA . DLE A 1 4 ? 6.958 3.293 1.625 1.00 7.95 ? ? ? ? ? ? 4 DLE D CA 1 ATOM 38 C CA . DIL A 1 5 ? 9.053 0.443 0.257 1.00 8.44 ? ? ? ? ? ? 5 DIL D CA 1 ATOM 46 C CA . DLE A 1 6 ? 12.622 1.402 -0.674 1.00 8.62 ? ? ? ? ? ? 6 DLE D CA 1 ATOM 54 C CA A DSG A 1 7 ? 14.930 -1.609 -0.756 0.60 11.27 ? ? ? ? ? ? 7 DSG D CA 1 ATOM 55 C CA B DSG A 1 7 ? 14.934 -1.617 -0.732 0.40 11.77 ? ? ? ? ? ? 7 DSG D CA 1 ATOM 67 C CA . GLY A 1 8 ? 18.113 -0.249 -2.284 1.00 13.02 ? ? ? ? ? ? 8 GLY D CA 1 ATOM 71 C CA . DLY A 1 9 ? 21.326 -1.954 -3.288 1.00 17.83 ? ? ? ? ? ? 9 DLY D CA 1 ATOM 80 C CA . DTH A 1 10 ? 20.765 -0.934 -6.926 1.00 16.38 ? ? ? ? ? ? 10 DTH D CA 1 # ATOM 472 C CA . GLU B 2 6 ? 15.798 -6.874 23.843 1.00 31.74 ? ? ? ? ? ? 6 GLU E CA 1 ATOM 477 C CA . VAL B 2 7 ? 16.644 -3.926 21.599 1.00 15.99 ? ? ? ? ? ? 7 VAL E CA 1 ATOM 484 C CA . VAL B 2 8 ? 13.767 -1.465 21.234 1.00 10.37 ? ? ? ? ? ? 8 VAL E CA 1 ATOM 491 C CA . LYS B 2 9 ? 12.953 -1.088 17.521 1.00 8.44 ? ? ? ? ? ? 9 LYS E CA 1 # HETATM 2537 O O . 
HOH E 3 . ? 8.196 -3.708 8.277 1.00 15.02 ? ? ? ? ? ? 101 HOH D O 1 HETATM 2538 O O . HOH E 3 . ? 4.901 -4.298 5.515 1.00 13.08 ? ? ? ? ? ? 102 HOH D O 1 HETATM 2663 O O . HOH F 3 . ? 10.535 -2.721 20.049 1.00 15.44 ? ? ? ? ? ? 201 HOH E O 1 HETATM 2664 O O . HOH F 3 . ? 0.790 8.695 30.909 1.00 17.06 ? ? ? ? ? ? 202 HOH E O 1 HETATM 2795 O O . HOH G 3 . ? 11.265 2.914 43.878 1.00 13.92 ? ? ? ? ? ? 201 HOH F O 1 HETATM 2796 O O . HOH G 3 . ? 11.197 11.667 36.108 1.00 17.00 ? ? ? ? ? ? 202 HOH F O 1 """ sequence_4gln = [ iotbx.bioinformatics.sequence("TYKLILNGKT"), iotbx.bioinformatics.sequence("GQNHHEVVK") ] pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header + input_4gln).splitlines(), source_info=None) model = mmtbx.model.manager(pdb_in) model.set_sequences(sequence_4gln) cif_block = model._sequence_validation.sequence_as_cif_block() assert list(cif_block['_entity.id']) == ['1', '2'] assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']), list(range(1, 11)) + list(range(1, 10))) assert list(cif_block['_entity_poly_seq.mon_id']) == [ 'DTH', 'DTY', 'DLY', 'DLE', 'DIL', 'DLE', 'DSG', 'GLY', 'DLY', 'DTH', 'GLY', 'GLN', 'ASN', 'HIS', 'HIS', 'GLU', 'VAL', 'VAL', 'LYS' ] assert list(cif_block['_entity_poly.pdbx_seq_one_letter_code']) == [ ';(DTH)(DTY)(DLY)(DLE)(DIL)(DLE)(DSG)G(DLY)(DTH)\n;', ';' + sequence_4gln[1].sequence + '\n;' ] assert list(cif_block['_entity_poly.pdbx_seq_one_letter_code_can']) == [ ';' + sequence_4gln[0].sequence + '\n;', ';' + sequence_4gln[1].sequence + '\n;' ] # input_1ezu = """\ ATOM 3971 C CA . VAL D 2 16 ? 24.971 -4.493 -3.652 1.00 33.12 ? ? ? ? ? ? 731 VAL D CA 1 ATOM 3978 C CA . SER D 2 17 ? 27.194 -3.056 -0.946 1.00 35.47 ? ? ? ? ? ? 732 SER D CA 1 ATOM 3984 C CA . LEU D 2 18 ? 26.541 0.123 0.961 1.00 45.29 ? ? ? ? ? ? 733 LEU D CA 1 ATOM 3992 C CA . ASN D 2 19 ? 29.777 2.032 1.598 1.00 53.09 ? ? ? ? ? ? 734 ASN D CA 1 ATOM 4000 C CA . SER D 2 20 ? 30.737 4.963 3.775 1.00 61.92 ? ? ? ? ? ? 737 SER D CA 1 ATOM 4006 C CA . 
GLY D 2 21 ? 34.478 4.622 4.207 1.00 62.21 ? ? ? ? ? ? 738 GLY D CA 1 ATOM 4010 C CA . TYR D 2 22 ? 33.903 0.885 4.483 1.00 54.81 ? ? ? ? ? ? 739 TYR D CA 1 """ sequence_1ezu = iotbx.bioinformatics.sequence('VSLNSGY') pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header + input_1ezu).splitlines(), source_info=None) model = mmtbx.model.manager(pdb_in) model.set_sequences([sequence_1ezu]) cif_block = model._sequence_validation.sequence_as_cif_block() assert list(cif_block['_entity_poly_seq.mon_id']) == [ 'VAL', 'SER', 'LEU', 'ASN', 'SER', 'GLY', 'TYR' ] assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == \ ';' + sequence_1ezu.sequence + '\n;' assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == \ ';' + sequence_1ezu.sequence + '\n;' input_2hok = """\ ATOM 301 P P . C A 1 15 ? 15.802 44.045 80.094 1.00 59.36 ? ? ? ? ? ? 23 C A P 1 ATOM 321 P P . C A 1 16 ? 12.286 47.301 82.617 1.00 68.27 ? ? ? ? ? ? 24 C A P 1 ATOM 341 P P . U A 1 17 ? 6.815 51.648 82.739 1.00 78.03 ? ? ? ? ? ? 25 U A P 1 ATOM 361 P P . G A 1 21 ? 7.042 52.289 91.645 1.00 96.25 ? ? ? ? ? ? 29 G A P 1 ATOM 384 P P . C A 1 22 ? 7.024 46.751 90.841 1.00 84.69 ? ? ? ? ? ? 30 C A P 1 ATOM 404 P P . G A 1 23 ? 7.477 40.933 88.377 1.00 81.65 ? ? ? ? ? ? 
31 G A P 1 """ sequence_2hok = iotbx.bioinformatics.sequence("CCUUCUGCG") pdb_in = iotbx.pdb.input(lines=(pdb_atom_site_loop_header + input_2hok).splitlines(), source_info=None) model = mmtbx.model.manager(pdb_in) model.set_sequences([sequence_2hok]) cif_block = model._sequence_validation.sequence_as_cif_block() assert list(cif_block['_entity_poly_seq.mon_id']) == [ 'C', 'C', 'U', 'U', 'C', 'U', 'G', 'C', 'G' ] assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == \ ';' + sequence_2hok.sequence + '\n;' assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == \ ';' + sequence_2hok.sequence + '\n;' # input_3tpy = """\ ATOM 2 CA GLN A 24 2.586 40.220 34.036 1.00 41.54 C ATOM 8 CA LYS A 25 1.265 43.698 34.904 1.00 25.47 C ATOM 17 CA GLN A 26 3.834 45.984 36.538 1.00 22.91 C ATOM 26 CA PRO A 27 2.835 48.614 39.135 1.00 19.20 C ATOM 33 CA ILE A 28 3.972 52.206 39.293 1.00 18.70 C ATOM 41 CA SER A 29 6.403 51.332 42.097 1.00 22.63 C TER HETATM 852 MG MG A 999 -12.415 61.451 32.421 0.70 28.10 MG HETATM 853 C TRS A 153 -0.078 70.151 24.773 0.33 24.86 C HETATM 877 PA BDUP A 777 -9.339 60.563 31.137 0.70 19.64 P HETATM 881 PB BDUP A 777 -11.768 59.969 29.491 0.70 27.76 P HETATM 885 PG BDUP A 777 -13.098 58.529 31.620 0.70 33.91 P HETATM 905 P AUMP A 154 -9.010 60.358 31.334 0.30 11.42 P HETATM 909 O HOH A 155 -0.197 60.723 27.343 1.00 17.17 O HETATM 910 O HOH A 156 -10.293 62.567 35.648 1.00 19.43 O """ sequence_3tpy = iotbx.bioinformatics.sequence("QKQPIS") pdb_in = iotbx.pdb.input(lines=(input_3tpy).splitlines(), source_info=None) model = mmtbx.model.manager(pdb_in) model.set_sequences([sequence_3tpy]) cif_block = model.get_hierarchy().as_cif_block() assert list(cif_block["_atom_site.label_seq_id"]) == [ '1', '2', '3', '4', '5', '6', '.', '.', '.', '.', '.', '.', '.', '.' 
] # input_3tgr = """\ ATOM 2449 CA GLY A 459 -17.536 10.137 41.979 1.00181.52 C ATOM 2453 CA GLN A 460 -15.862 12.780 44.128 1.00192.51 C ATOM 2462 CA ASN A 463 -19.198 8.054 50.455 1.00180.96 C ATOM 2470 CA ASP A 464 -19.235 4.661 52.197 1.00143.07 C ATOM 2478 CA THR A 465 -20.893 2.988 49.198 1.00 91.96 C """ sequence_3tgr = iotbx.bioinformatics.sequence("DGGQSNETNDTET") pdb_in = iotbx.pdb.input(lines=(input_3tgr).splitlines(), source_info=None) model = mmtbx.model.manager(pdb_in) model.set_sequences([sequence_3tgr]) cif_block = model._sequence_validation.sequence_as_cif_block() assert cif_block["_entity_poly.pdbx_seq_one_letter_code"][0] == \ ';DGGQSNETNDNET\n;' input_2im9 = """\ ATOM 2423 CA PRO A 345 2.114 16.158 0.161 1.00 29.14 C ATOM 2430 CA VAL A 346 -1.223 17.837 0.938 1.00 31.05 C ATOM 2437 CA CYS A 349 -4.081 15.852 7.014 0.50 28.57 C ATOM 2443 CA GLN A 350 -6.176 14.041 9.639 0.50 30.62 C ATOM 2452 CA LEU A 351 -6.631 10.729 7.797 0.50 31.53 C ATOM 2460 CA PHE A 352 -5.220 9.172 4.620 0.50 31.95 C """ sequence_2im9 = iotbx.bioinformatics.sequence( "SSPTIKGINIQVVLPEKPVSNGCQLFDIR") pdb_in = iotbx.pdb.input(lines=(input_2im9).splitlines(), source_info=None) model = mmtbx.model.manager(pdb_in) model.set_sequences([sequence_2im9]) cif_block = model._sequence_validation.sequence_as_cif_block() assert list(cif_block["_entity_poly_seq.mon_id"]) == [ 'SER', 'SER', 'PRO', 'THR', 'ILE', 'LYS', 'GLY', 'ILE', 'ASN', 'ILE', 'GLN', 'VAL', 'VAL', 'LEU', 'PRO', 'GLU', 'LYS', 'PRO', 'VAL', 'SER', 'ASN', 'GLY', 'CYS', 'CYS', 'GLN', 'LEU', 'ASP', 'ILE', 'ARG' ]
def sequence_as_cif_block(self, custom_residues=None):
  """
  Export sequence information as mmCIF block
  Version 5.0 of mmCIF/PDBx dictionary

  Chains whose aligned sequence (chain.alignment.b) is identical share one
  entity; the first such chain supplies the residue-level data and later
  ones only contribute their chain id to pdbx_strand_id.

  Parameters
  ----------
  custom_residues: list of str
    List of custom 3-letter residues to keep in pdbx_one_letter_sequence
    The 3-letter residue must exist in the model. If None, the value
    from self.custom_residues is used (an empty list if that is None too).

  Returns
  -------
  cif_block: iotbx.cif.model.block
    Block holding the entity, entity_poly, entity_poly_seq, struct_ref and
    struct_ref_seq loops.
  """

  if custom_residues is None:
    custom_residues = self.custom_residues
  if custom_residues is None:
    # keep the membership tests below valid when nothing was configured
    custom_residues = []

  dna = {'DA', 'DT', 'DC', 'DG', 'DI'}
  rna = {'A', 'U', 'C', 'G'}
  rna_to_dna = {'A': 'DA', 'U': 'DT', 'T': 'DT', 'C': 'DC', 'G': 'DG',
                'I': 'DI'}
  # split the modified nucleotide names by the standard residue they map to
  modified_dna = set()
  modified_rna = set()
  for key in modified_rna_dna_names.lookup.keys():
    value = modified_rna_dna_names.lookup[key]
    if value in dna:
      modified_dna.add(key)
    elif value in rna:
      modified_rna.add(key)

  # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity.html
  entity_loop = iotbx.cif.model.loop(header=(
    '_entity.id',
    '_entity.pdbx_description'
  ))

  # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity_poly.html
  entity_poly_loop = iotbx.cif.model.loop(header=(
    '_entity_poly.entity_id',
    '_entity_poly.nstd_linkage',
    '_entity_poly.nstd_monomer',
    '_entity_poly.pdbx_seq_one_letter_code',
    '_entity_poly.pdbx_seq_one_letter_code_can',
    '_entity_poly.pdbx_strand_id',
    '_entity_poly.pdbx_target_identifier',
    '_entity_poly.type',
  ))

  # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity_poly_seq.html
  entity_poly_seq_loop = iotbx.cif.model.loop(header=(
    '_entity_poly_seq.entity_id',
    '_entity_poly_seq.num',
    '_entity_poly_seq.mon_id',
    '_entity_poly_seq.hetero',
  ))

  # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref.html
  struct_ref_loop = iotbx.cif.model.loop(header=(
    '_struct_ref.id',
    '_struct_ref.db_code',
    '_struct_ref.db_name',
    '_struct_ref.entity_id',
    '_struct_ref.pdbx_align_begin',
    '_struct_ref.pdbx_db_accession',
    '_struct_ref.pdbx_db_isoform',
    '_struct_ref.pdbx_seq_one_letter_code',
  ))

  # http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/struct_ref_seq.html
  struct_ref_seq_loop = iotbx.cif.model.loop(header=(
    '_struct_ref_seq.align_id',
    '_struct_ref_seq.db_align_beg',
    '_struct_ref_seq.db_align_end',
    '_struct_ref_seq.pdbx_PDB_id_code',
    '_struct_ref_seq.pdbx_auth_seq_align_beg',
    '_struct_ref_seq.pdbx_auth_seq_align_end',
    '_struct_ref_seq.pdbx_db_accession',
    '_struct_ref_seq.pdbx_db_align_beg_ins_code',
    '_struct_ref_seq.pdbx_db_align_end_ins_code',
    '_struct_ref_seq.pdbx_seq_align_beg_ins_code',
    '_struct_ref_seq.pdbx_seq_align_end_ins_code',
    '_struct_ref_seq.pdbx_strand_id',
    '_struct_ref_seq.ref_id',
    '_struct_ref_seq.seq_align_beg',
    '_struct_ref_seq.seq_align_end',
  ))

  # entity_poly
  sequence_to_entity_id = dict()
  nstd_linkage = dict()
  nstd_monomer = dict()
  seq_one_letter_code = dict()
  seq_one_letter_code_can = dict()
  strand_id = dict()
  target_identifier = dict()
  sequence_type = dict()
  # entity_poly_seq
  num = dict()
  mon_id = dict()
  hetero = dict()
  # struct_ref (work in progress)
  chain_id = dict()
  db_code = '?'
  db_name = '?'
  align_begin = '?'
  db_accession = '?'
  db_isoform = '?'
  # struct_ref_seq (work in progress)
  db_align_beg = '?'
  db_align_end = '?'
  PDB_id_code = '?'
  align_beg_ins_code = '?'
  align_end_ins_code = '?'

  for i_chain, chain in enumerate(self.chains):
    seq_can = chain.alignment.b

    # subsequent matches just add strand_id
    if seq_can in sequence_to_entity_id:
      strand_id[sequence_to_entity_id[seq_can]].append(chain.chain_id)
      continue

    # New entity.  The id is derived from the number of entities seen so
    # far, not from a running variable that the branch above could reset:
    # otherwise chains X, Y, X, Z would give Z the id already used by Y.
    entity_id = len(sequence_to_entity_id) + 1
    sequence_to_entity_id[seq_can] = entity_id

    # entity_poly items
    nstd_linkage[entity_id] = 'no'   # work in progress
    nstd_monomer[entity_id] = 'no'
    seq_one_letter_code[entity_id] = list()
    sequence_type[entity_id] = '?'   # work in progress
    has_protein = False
    has_rna = False
    has_dna = False
    is_d = False

    # chain.alignment.a is the model
    # chain.alignment.b is the sequence
    for i_a, i_b in zip(chain.alignment.i_seqs_a, chain.alignment.i_seqs_b):
      # sequence does not have residue in model
      if i_b is None:
        continue
      # model does not have residue in sequence
      if i_a is None or chain.resnames[i_a] is None:
        letter = seq_can[i_b]
      else:
        resname = chain.resnames[i_a].strip()
        # check for modified residues
        if (resname in modified_aa_names.lookup or
            resname in modified_rna_dna_names.lookup or
            resname in custom_residues):
          letter = '({resname})'.format(resname=resname)
          nstd_monomer[entity_id] = 'yes'
        elif resname in three_letter_l_given_three_letter_d:
          letter = '({resname})'.format(resname=resname)
          nstd_monomer[entity_id] = 'yes'
        # check for nucleic acid
        elif resname in dna:
          letter = '({resname})'.format(resname=resname)
        elif resname in rna:
          letter = resname
        # regular protein
        else:
          letter = one_letter_given_three_letter.get(resname)
          if letter is None:
            letter = 'X'

        # check for protein
        if (resname in one_letter_given_three_letter or
            resname in modified_aa_names.lookup):
          has_protein = True
        # check for DNA
        # hybrid protein/DNA/RNA chains are not allowed
        if resname in dna or resname in modified_dna:
          has_dna = True
          has_protein = False
        # check for RNA
        # does not handle hybrid DNA/RNA chains
        if resname in rna or resname in modified_rna:
          has_rna = True
          has_dna = False
          has_protein = False
        # check chirality
        # hybrid D/L handed chains are not allowed
        if resname in three_letter_l_given_three_letter_d:
          is_d = True

      # pdbx_seq_one_letter_code
      seq_one_letter_code[entity_id].append(letter)

    # pdbx_seq_one_letter_code_can
    seq_one_letter_code_can[entity_id] = seq_can.replace('-', '')

    # strand_id
    strand_id[entity_id] = [chain.chain_id]

    # target_identifier (work in progress)
    target_identifier[entity_id] = '?'

    # type
    # polypeptide(L)
    # polypeptide(D)
    # polydeoxyribonucleotide,
    # polyribonucleotide
    # missing
    # cyclic-psuedo-peptide
    # other
    # peptide nucleic acid
    # polydeoxyribonucleotide/polyribonucleotide
    # polysaccharide(D)
    # polysaccahride(L)
    # Start from the '?' placeholder so a chain that is neither protein,
    # DNA nor RNA does not leave `choice` unbound (or stale from the
    # previous chain).
    choice = sequence_type[entity_id]
    if has_protein:
      choice = 'polypeptide'
      if is_d:
        choice += '(D)'
      else:
        choice += '(L)'
    if has_dna:
      choice = 'polydeoxyribonucleotide'
    if has_rna:
      choice = 'polyribonucleotide'
    sequence_type[entity_id] = choice

    # entity_poly_seq items
    mon_id[entity_id] = list()
    num[entity_id] = list()
    hetero[entity_id] = list()

    # struct_ref items
    chain_id[entity_id] = i_chain + 1

    for i_a, i_b in zip(chain.alignment.i_seqs_a, chain.alignment.i_seqs_b):
      # sequence does not have residue in model
      if i_b is None:
        continue
      seq_resname = None
      if has_protein:
        seq_resname = three_letter_given_one_letter.get(seq_can[i_b])
      if has_dna:
        seq_resname = rna_to_dna.get(seq_can[i_b])
      if has_rna:
        seq_resname = seq_can[i_b]
      if seq_resname is None:
        seq_resname = 'UNK'
      # model does not have residue in sequence
      if i_a is None or chain.resnames[i_a] is None:
        resname = seq_resname
      else:
        resname = chain.resnames[i_a]
      mon_id[entity_id].append(resname.strip())
      # residues are numbered consecutively starting from 1
      num[entity_id].append(len(num[entity_id]) + 1)
      hetero[entity_id].append('no')

  # build loops
  align_id = 1
  for entity_id in sorted(sequence_to_entity_id.values()):
    # construct entity_poly loop
    # join the list of chain ids as a whole so that a multi-character
    # chain id is never split into its characters
    chains = ','.join(strand_id[entity_id])

    entity_poly_loop.add_row((
      entity_id,
      nstd_linkage[entity_id],
      nstd_monomer[entity_id],
      ';' + ''.join(seq_one_letter_code[entity_id]) + '\n;',
      ';' + seq_one_letter_code_can[entity_id] + '\n;',
      chains,
      target_identifier[entity_id],
      sequence_type[entity_id]
    ))

    # construct entity loop
    entity_loop.add_row((
      entity_id,
      'Chains: ' + chains
    ))

    # construct entity_poly_seq loop
    for i in range(len(mon_id[entity_id])):
      entity_poly_seq_loop.add_row((
        entity_id,
        num[entity_id][i],
        mon_id[entity_id][i],
        hetero[entity_id][i]
      ))

    # construct struct_ref loop
    struct_ref_loop.add_row((
      chain_id[entity_id],
      db_code,
      db_name,
      entity_id,
      align_begin,
      db_accession,
      db_isoform,
      ';' + seq_one_letter_code_can[entity_id] + '\n;'
    ))

    # construct struct_ref_seq loop
    # NOTE(review): the alignment end is len - 1 while the begin is '1';
    # this looks like an off-by-one for a 1-based range - confirm before
    # relying on these (work in progress) values.
    for chain in strand_id[entity_id]:
      struct_ref_seq_loop.add_row((
        align_id,
        db_align_beg,
        db_align_end,
        PDB_id_code,
        '1',
        len(seq_one_letter_code_can[entity_id]) - 1,
        db_accession,
        align_beg_ins_code,
        align_end_ins_code,
        align_beg_ins_code,
        align_end_ins_code,
        chain,
        chain_id[entity_id],
        '1',
        len(seq_one_letter_code_can[entity_id]) - 1
      ))
      # align_id is the key of the category: one distinct value per row
      align_id += 1

  # construct block
  cif_block = iotbx.cif.model.block()
  cif_block.add_loop(entity_loop)
  cif_block.add_loop(entity_poly_loop)
  cif_block.add_loop(entity_poly_seq_loop)
  cif_block.add_loop(struct_ref_loop)
  cif_block.add_loop(struct_ref_seq_loop)

  return cif_block
def exercise_pdb_hierarchy_sequence_as_cif_block():
  """
  Exercise pdb_hierarchy.as_cif_block_with_sequence() on small fixtures.

  Each section builds a hierarchy from an inline mmCIF or PDB fragment
  (C-alpha / phosphorus traces only), pairs it with one or more sequences and
  checks the entity, entity_poly, entity_poly_seq and atom_site items of the
  resulting CIF block.  Covered cases: duplicated chains, a modified amino
  acid, D-amino acids with waters, numbering gaps, RNA, hetero groups, and
  sequences that extend beyond the residues present in the model.

  Raises AssertionError on the first failed check.
  """
  pdb_atom_site_loop_header = """\
data_mmcif
loop_
_atom_site.group_PDB
_atom_site.id
_atom_site.type_symbol
_atom_site.label_atom_id
_atom_site.label_alt_id
_atom_site.label_comp_id
_atom_site.label_asym_id
_atom_site.label_entity_id
_atom_site.label_seq_id
_atom_site.pdbx_PDB_ins_code
_atom_site.Cartn_x
_atom_site.Cartn_y
_atom_site.Cartn_z
_atom_site.occupancy
_atom_site.B_iso_or_equiv
_atom_site.Cartn_x_esd
_atom_site.Cartn_y_esd
_atom_site.Cartn_z_esd
_atom_site.occupancy_esd
_atom_site.B_iso_or_equiv_esd
_atom_site.pdbx_formal_charge
_atom_site.auth_seq_id
_atom_site.auth_comp_id
_atom_site.auth_asym_id
_atom_site.auth_atom_id
_atom_site.pdbx_PDB_model_num
"""

  # simple example with multiple copies of chain
  input_4ehz = """\
ATOM 2 C CA . GLU A 1 6 ? -35.647 65.380 -11.775 1.00 65.78 ? ? ? ? ? ? 858 GLU A CA 1
ATOM 11 C CA . LYS A 1 7 ? -34.996 68.963 -10.712 1.00 89.52 ? ? ? ? ? ? 859 LYS A CA 1
ATOM 20 C CA . LYS A 1 8 ? -31.415 68.325 -9.529 1.00 98.54 ? ? ? ? ? ? 860 LYS A CA 1
ATOM 29 C CA . PRO A 1 9 ? -29.858 70.569 -6.813 1.00 103.45 ? ? ? ? ? ? 861 PRO A CA 1
ATOM 36 C CA . ALA A 1 10 ? -26.545 72.463 -7.079 1.00 98.87 ? ? ? ? ? ? 862 ALA A CA 1
ATOM 41 C CA . THR A 1 11 ? -23.410 70.412 -7.767 1.00 90.75 ? ? ? ? ? ? 863 THR A CA 1
ATOM 48 C CA . GLU A 1 12 ? -21.306 71.534 -4.804 1.00 75.15 ? ? ? ? ? ? 864 GLU A CA 1
ATOM 57 C CA . VAL A 1 13 ? -17.543 70.954 -4.809 1.00 49.52 ? ? ? ? ? ? 865 VAL A CA 1
ATOM 64 C CA . ASP A 1 14 ? -16.048 68.671 -2.185 1.00 26.98 ? ? ? ? ? ? 866 ASP A CA 1
ATOM 72 C CA . PRO A 1 15 ? -12.276 69.450 -2.061 1.00 27.34 ? ? ? ? ? ? 867 PRO A CA 1
ATOM 79 C CA . THR A 1 16 ? -11.669 65.942 -0.699 1.00 23.73 ? ? ? ? ? ? 868 THR A CA 1
ATOM 86 C CA . HIS A 1 17 ? -13.266 64.157 -3.671 1.00 23.80 ? ? ? ? ? ? 869 HIS A CA 1
ATOM 96 C CA . PHE A 1 18 ? -10.664 63.252 -6.277 1.00 14.88 ? ? ? ? ? ? 870 PHE A CA 1
ATOM 107 C CA . GLU A 1 19 ? -12.022 62.182 -9.666 1.00 23.47 ? ? ? ? ? ? 871 GLU A CA 1
ATOM 116 C CA . LYS A 1 20 ? -10.351 59.111 -11.117 1.00 17.57 ? ? ? ? ? ? 872 LYS A CA 1
ATOM 125 C CA . ARG A 1 21 ? -10.204 60.546 -14.661 1.00 19.09 ? ? ? ? ? ? 873 ARG A CA 1
ATOM 136 C CA . PHE A 1 22 ? -7.912 63.384 -13.545 1.00 22.03 ? ? ? ? ? ? 874 PHE A CA 1
ATOM 147 C CA . LEU A 1 23 ? -5.613 61.332 -11.271 1.00 18.20 ? ? ? ? ? ? 875 LEU A CA 1
ATOM 155 C CA . LYS A 1 24 ? -2.583 60.745 -13.513 1.00 26.05 ? ? ? ? ? ? 876 LYS A CA 1
ATOM 2365 C CA . VAL B 1 13 ? 38.084 -8.470 -5.157 1.00 57.98 ? ? ? ? ? ? 865 VAL B CA 1
ATOM 2372 C CA . ASP B 1 14 ? 36.468 -6.229 -2.536 1.00 51.96 ? ? ? ? ? ? 866 ASP B CA 1
ATOM 2380 C CA . PRO B 1 15 ? 32.749 -7.130 -2.340 1.00 48.96 ? ? ? ? ? ? 867 PRO B CA 1
ATOM 2387 C CA . THR B 1 16 ? 31.935 -3.705 -0.847 1.00 26.72 ? ? ? ? ? ? 868 THR B CA 1
ATOM 2394 C CA . HIS B 1 17 ? 33.519 -1.814 -3.754 1.00 33.15 ? ? ? ? ? ? 869 HIS B CA 1
ATOM 2404 C CA . PHE B 1 18 ? 31.094 -0.811 -6.488 1.00 26.55 ? ? ? ? ? ? 870 PHE B CA 1
ATOM 2415 C CA . GLU B 1 19 ? 32.359 0.467 -9.861 1.00 38.45 ? ? ? ? ? ? 871 GLU B CA 1
ATOM 2424 C CA . LYS B 1 20 ? 30.409 3.510 -11.036 1.00 33.69 ? ? ? ? ? ? 872 LYS B CA 1
ATOM 2433 C CA . ARG B 1 21 ? 30.400 2.430 -14.663 1.00 36.58 ? ? ? ? ? ? 873 ARG B CA 1
ATOM 2444 C CA . PHE B 1 22 ? 28.294 -0.647 -13.791 1.00 38.39 ? ? ? ? ? ? 874 PHE B CA 1
ATOM 2455 C CA . LEU B 1 23 ? 25.763 1.275 -11.703 1.00 32.87 ? ? ? ? ? ? 875 LEU B CA 1
ATOM 2463 C CA . LYS B 1 24 ? 22.588 1.723 -13.713 1.00 30.22 ? ? ? ? ? ? 876 LYS B CA 1
"""
  import iotbx.bioinformatics
  from iotbx.pdb.amino_acid_codes import three_letter_given_one_letter
  from cctbx.array_family import flex

  sequence_4ehz = iotbx.bioinformatics.sequence("GDIVSEKKPATEVDPTHFEKRFLK")#RIRDLGEGHF"
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_4ehz).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_4ehz], crystal_symmetry=pdb_in.crystal_symmetry())
  assert cif_block['_entity.id'][0] == '1'
  assert cif_block['_entity.type'][0] == 'polymer'
  assert cif_block['_entity.pdbx_number_of_molecules'][0] == '2'
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == sequence_4ehz.sequence
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == sequence_4ehz.sequence
  assert cif_block['_entity_poly.pdbx_strand_id'] == 'A,B'
  # list(range(...)) is a real list on both Python 2 and Python 3
  assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']),
    list(range(1, 25)))
  assert cif_block['_entity_poly_seq.entity_id'].all_eq('1')
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    three_letter_given_one_letter.get(i) for i in sequence_4ehz.sequence]
  #
  # example with modified amino acid - PTR
  input_3zdi = """\
ATOM 1422 C CA . ASN A 1 179 ? -11.025 -26.833 -3.747 1.00 86.68 ? ? ? ? ? ? 213 ASN A CA 1
ATOM 1430 C CA . VAL A 1 180 ? -7.831 -26.493 -1.696 1.00 82.40 ? ? ? ? ? ? 214 VAL A CA 1
ATOM 1437 C CA . SER A 1 181 ? -8.142 -28.602 1.444 1.00 89.69 ? ? ? ? ? ? 215 SER A CA 1
ATOM 1443 C CA . PTR A 1 182 ? -5.406 -26.622 3.177 1.00 88.05 ? ? ? ? ? ? 216 PTR A CA 1
ATOM 1459 C CA . ILE A 1 183 ? -7.514 -23.621 4.117 1.00 83.90 ? ? ? ? ? ? 217 ILE A CA 1
ATOM 1467 C CA . CYS A 1 184 ? -8.907 -21.533 7.009 1.00 86.39 ? ? ? ? ? ? 218 CYS A CA 1
ATOM 1473 C CA . SER A 1 185 ? -6.795 -21.356 10.148 1.00 91.03 ? ? ? ? ? ? 219 SER A CA 1
"""
  sequence_3zdi = iotbx.bioinformatics.sequence("NVSYICSR")
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_3zdi).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_3zdi], crystal_symmetry=pdb_in.crystal_symmetry())
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == 'NVS(PTR)ICSR'
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == sequence_3zdi.sequence
  assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']),
    list(range(1, 9)))
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    'ASN', 'VAL', 'SER', 'PTR', 'ILE', 'CYS', 'SER', 'ARG']
  #
  input_4gln = """\
ATOM 2 C CA . DTH A 1 1 ? -2.916 5.861 2.629 1.00 16.39 ? ? ? ? ? ? 1 DTH D CA 1
ATOM 9 C CA . DTY A 1 2 ? 0.533 4.844 3.866 1.00 10.74 ? ? ? ? ? ? 2 DTY D CA 1
ATOM 21 C CA . DLY A 1 3 ? 3.161 3.111 1.736 1.00 8.24 ? ? ? ? ? ? 3 DLY D CA 1
ATOM 30 C CA . DLE A 1 4 ? 6.958 3.293 1.625 1.00 7.95 ? ? ? ? ? ? 4 DLE D CA 1
ATOM 38 C CA . DIL A 1 5 ? 9.053 0.443 0.257 1.00 8.44 ? ? ? ? ? ? 5 DIL D CA 1
ATOM 46 C CA . DLE A 1 6 ? 12.622 1.402 -0.674 1.00 8.62 ? ? ? ? ? ? 6 DLE D CA 1
ATOM 54 C CA A DSG A 1 7 ? 14.930 -1.609 -0.756 0.60 11.27 ? ? ? ? ? ? 7 DSG D CA 1
ATOM 55 C CA B DSG A 1 7 ? 14.934 -1.617 -0.732 0.40 11.77 ? ? ? ? ? ? 7 DSG D CA 1
ATOM 67 C CA . GLY A 1 8 ? 18.113 -0.249 -2.284 1.00 13.02 ? ? ? ? ? ? 8 GLY D CA 1
ATOM 71 C CA . DLY A 1 9 ? 21.326 -1.954 -3.288 1.00 17.83 ? ? ? ? ? ? 9 DLY D CA 1
ATOM 80 C CA . DTH A 1 10 ? 20.765 -0.934 -6.926 1.00 16.38 ? ? ? ? ? ? 10 DTH D CA 1
#
ATOM 472 C CA . GLU B 2 6 ? 15.798 -6.874 23.843 1.00 31.74 ? ? ? ? ? ? 6 GLU E CA 1
ATOM 477 C CA . VAL B 2 7 ? 16.644 -3.926 21.599 1.00 15.99 ? ? ? ? ? ? 7 VAL E CA 1
ATOM 484 C CA . VAL B 2 8 ? 13.767 -1.465 21.234 1.00 10.37 ? ? ? ? ? ? 8 VAL E CA 1
ATOM 491 C CA . LYS B 2 9 ? 12.953 -1.088 17.521 1.00 8.44 ? ? ? ? ? ? 9 LYS E CA 1
#
HETATM 2537 O O . HOH E 3 . ? 8.196 -3.708 8.277 1.00 15.02 ? ? ? ? ? ? 101 HOH D O 1
HETATM 2538 O O . HOH E 3 . ? 4.901 -4.298 5.515 1.00 13.08 ? ? ? ? ? ? 102 HOH D O 1
HETATM 2663 O O . HOH F 3 . ? 10.535 -2.721 20.049 1.00 15.44 ? ? ? ? ? ? 201 HOH E O 1
HETATM 2664 O O . HOH F 3 . ? 0.790 8.695 30.909 1.00 17.06 ? ? ? ? ? ? 202 HOH E O 1
HETATM 2795 O O . HOH G 3 . ? 11.265 2.914 43.878 1.00 13.92 ? ? ? ? ? ? 201 HOH F O 1
HETATM 2796 O O . HOH G 3 . ? 11.197 11.667 36.108 1.00 17.00 ? ? ? ? ? ? 202 HOH F O 1
"""
  sequence_4gln = [iotbx.bioinformatics.sequence("TYKLILNGKT"),
                   iotbx.bioinformatics.sequence("GQNHHEVVK")]
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_4gln).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=sequence_4gln, crystal_symmetry=pdb_in.crystal_symmetry())
  assert list(cif_block['_entity.id']) == ['1', '2', '3']
  assert list(cif_block['_entity.type']) == ['polymer', 'polymer', 'water']
  # range objects cannot be concatenated with "+" on Python 3, so build the
  # expected numbering (1..10 followed by 1..9) from explicit lists.
  assert approx_equal(flex.int(cif_block['_entity_poly_seq.num']),
    list(range(1, 11)) + list(range(1, 10)))
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    'DTH', 'DTY', 'DLY', 'DLE', 'DIL', 'DLE', 'DSG', 'GLY', 'DLY', 'DTH',
    'GLY', 'GLN', 'ASN', 'HIS', 'HIS', 'GLU', 'VAL', 'VAL', 'LYS']
  assert list(cif_block['_entity_poly.pdbx_seq_one_letter_code']) == [
    '(DTH)(DTY)(DLY)(DLE)(DIL)(DLE)(DSG)G(DLY)(DTH)',
    sequence_4gln[1].sequence]
  assert list(cif_block['_entity_poly.pdbx_seq_one_letter_code_can']) == [
    sequence_4gln[0].sequence, sequence_4gln[1].sequence]
  assert approx_equal(flex.int(cif_block['_atom_site.label_entity_id']),
    [1]*11 + [2]*4 + [3]*6)
  assert list(cif_block['_atom_site.label_seq_id']) == [
    '1', '2', '3', '4', '5', '6', '7', '7', '8', '9', '10',
    '6', '7', '8', '9',
    '.', '.', '.', '.', '.', '.']
  #
  input_1ezu = """\
ATOM 3971 C CA . VAL D 2 16 ? 24.971 -4.493 -3.652 1.00 33.12 ? ? ? ? ? ? 731 VAL D CA 1
ATOM 3978 C CA . SER D 2 17 ? 27.194 -3.056 -0.946 1.00 35.47 ? ? ? ? ? ? 732 SER D CA 1
ATOM 3984 C CA . LEU D 2 18 ? 26.541 0.123 0.961 1.00 45.29 ? ? ? ? ? ? 733 LEU D CA 1
ATOM 3992 C CA . ASN D 2 19 ? 29.777 2.032 1.598 1.00 53.09 ? ? ? ? ? ? 734 ASN D CA 1
ATOM 4000 C CA . SER D 2 20 ? 30.737 4.963 3.775 1.00 61.92 ? ? ? ? ? ? 737 SER D CA 1
ATOM 4006 C CA . GLY D 2 21 ? 34.478 4.622 4.207 1.00 62.21 ? ? ? ? ? ? 738 GLY D CA 1
ATOM 4010 C CA . TYR D 2 22 ? 33.903 0.885 4.483 1.00 54.81 ? ? ? ? ? ? 739 TYR D CA 1
"""
  sequence_1ezu = iotbx.bioinformatics.sequence('VSLNSGY')
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_1ezu).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_1ezu])
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    'VAL', 'SER', 'LEU', 'ASN', 'SER', 'GLY', 'TYR']
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == sequence_1ezu.sequence
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == sequence_1ezu.sequence
  assert list(cif_block['_atom_site.auth_seq_id']) == [
    '731', '732', '733', '734', '737', '738', '739']
  assert list(cif_block['_atom_site.label_seq_id']) == [
    '1', '2', '3', '4', '5', '6', '7']

  input_2hok = """\
ATOM 301 P P . C A 1 15 ? 15.802 44.045 80.094 1.00 59.36 ? ? ? ? ? ? 23 C A P 1
ATOM 321 P P . C A 1 16 ? 12.286 47.301 82.617 1.00 68.27 ? ? ? ? ? ? 24 C A P 1
ATOM 341 P P . U A 1 17 ? 6.815 51.648 82.739 1.00 78.03 ? ? ? ? ? ? 25 U A P 1
ATOM 361 P P . G A 1 21 ? 7.042 52.289 91.645 1.00 96.25 ? ? ? ? ? ? 29 G A P 1
ATOM 384 P P . C A 1 22 ? 7.024 46.751 90.841 1.00 84.69 ? ? ? ? ? ? 30 C A P 1
ATOM 404 P P . G A 1 23 ? 7.477 40.933 88.377 1.00 81.65 ? ? ? ? ? ? 31 G A P 1
"""
  sequence_2hok = iotbx.bioinformatics.sequence("CCUUCUGCG")
  pdb_in = iotbx.pdb.input(
    lines=(pdb_atom_site_loop_header+input_2hok).splitlines(),
    source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_2hok])
  assert list(cif_block['_entity_poly_seq.mon_id']) == [
    'C', 'C', 'U', 'U', 'C', 'U', 'G', 'C', 'G']
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code'][0] == sequence_2hok.sequence
  assert cif_block['_entity_poly.pdbx_seq_one_letter_code_can'][0] == sequence_2hok.sequence
  assert list(cif_block['_atom_site.auth_seq_id']) == [
    '23', '24', '25', '29', '30', '31']
  assert list(cif_block['_atom_site.label_seq_id']) == [
    '1', '2', '3', '7', '8', '9']
  #
  input_3tpy = """\
ATOM      2  CA  GLN A  24       2.586  40.220  34.036  1.00 41.54           C
ATOM      8  CA  LYS A  25       1.265  43.698  34.904  1.00 25.47           C
ATOM     17  CA  GLN A  26       3.834  45.984  36.538  1.00 22.91           C
ATOM     26  CA  PRO A  27       2.835  48.614  39.135  1.00 19.20           C
ATOM     33  CA  ILE A  28       3.972  52.206  39.293  1.00 18.70           C
ATOM     41  CA  SER A  29       6.403  51.332  42.097  1.00 22.63           C
TER
HETATM  852 MG    MG A 999     -12.415  61.451  32.421  0.70 28.10          MG
HETATM  853  C   TRS A 153      -0.078  70.151  24.773  0.33 24.86           C
HETATM  877  PA BDUP A 777      -9.339  60.563  31.137  0.70 19.64           P
HETATM  881  PB BDUP A 777     -11.768  59.969  29.491  0.70 27.76           P
HETATM  885  PG BDUP A 777     -13.098  58.529  31.620  0.70 33.91           P
HETATM  905  P  AUMP A 154      -9.010  60.358  31.334  0.30 11.42           P
HETATM  909  O   HOH A 155      -0.197  60.723  27.343  1.00 17.17           O
HETATM  910  O   HOH A 156     -10.293  62.567  35.648  1.00 19.43           O
"""
  sequence_3tpy = iotbx.bioinformatics.sequence("QKQPIS")
  pdb_in = iotbx.pdb.input(lines=(input_3tpy).splitlines(), source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_3tpy])
  assert list(cif_block["_entity.type"]) == [
    'polymer', 'non-polymer', 'non-polymer', 'non-polymer', 'non-polymer',
    'water']
  assert list(cif_block["_atom_site.label_entity_id"]) == [
    '1', '1', '1', '1', '1', '1', '2', '3', '4', '4', '4', '5', '6', '6']
  assert list(cif_block["_atom_site.label_seq_id"]) == [
    '1', '2', '3', '4', '5', '6', '.', '.', '.', '.', '.', '.', '.', '.']
  #
  input_3tgr = """\
ATOM   2449  CA  GLY A 459     -17.536  10.137  41.979  1.00181.52           C
ATOM   2453  CA  GLN A 460     -15.862  12.780  44.128  1.00192.51           C
ATOM   2462  CA  ASN A 463     -19.198   8.054  50.455  1.00180.96           C
ATOM   2470  CA  ASP A 464     -19.235   4.661  52.197  1.00143.07           C
ATOM   2478  CA  THR A 465     -20.893   2.988  49.198  1.00 91.96           C
"""
  sequence_3tgr = iotbx.bioinformatics.sequence("DGGQSNETNDTET")
  pdb_in = iotbx.pdb.input(lines=(input_3tgr).splitlines(), source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_3tgr])
  assert list(cif_block["_entity_poly_seq.mon_id"]) == [
    'ASP', 'GLY', 'GLY', 'GLN', 'SER', 'ASN', 'GLU', 'THR', 'ASN', 'ASP',
    'THR', 'GLU', 'THR']
  assert list(cif_block["_atom_site.label_comp_id"]) == [
    'GLY', 'GLN', 'ASN', 'ASP', 'THR']
  assert list(cif_block["_atom_site.label_seq_id"]) == ['3', '4', '9', '10', '11']
  assert cif_block["_entity_poly.pdbx_seq_one_letter_code"][0] == 'DGGQSNETNDTET'

  input_2im9 = """\
ATOM   2423  CA  PRO A 345       2.114  16.158   0.161  1.00 29.14           C
ATOM   2430  CA  VAL A 346      -1.223  17.837   0.938  1.00 31.05           C
ATOM   2437  CA  CYS A 349      -4.081  15.852   7.014  0.50 28.57           C
ATOM   2443  CA  GLN A 350      -6.176  14.041   9.639  0.50 30.62           C
ATOM   2452  CA  LEU A 351      -6.631  10.729   7.797  0.50 31.53           C
ATOM   2460  CA  PHE A 352      -5.220   9.172   4.620  0.50 31.95           C
"""
  sequence_2im9 = iotbx.bioinformatics.sequence("SSPTIKGINIQVVLPEKPVSNGCQLFDIR")
  pdb_in = iotbx.pdb.input(lines=(input_2im9).splitlines(), source_info=None)
  pdb_hierarchy = pdb_in.construct_hierarchy()
  cif_block = pdb_hierarchy.as_cif_block_with_sequence(
    sequences=[sequence_2im9])
  assert list(cif_block["_entity_poly_seq.mon_id"]) == [
    'SER', 'SER', 'PRO', 'THR', 'ILE', 'LYS', 'GLY', 'ILE', 'ASN', 'ILE',
    'GLN', 'VAL', 'VAL', 'LEU', 'PRO', 'GLU', 'LYS', 'PRO', 'VAL', 'SER',
    'ASN', 'GLY', 'CYS', 'GLN', 'LEU', 'PHE', 'ASP', 'ILE', 'ARG']
  assert list(cif_block["_atom_site.label_seq_id"]) == [
    '18', '19', '23', '24', '25', '26']
def correct_sequence(pdb_hierarchy,
    sequences,
    truncate_to_cbeta=False,
    out=sys.stdout):
  """
  Modify the sequence for the pdb hierarchy to match that of the aligned
  sequence.  This will remove incompatible atoms; the sidechains will still
  need to be extended separately.  For proteins only - mismatches in nucleic
  acids will only result in a warning.

  :param pdb_hierarchy: iotbx.pdb.hierarchy.root object
  :param sequences: list of iotbx.bioinformatics.sequence objects
  :param truncate_to_cbeta: chop off entire sidechain to C-beta (default: leave
    common atoms in place)
  :param out: output filehandle (default = stdout)
  :returns: number of atom_group objects renamed
  """
  from mmtbx.monomer_library import idealized_aa
  import mmtbx.validation.sequence
  from iotbx.pdb.amino_acid_codes import three_letter_given_one_letter
  seq_validation = mmtbx.validation.sequence.validation(
    pdb_hierarchy=pdb_hierarchy,
    sequences=sequences,
    log=out)
  for chain_seq in seq_validation.chains:
    if (chain_seq.chain_type == mmtbx.validation.sequence.NUCLEIC_ACID):
      if (len(chain_seq.mismatch) > 0):
        # The format string has two placeholders: the mismatch count and the
        # chain ID.  Both must be supplied or the warning itself raises.
        print(
          " WARNING: will skip %d mismatches in nucleic acid chain '%s'" %
          (len(chain_seq.mismatch), chain_seq.chain_id), file=out)
  # Atom names allowed for each idealized residue; entries whose key contains
  # "_h" are skipped.
  res_dict = idealized_aa.residue_dict()
  expected_names = {}
  for resname, ideal_res in res_dict.items():
    if ("_h" in resname):
      continue
    expected_names[resname] = set([a.name for a in ideal_res.atoms()])
  n_changed = 0
  for chain in pdb_hierarchy.only_model().chains():
    if (not chain.is_protein()):
      continue
    for chain_seq in seq_validation.chains:
      if (chain.id != chain_seq.chain_id) or (len(chain_seq.mismatch) == 0):
        continue
      for residue_group in chain.residue_groups():
        resid = residue_group.resid()
        if (resid not in chain_seq.mismatch):
          continue
        idx = chain_seq.mismatch.index(resid)
        new_code = chain_seq.actual_code[idx]
        new_resname = three_letter_given_one_letter.get(new_code)
        if (new_resname is None):
          # no three-letter name for this code: leave the residue untouched
          continue
        if (truncate_to_cbeta):
          expected_atoms = expected_names["ala"]
        else:
          expected_atoms = expected_names[new_resname.lower()]
        # Remember the original name before any atom group is renamed, so the
        # summary line shows "old --> new" rather than "new --> new".
        old_resname = residue_group.atom_groups()[0].resname
        n_removed = 0  # accumulated over all atom groups of this residue
        for atom_group in residue_group.atom_groups():
          n_changed += 1
          atom_group.resname = new_resname
          for atom in atom_group.atoms():
            if (atom.name not in expected_atoms):
              atom_group.remove_atom(atom)
              n_removed += 1
        print(" chain '%s' %s %s --> %s (%d atoms removed)" %
          (chain.id, resid, old_resname, new_resname, n_removed), file=out)
  pdb_hierarchy.atoms().reset_i_seq()
  return n_changed