def process_pdb(cfg, atom_num_dict, mol_num_dict, element_dict):
    """Read a PDB file, renumber/annotate its atoms, and write the edited file.

    The input file is ``cfg[PDB_FILE]``. 'REMARK' and 'CRYST1' lines are kept as
    head content, atom lines are parsed by the fixed column boundaries found in
    ``cfg``, and every other non-blank line is kept as tail content. The result
    is written with ``print_pdb``; if ``cfg[RESID_QMMM]`` is non-empty,
    'amino_id.dat' and 'vmd_protein_atoms.dat' are written as well.

    NOTE(review): the fixed-width literals compared below ('ATOM ', ' OH2 ',
    ' H1 ', ' H2 ') must have the same width as the configured column slices;
    confirm against the configuration defaults.

    :param cfg: configuration mapping (file locations, column boundaries, options)
    :param atom_num_dict: maps sequential atom count -> new atom id (may be empty)
    :param mol_num_dict: maps molecule number read from the file -> new molecule number
    :param element_dict: maps atom type -> element
    :raises InvalidDataError: if an atom type in a QMMM residue is not in element_dict
    """
    pdb_loc = cfg[PDB_FILE]
    pdb_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    # to allow warning to be printed once and only once
    missing_types = []
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []

    # (integer atom id, line_struct) pairs; the integer id is kept so that any later
    # sort is numeric rather than on the formatted (possibly hex) atom-number string
    keyed_atoms = []

    with open(pdb_loc) as f:
        wat_count = 0
        atom_count = 0
        mol_count = 1
        current_mol = None
        last_mol_num = None

        for line in f:
            line = line.strip()
            if not line:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]

            # head_content to contain everything before the atoms section
            if line_head == 'REMARK' or line_head == 'CRYST1':
                pdb_data[HEAD_CONTENT].append(line)

            # atoms_content to contain the parsed atom records
            elif line_head == 'ATOM ':
                # The template PDB may show ***** after atom 99999, so atoms are renumbered
                # from a running count instead of reading the atom number column.
                atom_count += 1

                # For reordering atoms
                atom_id = atom_num_dict.get(atom_count, atom_count)

                # Only 5 characters are available: switch to hex above 99999
                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                    if len(atom_num) > 5:
                        warning("Hex representation of {} is {}, which is greater than 5 characters. This "
                                "will affect the PDB output formatting.".format(atom_id, atom_num))
                else:
                    atom_num = '{:5d}'.format(atom_id)

                atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                res_type = line[cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                mol_num = int(line[cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                occ_t = line[cfg[PDB_Z_LAST_CHAR]:cfg[PDB_LAST_T_CHAR]]
                element = line[cfg[PDB_LAST_T_CHAR]:cfg[PDB_LAST_ELEM_CHAR]]
                last_cols = line[cfg[PDB_LAST_ELEM_CHAR]:]

                # For user-specified changing of molecule number
                if mol_num in mol_num_dict:
                    mol_num = mol_num_dict[mol_num]

                # If doing water molecule checking: expect OH2, H1, H2 in that order
                if cfg[FIRST_WAT_ID] <= atom_count <= cfg[LAST_WAT_ID]:
                    wat_position = wat_count % 3
                    if wat_position == 0:
                        current_mol = mol_num
                        if atom_type != ' OH2 ':
                            warning('Expected an OH2 atom to be the first atom of a water molecule. '
                                    'Check line: {}'.format(line))
                    else:
                        if current_mol != mol_num:
                            warning('Water not in order on line:', line)
                        if wat_position == 1:
                            if atom_type != ' H1 ':
                                warning('Expected an H1 atom to be the second atom of a water molecule. '
                                        'Check line: {}'.format(line))
                        else:
                            if atom_type != ' H2 ':
                                warning('Expected an H2 atom to be the third atom of a water molecule. '
                                        'Check line: {}'.format(line))
                    wat_count += 1

                # Collect ids for the QMMM residues; the alpha carbon is only recorded for
                # the link list and is not added to the per-element id lists
                if mol_num in cfg[RESID_QMMM] and atom_type not in SKIP_ATOM_TYPES:
                    if atom_type == C_ALPHA:
                        ca_res_atom_id_dict[mol_num] = atom_id
                    else:
                        if atom_type == C_BETA:
                            cb_res_atom_id_dict[mol_num] = atom_id
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError("Did not find atom type '{}' in the element dictionary. Please "
                                                   "provide a new atom type, element dictionary (using keyword {} "
                                                   "in the configuration file) that includes all atom types in the "
                                                   "residues identified with the '{}' key."
                                                   "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM))
                        qmmm_elem_id_dict.setdefault(element, []).append(atom_id)
                        # VMD indexes are zero-based
                        atoms_for_vmd.append(atom_id - 1)

                if cfg[ADD_ELEMENTS] and atom_count <= cfg[LAST_ADD_ELEM]:
                    if atom_type in element_dict:
                        element = element_dict[atom_type]
                    elif atom_type not in missing_types:
                        warning("Please add atom type '{}' to dictionary of elements. Will not write/overwrite "
                                "element type in the pdb output.".format(atom_type))
                        missing_types.append(atom_type)

                # For numbering molecules from 1 to end
                if cfg[RENUM_MOL]:
                    if last_mol_num is None:
                        last_mol_num = mol_num

                    if mol_num != last_mol_num:
                        last_mol_num = mol_num
                        mol_count += 1
                        if mol_count == 10000:
                            warning("Molecule numbers greater than 9999 will be printed in hex")

                    # Due to PDB format constraints, need to print in hex above 9999 molecules.
                    if mol_count > 9999:
                        mol_num = format(mol_count, 'x')
                        if len(mol_num) > 4:
                            # report the molecule values (not the atom values) that overflowed
                            warning("Hex representation of {} is {}, which is greater than 4 characters. This "
                                    "will affect the PDB output formatting.".format(mol_count, mol_num))
                    else:
                        mol_num = '{:4d}'.format(mol_count)

                line_struct = [line_head, atom_num, atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z,
                               occ_t, element, last_cols]
                keyed_atoms.append((atom_id, line_struct))

            # tail_content to contain everything after the atoms section
            else:
                pdb_data[TAIL_CONTENT].append(line)

    # Only sort if there is renumbering; sort on the integer id because the formatted
    # atom number is a string (space-padded decimal or hex) that does not order numerically
    if len(atom_num_dict) > 0:
        keyed_atoms = sorted(keyed_atoms, key=lambda pair: pair[0])
    pdb_data[ATOMS_CONTENT] = [line_struct for _, line_struct in keyed_atoms]

    if cfg[PDB_NEW_FILE] is None:
        f_name = create_out_fname(cfg[PDB_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
    else:
        f_name = create_out_fname(cfg[PDB_NEW_FILE], base_dir=cfg[OUT_BASE_DIR])
    print_pdb(pdb_data[HEAD_CONTENT], pdb_data[ATOMS_CONTENT], pdb_data[TAIL_CONTENT],
              f_name, cfg[PDB_FORMAT])

    if len(cfg[RESID_QMMM]) > 0:
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        # first element overwrites any existing file; the rest append
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)
        f_name = create_out_fname('vmd_protein_atoms.dat', base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
def get_evb_atoms(cfg, chk_file):
    """Collect water oxygen/hydrogen atom ids from one checkpoint file and print them.

    Lines are scanned until ``NUM_ATOMS_PAT`` matches (giving the atom count); the
    following non-blank lines are parsed as atom records until that many have been
    read. Atoms whose number exceeds ``cfg[LAST_EXCLUDE_ID]`` are sorted into oxygen
    and hydrogen id lists, which are then written with ``print_qm_kind`` and
    ``print_vmd_list``.

    :param cfg: configuration mapping
    :param chk_file: location of the checkpoint file to read
    :raises InvalidDataError: if a non-excluded atom is neither 'O' nor 'H', or if the
        computed charge differs from a specified ``cfg[EXPECTED_CHARGE]``
    """
    oxygen_ids = []
    hydrogen_ids = []
    ids_by_type = {'O': oxygen_ids, 'H': hydrogen_ids}
    atom_records = []
    expected_atoms = None
    state = SEC_HEAD

    with open(chk_file) as chk:
        for raw_line in chk:
            text = raw_line.strip()

            # nothing after the atoms section is needed
            if state == SEC_TAIL:
                break

            # header: only looking for the line giving the number of atoms
            if state == SEC_HEAD:
                count_match = NUM_ATOMS_PAT.match(text)
                if count_match:
                    expected_atoms = int(count_match.group(1))
                    state = SEC_ATOMS
                continue

            if state != SEC_ATOMS or not text:
                continue

            fields = text.split()
            index = int(fields[0])
            atom_num = int(fields[1])
            x, y, z = map(float, fields[2:5])
            atom_type = fields[5]
            atom_records.append([index, atom_num, x, y, z, atom_type])

            if atom_num > cfg[LAST_EXCLUDE_ID]:
                id_list = ids_by_type.get(atom_type)
                if id_list is None:
                    raise InvalidDataError("Expected atom types are 'O' and 'H' (looking for water "
                                           "molecules only). Found type '{}' for line:\n {}\n"
                                           "Use the '{}' keyword to specify the last atom to exclude (i.e. "
                                           "the last protein atom)."
                                           "".format(atom_type, text, LAST_EXCLUDE_ID))
                id_list.append(atom_num)

            if len(atom_records) == expected_atoms:
                state = SEC_TAIL

    # Data validation: checking total charge (+1 per hydrogen, -2 per oxygen)
    oxygen_count = len(oxygen_ids)
    hydrogen_count = len(hydrogen_ids)
    net_charge = hydrogen_count - 2 * oxygen_count
    if cfg[EXPECTED_CHARGE] is None:
        print("Found {} oxygen atoms and {} hydrogen atoms for a total charge of {}."
              "".format(oxygen_count, hydrogen_count, add_sign(net_charge)))
    elif net_charge != cfg[EXPECTED_CHARGE]:
        raise InvalidDataError("Expected a total charge of {} but found {} for file: {}"
                               "".format(add_sign(cfg[EXPECTED_CHARGE]), add_sign(net_charge), chk_file))

    # printing: hydrogens start the file, oxygens are appended
    water_fname = create_out_fname(chk_file, prefix='water_', ext='.dat', base_dir=cfg[OUT_BASE_DIR],
                                   remove_prefix='CHK_')
    print_qm_kind(hydrogen_ids, 'H', water_fname)
    print_qm_kind(oxygen_ids, 'O', water_fname, mode='a')
    vmd_fname = create_out_fname(chk_file, prefix='vmd_water_', ext='.dat', base_dir=cfg[OUT_BASE_DIR],
                                 remove_prefix='CHK_')
    print_vmd_list(oxygen_ids + hydrogen_ids, vmd_fname)
def process_psf(cfg, atom_num_dict, mol_num_dict, element_dict, radii_dict):
    """Read a PSF file, optionally renumber atoms/residues, and write requested outputs.

    The file ``cfg[PSF_FILE]`` is split into head, atoms, and tail sections; only the
    atoms section is edited. A new PSF is written when any renumbering is requested.
    When ``cfg[PRINT_FOR_CP2K]`` is set, 'amino_id.dat', 'mm_kinds.dat', and
    'vmd_protein_atoms.dat' are also written.

    NOTE(review): a second ``process_psf`` definition appears later in this module and
    replaces this one at import time; confirm which is intended to remain.

    :param cfg: configuration mapping
    :param atom_num_dict: maps atom number read from the file -> new atom number
    :param mol_num_dict: maps residue id read from the file -> new residue id
    :param element_dict: maps atom type -> element
    :param radii_dict: maps atom type -> radius used for the MM kind output
    :raises InvalidDataError: if a QM/QMMM atom type is not in element_dict
    """
    psf_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    num_atoms_pat = re.compile(r"(\d+).*NATOM$")
    num_atoms = 1
    section = SEC_HEAD

    # for printing qmmm info
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []
    types_for_mm_kind = set()
    qmmm_charge = 0

    # for RENUM_MOL
    last_resid = None
    cur_mol_num = 0

    with open(cfg[PSF_FILE]) as f:
        for line in f:
            s_line = line.strip()

            # head_content to contain everything up to and including the NATOM line,
            # which also gives the number of atoms
            if section == SEC_HEAD:
                psf_data[HEAD_CONTENT].append(line.rstrip())
                atoms_match = num_atoms_pat.match(s_line)
                if atoms_match:
                    num_atoms = int(atoms_match.group(1))
                    section = SEC_ATOMS

            elif section == SEC_ATOMS:
                if not s_line:
                    continue
                split_line = s_line.split()
                atom_num = int(split_line[0])
                segid = split_line[1]
                resid = int(split_line[2])
                resname = split_line[3]
                atom_type = split_line[4]
                charmm_type = split_line[5]
                charge = float(split_line[6])
                atom_wt = float(split_line[7])
                zero = split_line[8]

                # For reordering atoms
                if atom_num in atom_num_dict:
                    atom_num = atom_num_dict[atom_num]

                # For user-specified changing of molecule number
                if resid in mol_num_dict:
                    resid = mol_num_dict[resid]

                # Sequential renumbering: bump the counter each time the resid changes
                if cfg[RENUM_MOL]:
                    if resid != last_resid:
                        last_resid = resid
                        cur_mol_num += 1
                    resid = cur_mol_num

                atom_struct = [atom_num, segid, resid, resname, atom_type, charmm_type,
                               charge, atom_wt, zero]
                psf_data[ATOMS_CONTENT].append(atom_struct)

                in_qmmm = resid in cfg[RESID_QMMM]
                # `and` binds tighter than `or`: the skip list only filters QMMM residues
                if resid in cfg[RESID_QM] or (in_qmmm and atom_type not in cfg[SKIP_ATOM_TYPES]):
                    if in_qmmm and atom_type == C_ALPHA:
                        # alpha carbon is only recorded for the link list
                        ca_res_atom_id_dict[resid] = atom_num
                    else:
                        if in_qmmm and atom_type == C_BETA:
                            cb_res_atom_id_dict[resid] = atom_num
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError(
                                "Did not find atom type '{}' in the element dictionary. Please "
                                "provide a new atom type, element dictionary (using keyword {} "
                                "in the configuration file) that includes all atom types in the "
                                "residues identified with the '{}' key."
                                "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM))
                        qmmm_elem_id_dict.setdefault(element, []).append(atom_num)
                        qmmm_charge += charge
                        # VMD indexes are zero-based
                        atoms_for_vmd.append(atom_num - 1)

                if cfg[PRINT_FOR_CP2K]:
                    types_for_mm_kind.add(atom_type)

                if len(psf_data[ATOMS_CONTENT]) == num_atoms:
                    section = SEC_TAIL

            # tail_content to contain everything after the atoms section
            elif section == SEC_TAIL:
                psf_data[TAIL_CONTENT].append(line.rstrip())

    if len(atom_num_dict) > 0:
        warning("This program does not yet edit any sections other than the atoms section. "
                "If you are renumbering atoms, the bonds, angles, dihedrals, impropers, and "
                "cross-terms sections will not match.")
        psf_data[ATOMS_CONTENT] = sorted(psf_data[ATOMS_CONTENT], key=lambda entry: entry[0])

    if cfg[RENUM_MOL] or len(atom_num_dict) + len(mol_num_dict) > 0:
        if cfg[PSF_NEW_FILE] is None:
            f_name = create_out_fname(cfg[PSF_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
        else:
            f_name = cfg[PSF_NEW_FILE]
        list_to_file(psf_data[HEAD_CONTENT] + psf_data[ATOMS_CONTENT] + psf_data[TAIL_CONTENT],
                     f_name, list_format=cfg[PSF_FORMAT])

    if cfg[PRINT_FOR_CP2K]:
        print("Total charge from QM atoms: {:.2f}".format(qmmm_charge))

        # create CP2K input listing amino atom ids
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)

        # create CP2K input listing MM atom type radii; sorted so the file content does
        # not depend on set iteration order
        f_name = create_out_fname('mm_kinds.dat', base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for atom_type in sorted(types_for_mm_kind):
            # guard only the lookup so a KeyError raised while printing is not masked
            try:
                radius = radii_dict[atom_type]
            except KeyError:
                warning("Did not find atom type '{}' in the atom_type to radius dictionary: {}\n"
                        " '{}' printed without this type; user may manually add its radius specification.\n"
                        " To print this file with all MM types, use the keyword '{}' in the configuration file \n"
                        " to identify a file with atom_type,radius (one per line, comma-separated) with all "
                        "MM types in the psf".format(atom_type, cfg[RADII_DICT_FILE], 'mm_kinds.dat',
                                                     RADII_DICT_FILE))
                continue
            print_mm_kind(atom_type, radius, f_name, mode=print_mode)
            print_mode = 'a'

        # create VMD input listing amino atom indexes (base-zero counting)
        f_name = create_out_fname('vmd_protein_atoms.dat', base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
def process_file(cfg):
    """Extract water oxygen/hydrogen atom ids from every checkpoint file in a list file.

    ``cfg[CHK_FILE_LIST]`` names a file with one checkpoint file location per line
    (blank lines are ignored). For each checkpoint file, the header is scanned for the
    'ATOMS <n>' line, then that many atom records are read. Atoms numbered above
    ``cfg[LAST_EXCLUDE_ID]`` are sorted into oxygen and hydrogen id lists, the total
    charge is validated or reported, and the ids are written with ``print_qm_kind``
    and ``print_vmd_list``.

    :param cfg: configuration mapping
    :raises InvalidDataError: if a non-excluded atom is neither 'O' nor 'H', or if the
        computed charge differs from a specified ``cfg[EXPECTED_CHARGE]``
    """
    chk_list_loc = cfg[CHK_FILE_LIST]
    num_atoms_pat = re.compile(r"^ATOMS (\d+).*")
    last_exclude_id = cfg[LAST_EXCLUDE_ID]

    with open(chk_list_loc) as f:
        for chk_file in f:
            chk_file = chk_file.strip()
            # a blank (e.g. trailing) line in the list is not a file name
            if not chk_file:
                continue

            section = SEC_HEAD
            num_atoms = None
            atoms_read = 0
            o_ids = []
            h_ids = []

            with open(chk_file) as d:
                for line in d:
                    line = line.strip()

                    # header: only the line giving the number of atoms is needed
                    if section == SEC_HEAD:
                        atoms_match = num_atoms_pat.match(line)
                        if atoms_match:
                            num_atoms = int(atoms_match.group(1))
                            section = SEC_ATOMS

                    elif section == SEC_ATOMS:
                        if not line:
                            continue
                        split_line = line.split()
                        # index and coordinates are not used further, but converting
                        # them still rejects malformed records
                        int(split_line[0])
                        atom_num = int(split_line[1])
                        x, y, z = map(float, split_line[2:5])
                        atom_type = split_line[5]
                        atoms_read += 1

                        if atom_num > last_exclude_id:
                            if atom_type == 'O':
                                o_ids.append(atom_num)
                            elif atom_type == 'H':
                                h_ids.append(atom_num)
                            else:
                                raise InvalidDataError("Expected atom types are 'O' and 'H' (looking for water "
                                                       "molecules only). Found type '{}' for line:\n {}\n"
                                                       "Use the '{}' keyword to specify the last atom to exclude "
                                                       "(i.e. the last protein atom)."
                                                       "".format(atom_type, line, LAST_EXCLUDE_ID))

                        if atoms_read == num_atoms:
                            section = SEC_TAIL

                    # nothing after the atoms section is needed
                    elif section == SEC_TAIL:
                        break

            # Data validation: checking total charge (+1 per hydrogen, -2 per oxygen)
            num_o = len(o_ids)
            num_h = len(h_ids)
            total_charge = num_h - 2 * num_o
            if cfg[EXPECTED_CHARGE] is None:
                print("Found {} oxygen atoms and {} hydrogen atoms for a total charge of {}."
                      "".format(num_o, num_h, add_sign(total_charge)))
            elif total_charge != cfg[EXPECTED_CHARGE]:
                raise InvalidDataError("Expected a total charge of {} but found {} for file: {}"
                                       "".format(add_sign(cfg[EXPECTED_CHARGE]), add_sign(total_charge),
                                                 chk_file))

            # printing: hydrogens start the file, oxygens are appended
            f_name = create_out_fname(chk_file, prefix='water_', ext='.dat', base_dir=cfg[OUT_BASE_DIR],
                                      remove_prefix='CHK_')
            print_qm_kind(h_ids, 'H', f_name)
            print_qm_kind(o_ids, 'O', f_name, mode='a')
            f_name = create_out_fname(chk_file, prefix='vmd_water_', ext='.dat', base_dir=cfg[OUT_BASE_DIR],
                                      remove_prefix='CHK_')
            print_vmd_list(o_ids + h_ids, f_name)
def process_psf(cfg, atom_num_dict, mol_num_dict, element_dict, radii_dict):
    """Read a PSF file, optionally renumber atoms/residues, and write requested outputs.

    The file ``cfg[PSF_FILE]`` is split into head, atoms, and tail sections; only the
    atoms section is edited. A new PSF is written when any renumbering is requested.
    When ``cfg[PRINT_FOR_CP2K]`` is set, 'amino_id.dat', 'mm_kinds.dat', and
    'vmd_protein_atoms.dat' are also written.

    :param cfg: configuration mapping
    :param atom_num_dict: maps atom number read from the file -> new atom number
    :param mol_num_dict: maps residue id read from the file -> new residue id
    :param element_dict: maps atom type -> element
    :param radii_dict: maps atom type -> radius used for the MM kind output
    :raises InvalidDataError: if a QM/QMMM atom type is not in element_dict
    """
    psf_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    num_atoms_pat = re.compile(r"(\d+).*NATOM$")
    num_atoms = 1
    section = SEC_HEAD

    # for printing qmmm info
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []
    types_for_mm_kind = set()
    qmmm_charge = 0

    # for RENUM_MOL
    last_resid = None
    cur_mol_num = 0

    with open(cfg[PSF_FILE]) as f:
        # iterate lazily instead of loading every line with readlines()
        for line in f:
            s_line = line.strip()

            # head_content to contain everything up to and including the NATOM line,
            # which also gives the number of atoms
            if section == SEC_HEAD:
                psf_data[HEAD_CONTENT].append(line.rstrip())
                atoms_match = num_atoms_pat.match(s_line)
                if atoms_match:
                    num_atoms = int(atoms_match.group(1))
                    section = SEC_ATOMS

            elif section == SEC_ATOMS:
                if not s_line:
                    continue
                split_line = s_line.split()
                atom_num = int(split_line[0])
                segid = split_line[1]
                resid = int(split_line[2])
                resname = split_line[3]
                atom_type = split_line[4]
                charmm_type = split_line[5]
                charge = float(split_line[6])
                atom_wt = float(split_line[7])
                zero = split_line[8]

                # For reordering atoms
                if atom_num in atom_num_dict:
                    atom_num = atom_num_dict[atom_num]

                # For user-specified changing of molecule number
                if resid in mol_num_dict:
                    resid = mol_num_dict[resid]

                # Sequential renumbering: bump the counter each time the resid changes
                if cfg[RENUM_MOL]:
                    if resid != last_resid:
                        last_resid = resid
                        cur_mol_num += 1
                    resid = cur_mol_num

                atom_struct = [atom_num, segid, resid, resname, atom_type, charmm_type,
                               charge, atom_wt, zero]
                psf_data[ATOMS_CONTENT].append(atom_struct)

                is_qmmm_res = resid in cfg[RESID_QMMM]
                # `and` binds tighter than `or`: the skip list only filters QMMM residues
                if resid in cfg[RESID_QM] or (is_qmmm_res and atom_type not in cfg[SKIP_ATOM_TYPES]):
                    if is_qmmm_res and atom_type == C_ALPHA:
                        # alpha carbon is only recorded for the link list
                        ca_res_atom_id_dict[resid] = atom_num
                    else:
                        if is_qmmm_res and atom_type == C_BETA:
                            cb_res_atom_id_dict[resid] = atom_num
                        if atom_type in element_dict:
                            element = element_dict[atom_type]
                        else:
                            raise InvalidDataError(
                                "Did not find atom type '{}' in the element dictionary. Please "
                                "provide a new atom type, element dictionary (using keyword {} "
                                "in the configuration file) that includes all atom types in the "
                                "residues identified with the '{}' key."
                                "".format(atom_type, ELEMENT_DICT_FILE, RESID_QMMM)
                            )
                        qmmm_elem_id_dict.setdefault(element, []).append(atom_num)
                        qmmm_charge += charge
                        # VMD indexes are zero-based
                        atoms_for_vmd.append(atom_num - 1)

                if cfg[PRINT_FOR_CP2K]:
                    types_for_mm_kind.add(atom_type)

                if len(psf_data[ATOMS_CONTENT]) == num_atoms:
                    section = SEC_TAIL

            # tail_content to contain everything after the atoms section
            elif section == SEC_TAIL:
                psf_data[TAIL_CONTENT].append(line.rstrip())

    if len(atom_num_dict) > 0:
        warning(
            "This program does not yet edit any sections other than the atoms section. "
            "If you are renumbering atoms, the bonds, angles, dihedrals, impropers, and "
            "cross-terms sections will not match."
        )
        psf_data[ATOMS_CONTENT] = sorted(psf_data[ATOMS_CONTENT], key=lambda entry: entry[0])

    if cfg[RENUM_MOL] or len(atom_num_dict) + len(mol_num_dict) > 0:
        if cfg[PSF_NEW_FILE] is None:
            f_name = create_out_fname(cfg[PSF_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
        else:
            f_name = cfg[PSF_NEW_FILE]
        list_to_file(
            psf_data[HEAD_CONTENT] + psf_data[ATOMS_CONTENT] + psf_data[TAIL_CONTENT],
            f_name,
            list_format=cfg[PSF_FORMAT],
        )

    if cfg[PRINT_FOR_CP2K]:
        print("Total charge from QM atoms: {:.2f}".format(qmmm_charge))

        # create CP2K input listing amino atom ids
        f_name = create_out_fname("amino_id.dat", base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for elem in qmmm_elem_id_dict:
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = "a"
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)

        # create CP2K input listing MM atom type radii; sorted so the file content does
        # not depend on set iteration order
        f_name = create_out_fname("mm_kinds.dat", base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for atom_type in sorted(types_for_mm_kind):
            # guard only the lookup so a KeyError raised while printing is not masked
            try:
                radius = radii_dict[atom_type]
            except KeyError:
                warning(
                    "Did not find atom type '{}' in the atom_type to radius dictionary: {}\n"
                    " '{}' printed without this type; user may manually add its radius specification.\n"
                    " To print this file with all MM types, use the keyword '{}' in the configuration file \n"
                    " to identify a file with atom_type,radius (one per line, comma-separated) with all "
                    "MM types in the psf".format(atom_type, cfg[RADII_DICT_FILE], "mm_kinds.dat", RADII_DICT_FILE)
                )
                continue
            print_mm_kind(atom_type, radius, f_name, mode=print_mode)
            print_mode = "a"

        # create VMD input listing amino atom indexes (base-zero counting)
        f_name = create_out_fname("vmd_protein_atoms.dat", base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=" ")