def create_constraints_file(preminimization_log, outfile_path):
    '''Write a constraints file built from the "c-alpha" lines of a preminimization log.

    This does the work of the convert_to_cst_file.sh script in the Rosetta repository.

    Each line of preminimization_log that starts with "c-alpha" is split on whitespace and
    its fields at (zero-based) positions 5, 7, 9, and 12 are formatted into one
    "AtomPair CA ... CA ... HARMONIC ... ..." line. The resulting lines are joined with
    newlines and written to outfile_path, which is also the return value.
    '''
    log_lines = read_file(preminimization_log).split('\n')
    # Tokenize only the lines of interest; all other log output is ignored.
    tokenized = (entry.split() for entry in log_lines if entry.startswith("c-alpha"))
    constraints = [
        "AtomPair CA %s CA %s HARMONIC %s %s" % (fields[5], fields[7], fields[9], fields[12])
        for fields in tokenized
    ]
    write_file(outfile_path, '\n'.join(constraints))
    return outfile_path
def update_pdbs_json():
    '''This function was used to update the pdbs.json file to include chain sequences and types.

    Loads ../json/pdbs.json, and for each record (keyed by a 4-character ID) parses
    ../pdbs/<UPPERCASE ID>.pdb and stores a 'Chains' dict on the record mapping each chain ID
    to its 'Sequence' and 'Type'. The records, re-keyed by the upper-cased ID, are written to
    ../json/pdbs.json.new. The original pdbs.json file is not overwritten.
    '''
    source_records = json.loads(read_file(os.path.join('..', 'json', 'pdbs.json')))
    updated_records = {}
    for pdb_id, record in source_records.items():
        assert len(pdb_id) == 4
        upper_id = pdb_id.upper()
        pdb = PDB(read_file(os.path.join('..', 'pdbs', upper_id + '.pdb')))

        # A chain may be known from any one of the three sources, so take their union.
        chain_ids = (
            set(pdb.chain_types.keys())
            | set(pdb.seqres_chain_order)
            | set(pdb.atom_sequences.keys())
        )
        record['Chains'] = {
            chain_id: {
                # str() is applied even when the chain has no ATOM sequence (.get() returns None).
                'Sequence': str(pdb.atom_sequences.get(chain_id)),
                'Type': pdb.chain_types.get(chain_id),
            }
            for chain_id in chain_ids
        }
        updated_records[upper_id] = record

    write_file(
        os.path.join('..', 'json', 'pdbs.json.new'),
        json.dumps(updated_records, indent=4, sort_keys=True),
    )
def update_pdbs_json():
    '''This function was used to update the pdbs.json file to include chain sequences and types.

    Reads ../json/pdbs.json (a JSON object keyed by 4-character IDs; the key length is asserted).
    For each entry, ../pdbs/<UPPERCASE ID>.pdb is parsed and a 'Chains' dict is stored on the
    entry, mapping every chain ID found in the chain types, the SEQRES chain order, or the ATOM
    sequences to a dict with keys 'Sequence' and 'Type'. The entries are re-keyed by the
    upper-cased ID and written, sorted and indented, to ../json/pdbs.json.new.

    Returns None. The original pdbs.json file is left untouched.
    '''
    pdb_data = {}
    pdb_data_ = json.loads(read_file(os.path.join('..', 'json', 'pdbs.json')))
    # dict.items() is available on both Python 2 and Python 3; dict.iteritems() was removed in
    # Python 3 and raises AttributeError there.
    for k, v in pdb_data_.items():
        assert(len(k) == 4)
        newk = k.upper()
        pdb = PDB(read_file(os.path.join('..', 'pdbs', newk + '.pdb')))
        # A chain may appear in only one of the three sources, so use the union of all of them.
        chain_ids = set(pdb.chain_types.keys()).union(set(pdb.seqres_chain_order)).union(set(pdb.atom_sequences.keys()))
        v['Chains'] = dict.fromkeys(chain_ids)
        for chain_id in chain_ids:
            v['Chains'][chain_id] = dict(
                # str() is applied unconditionally, so a chain without an ATOM sequence
                # (.get() returning None) is stored as the string 'None'.
                Sequence = str(pdb.atom_sequences.get(chain_id)),
                Type = pdb.chain_types.get(chain_id),
            )
        pdb_data[newk] = v
    write_file(os.path.join('..', 'json', 'pdbs.json.new'), json.dumps(pdb_data, indent = 4, sort_keys=True))
for chain_id, sequence in stripped_pdb.atom_sequences.iteritems(): assert(len(sequence) > 0) # Check for CSE and MSE try: if 'CSE' in stripped_pdb.residue_types: raise Exception('This case contains a CSE residue which may (or may not) cause an issue with Rosetta depending on the version.') elif 'MSE' in stripped_pdb.residue_types: raise Exception('This case contains an MSE residue which may (or may not) cause an issue with Rosetta depending on the version.') # It looks like MSE (and CSE?) may now be handled - https://www.rosettacommons.org/content/pdb-files-rosetta-format except Exception, e: print('%s: %s, chain %s' % (str(e), str(stripped_pdb.pdb_id), chain)) # Turn the lines array back into a valid PDB file if not(skip_if_exists) or not(os.path.exists(stripped_pdb_path)): write_file(stripped_pdb_path, '\n'.join(stripped_pdb.lines)) # Create the mapping between PDB and Rosetta residue numbering # Note: In many Rosetta protocols, '-ignore_unrecognized_res' and '-ignore_zero_occupancy false' are used to allow # Rosetta to work with structures with missing data and non-canonicals. In those cases, we should supply both flags # in the string below. Since protocol 16 only uses '-ignore_unrecognized_res', we only use that flag below as otherwise # we could break the mapping. rosetta_scripts_bin = os.path.join(settings['local_rosetta_bin'], 'rosetta_scripts%s' % settings['rosetta_binary_type']) rosetta_database_path = settings['local_rosetta_db_dir'] if not os.path.exists(rosetta_scripts_bin): raise Exception('The Rosetta scripts executable "{0}" could not be found. Please check your configuration file.'.format(rosetta_database_path)) if not os.path.exists(rosetta_database_path): raise Exception('The path to the Rosetta database "{0}" could not be found. 
Please check your configuration file.'.format(rosetta_database_path)) stripped_pdb.construct_pdb_to_rosetta_residue_map(rosetta_scripts_bin,rosetta_database_path, extra_command_flags = '-ignore_unrecognized_res') atom_to_rosetta_residue_map = stripped_pdb.get_atom_sequence_to_rosetta_json_map() rosetta_to_atom_residue_map = stripped_pdb.get_rosetta_sequence_to_atom_json_map()
def create_input_files(job_dict, settings, pdb_dir_path, pdb_data_dir, mutfile_data_dir, keypair, dataset_cases, skip_if_exists=False):
    '''Create the stripped PDB files and the mutfiles for the DDG step.

    Mutfiles are created at this point as we need the original PDB to generate the residue mapping.

    Parameters:
        job_dict: dict updated in place. One entry is added, keyed by
            os.path.join(task_subfolder, '_'.join(keypair)), whose value is
            dict(input_file_list=[<stripped PDB path relative to settings['output_dir']>]).
        settings: mapping from which 'local_rosetta_bin', 'rosetta_binary_type',
            'local_rosetta_db_dir', and 'output_dir' are read.
        pdb_dir_path: path handed to PDB.from_filepath to load the original structure.
        pdb_data_dir: directory receiving the stripped PDB file and the two residue-map JSON files.
        mutfile_data_dir: directory receiving one '<RecordID>.mutfile' per dataset case.
        keypair: sequence whose first item is the PDB ID and second item is the chain ID.
        dataset_cases: iterable of dicts; 'PDBFileID', 'Mutations', and 'RecordID' are read from
            each, and 'WildTypeAA', 'ResidueID', 'MutantAA', and 'Chain' from each mutation.
        skip_if_exists: when true, an already existing stripped PDB file is not rewritten.

    Raises:
        Exception: if stripping removes all ATOM lines, or if the Rosetta scripts executable or
            the Rosetta database path does not exist.
        AssertionError: if a chain has an empty sequence, before or after Rosetta processing.

    Errors raised while validating mutations or writing mutfiles are caught and printed together
    with a traceback; they are not propagated, and the remaining dataset cases are not processed.
    '''

    # Read PDB
    pdb_id = keypair[0]
    chain = keypair[1]
    pdb = PDB.from_filepath(pdb_dir_path)
    stripped_pdb_path = os.path.join(pdb_data_dir, '%s_%s.pdb' % (pdb_id, chain))

    # Strip the PDB to the list of chains. This also renumbers residues in the PDB for Rosetta.
    chains = [chain]
    pdb.strip_to_chains(chains)
    pdb.strip_HETATMs()
    stripped_pdb = PDB('\n'.join(pdb.lines))

    # Check to make sure that we haven't stripped all the ATOM lines
    if not any(line[0:4] == "ATOM" for line in stripped_pdb.lines):
        raise Exception("No ATOM lines remain in the stripped PDB file %s." % stripped_pdb_path)

    # Assert that there are no empty sequences
    assert sorted(stripped_pdb.atom_sequences.keys()) == sorted(chains)
    for sequence in stripped_pdb.atom_sequences.values():
        assert len(sequence) > 0

    # Check for CSE and MSE. These are warnings only: the exception is raised and immediately
    # caught so that the message is printed and processing continues.
    try:
        if 'CSE' in stripped_pdb.residue_types:
            raise Exception('This case contains a CSE residue which may (or may not) cause an issue with Rosetta depending on the version.')
        elif 'MSE' in stripped_pdb.residue_types:
            raise Exception('This case contains an MSE residue which may (or may not) cause an issue with Rosetta depending on the version.')
            # It looks like MSE (and CSE?) may now be handled - https://www.rosettacommons.org/content/pdb-files-rosetta-format
    except Exception as e:
        print('%s: %s, chain %s' % (str(e), str(stripped_pdb.pdb_id), chain))

    # Turn the lines array back into a valid PDB file
    if not skip_if_exists or not os.path.exists(stripped_pdb_path):
        write_file(stripped_pdb_path, '\n'.join(stripped_pdb.lines))

    # Create the mapping between PDB and Rosetta residue numbering
    # Note: In many Rosetta protocols, '-ignore_unrecognized_res' and '-ignore_zero_occupancy false' are used to allow
    # Rosetta to work with structures with missing data and non-canonicals. In those cases, we should supply both flags
    # in the string below. Since protocol 16 only uses '-ignore_unrecognized_res', we only use that flag below as otherwise
    # we could break the mapping.
    rosetta_scripts_bin = os.path.join(settings['local_rosetta_bin'], 'rosetta_scripts%s' % settings['rosetta_binary_type'])
    rosetta_database_path = settings['local_rosetta_db_dir']
    if not os.path.exists(rosetta_scripts_bin):
        # Report the executable path that was actually checked (not the database path).
        raise Exception('The Rosetta scripts executable "{0}" could not be found. Please check your configuration file.'.format(rosetta_scripts_bin))
    if not os.path.exists(rosetta_database_path):
        raise Exception('The path to the Rosetta database "{0}" could not be found. Please check your configuration file.'.format(rosetta_database_path))
    stripped_pdb.construct_pdb_to_rosetta_residue_map(rosetta_scripts_bin, rosetta_database_path, extra_command_flags='-ignore_unrecognized_res')
    atom_to_rosetta_residue_map = stripped_pdb.get_atom_sequence_to_rosetta_json_map()
    rosetta_to_atom_residue_map = stripped_pdb.get_rosetta_sequence_to_atom_json_map()

    # Save the PDB <-> Rosetta residue mappings to disk
    write_file(os.path.join(pdb_data_dir, '%s_%s.rosetta2pdb.resmap.json' % (pdb_id, chain)), rosetta_to_atom_residue_map)
    write_file(os.path.join(pdb_data_dir, '%s_%s.pdb2rosetta.resmap.json' % (pdb_id, chain)), atom_to_rosetta_residue_map)

    # Assert that there are no empty sequences in the Rosetta-processed PDB file.
    # The first character of each mapped value is compared against the chain ID.
    d = json.loads(rosetta_to_atom_residue_map)
    for chain_id in chains:
        num_chain_residues = sum(1 for z in d.values() if z[0] == chain_id)
        assert num_chain_residues > 0

    # Check that the mutated positions exist and that the wild-type matches the PDB
    try:
        for dataset_case in dataset_cases:
            assert dataset_case['PDBFileID'] == pdb_id

            # Note: I removed a hack here for 1AJ3->1U5P mapping
            # The JSON file does not have the residue IDs in PDB format (5 characters including insertion code) so we need to repad them for the mapping to work
            pdb_mutations = [
                ChainMutation(mutation['WildTypeAA'], PDB.ResidueID2String(mutation['ResidueID']), mutation['MutantAA'], Chain=mutation['Chain'])
                for mutation in dataset_case['Mutations']
            ]
            stripped_pdb.validate_mutations(pdb_mutations)

            # Map the PDB mutations to Rosetta numbering which is used by the mutfile format
            rosetta_mutations = stripped_pdb.map_pdb_residues_to_rosetta_residues(pdb_mutations)
            if (len(rosetta_mutations) != len(pdb_mutations)) or any(m.ResidueID is None for m in rosetta_mutations):
                raise Exception('An error occurred in the residue mapping code for DDG case: %s, %s' % (pdb_id, pdb_mutations))

            # Create the mutfile
            mutfile = Mutfile.from_mutagenesis(rosetta_mutations)
            mutfilename = os.path.join(mutfile_data_dir, '%d.mutfile' % (dataset_case['RecordID']))
            if os.path.exists(mutfilename):
                raise Exception('%s already exists. Check that the RecordIDs in the JSON file are all unique.' % mutfilename)
            write_file(mutfilename, str(mutfile))
    except Exception as e:
        # Best effort: report the failure and still register the job below.
        print(str(e))
        print(traceback.format_exc())

    # Set up --in:file:l parameter
    pdb_relpath = os.path.relpath(stripped_pdb_path, settings['output_dir'])
    # NOTE(review): task_subfolder is not defined in this function; presumably a module-level
    # name - confirm it is in scope wherever this function is called.
    job_dict[os.path.join(task_subfolder, '_'.join(keypair))] = dict(input_file_list=[pdb_relpath])

    # Progress indicator: one dot per processed PDB/chain pair.
    sys.stdout.write('.')
    sys.stdout.flush()