def retrieve(pdb_id, cache_dir = None, bio_cache = None):
    '''Creates a FASTA object by using a cached copy of the file if it exists or by retrieving the file from the RCSB.'''
    pdb_id = pdb_id.upper()
    if bio_cache:
        return FASTA(bio_cache.get_fasta_contents(pdb_id))

    # Check to see whether we have a cached copy
    if cache_dir:
        filename = os.path.join(cache_dir, "%s.fasta" % pdb_id)
        if os.path.exists(filename):
            return FASTA(read_file(filename))
        else:
            filename += ".txt"
            if os.path.exists(filename):
                return FASTA(read_file(filename))

    # Get a copy from the RCSB
    contents = rcsb.retrieve_fasta(pdb_id)

    # Create a cached copy if appropriate
    if cache_dir:
        write_file(os.path.join(cache_dir, "%s.fasta" % pdb_id), contents)

    # Return the object
    return FASTA(contents)
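# A minimal usage sketch for the cached retrieval pattern above. The PDB id and
# cache path are illustrative; the first call fetches the FASTA file from the
# RCSB and caches it, and the second call is served from disk.
def _example_fasta_retrieve(cache_dir = '/tmp/pdb_cache'):
    import os
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    fasta = retrieve('1CBW', cache_dir = cache_dir) # downloads and writes 1CBW.fasta into the cache
    return retrieve('1CBW', cache_dir = cache_dir)  # second call reads the cached copy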
def retrieve(pdb_id, cache_dir = None):
    '''Creates a PDBML object by using cached copies of the files if they exist or by retrieving the files from the RCSB.'''
    pdb_contents = None
    xml_contents = None
    pdb_id = pdb_id.upper()

    if cache_dir:
        # Check to see whether we have a cached copy of the PDB file
        filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
        if os.path.exists(filename):
            pdb_contents = read_file(filename)

        # Check to see whether we have a cached copy of the XML file
        filename = os.path.join(cache_dir, "%s.xml" % pdb_id)
        if os.path.exists(filename):
            xml_contents = read_file(filename)

    # Get any missing files from the RCSB and create cached copies if appropriate
    if not pdb_contents:
        pdb_contents = rcsb.retrieve_pdb(pdb_id)
        if cache_dir:
            write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)

    if not xml_contents:
        xml_contents = rcsb.retrieve_xml(pdb_id)
        if cache_dir:
            write_file(os.path.join(cache_dir, "%s.xml" % pdb_id), xml_contents)

    # Return the object
    return PDBML_slow(xml_contents, pdb_contents)
def create_pruned_structures(output_directory):
    loop_definitions_12 = json.loads(read_file('../structures/12_res/loop_definitions.json'))
    prepare_structures('../structures/12_res/rcsb/reference/*.pdb', os.path.join(output_directory, '12_res_rcsb'), loop_definitions_12, expected_min_loop_length = 12, expected_max_loop_length = 12)
    prepare_structures('../structures/12_res/rosetta/preminimized/*.pdb', os.path.join(output_directory, '12_res_rosetta'), loop_definitions_12, expected_min_loop_length = 12, expected_max_loop_length = 12)

    loop_definitions_14_17 = json.loads(read_file('../structures/14_17_res/loop_definitions.json'))
    prepare_structures('../structures/14_17_res/rcsb/reference/*.pdb', os.path.join(output_directory, '14_17_res'), loop_definitions_14_17, expected_min_loop_length = 14, expected_max_loop_length = 17, remove_hetatm = True)
    prepare_structures('../structures/14_17_res/rosetta/reference/*.pdb', os.path.join(output_directory, '14_17_res'), loop_definitions_14_17, expected_min_loop_length = 14, expected_max_loop_length = 17, remove_hetatm = True)
def add_backbone_atoms_linearly_from_loop_filepaths(self, loop_json_filepath, fasta_filepath, residue_ids):
    '''A utility wrapper around add_backbone_atoms_linearly. Adds backbone atoms in a straight line from the first to the last residue of residue_ids.

       loop_json_filepath is a path to a JSON file using the JSON format for Rosetta loops files. This file identifies the insertion points of the sequence.
       fasta_filepath is a path to a FASTA file with one sequence. This sequence will be used as the sequence for the inserted residues (between the start and stop residues defined in loop_json_filepath).
       residue_ids is a list of PDB chain residues (columns 22-27 of ATOM lines in the PDB format). It is assumed that they are sequential although the logic does not depend on that. This list should have the same length as the sequence identified in the FASTA file.
    '''

    # Parse the loop file
    loop_def = json.loads(read_file(loop_json_filepath))
    assert(len(loop_def['LoopSet']) == 1)
    start_res = loop_def['LoopSet'][0]['start']
    end_res = loop_def['LoopSet'][0]['stop']
    start_res = PDB.ChainResidueID2String(start_res['chainID'], (str(start_res['resSeq']) + start_res['iCode']).strip())
    end_res = PDB.ChainResidueID2String(end_res['chainID'], (str(end_res['resSeq']) + end_res['iCode']).strip())
    assert(start_res in residue_ids)
    assert(end_res in residue_ids)

    # Parse the FASTA file and extract the sequence
    f = FASTA(read_file(fasta_filepath), strict = False)
    assert(len(f.get_sequences()) == 1)
    insertion_sequence = f.sequences[0][2]
    if not len(residue_ids) == len(insertion_sequence):
        raise Exception('The sequence in the FASTA file must have the same length as the list of residues.')

    # Create the insertion sequence (a sub-sequence of the FASTA sequence)
    # The post-condition is that the start and end residues are the first and last elements of kept_residues respectively
    kept_residues = []
    insertion_residue_map = {}
    in_section = False
    found_end = False
    for x in range(len(residue_ids)):
        residue_id = residue_ids[x]
        if residue_id == start_res:
            in_section = True
        if in_section:
            kept_residues.append(residue_id)
            insertion_residue_map[residue_id] = insertion_sequence[x]
            if residue_id == end_res:
                found_end = True
                break
    if not kept_residues:
        raise Exception('The insertion sequence is empty (check the start and end residue ids).')
    if not found_end:
        raise Exception('The end residue was not encountered when iterating over the insertion sequence (check the start and end residue ids).')

    # Identify the start and end Residue objects
    try:
        start_res = self.residues[start_res[0]][start_res[1:]]
        end_res = self.residues[end_res[0]][end_res[1:]]
    except Exception as e:
        raise Exception('The start or end residue could not be found in the PDB file.')

    # The excerpt ends at this point. Per the docstring, the remainder presumably
    # delegates to add_backbone_atoms_linearly; the exact argument list is assumed:
    # return self.add_backbone_atoms_linearly(start_res, end_res, kept_residues, insertion_residue_map)
def retrieve(pdb_id, cache_dir = None, acceptable_sequence_percentage_match = 70.0, require_uniprot_residue_mapping = True, bio_cache = None):
    '''Creates a SIFTS object by using cached copies of the files if they exist or by retrieving the files from the RCSB.
       bio_cache should be a klab.bio.cache.py::BioCache object and is used to avoid reading/downloading cached files repeatedly.
    '''
    pdb_contents = None
    xml_contents = None
    pdb_id = pdb_id.upper()
    l_pdb_id = pdb_id.lower()

    if len(pdb_id) != 4 or not pdb_id.isalnum():
        raise Exception("Bad PDB identifier '%s'." % pdb_id)

    if bio_cache:
        pdb_contents = bio_cache.get_pdb_contents(pdb_id)
        xml_contents = bio_cache.get_sifts_xml_contents(pdb_id)

    if cache_dir:
        if not pdb_contents:
            # Check to see whether we have a cached copy of the PDB file
            filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
            if os.path.exists(filename):
                pdb_contents = read_file(filename)
        if not xml_contents:
            # Check to see whether we have a cached copy of the XML file
            filename = os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id)
            if os.path.exists(filename):
                xml_contents = read_file(filename)

    # Get any missing files from the RCSB and create cached copies if appropriate
    if not pdb_contents:
        pdb_contents = rcsb.retrieve_pdb(pdb_id)
        if cache_dir:
            write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)

    if not xml_contents:
        try:
            xml_contents = retrieve_xml(pdb_id, silent = False)
            if cache_dir:
                write_file(os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id), xml_contents)
        except FTPException550:
            raise MissingSIFTSRecord('The file "%s.sifts.xml.gz" could not be found on the EBI FTP server.' % l_pdb_id)

    # Return the object
    handler = SIFTS(xml_contents, pdb_contents, acceptable_sequence_percentage_match = acceptable_sequence_percentage_match, cache_dir = cache_dir, require_uniprot_residue_mapping = require_uniprot_residue_mapping, bio_cache = bio_cache, pdb_id = pdb_id)
    xml.sax.parseString(xml_contents, handler)
    return handler
def __init__(self):
    pdb_chain_to_pfam_mapping = {}
    pfam_to_pdb_chain_mapping = {}
    lines = read_file(pdb_to_pfam_mapping_file).split('\n')

    # Skip over any comment lines; lines[c] is then the header row
    for c in range(len(lines)):
        if not lines[c].startswith('#'):
            break
    assert(lines[c].split() == ['PDB', 'CHAIN', 'SP_PRIMARY', 'PFAM_ID'])

    # Start at c + 1 so that the header row itself is not parsed as a data record
    for l in lines[c + 1:]:
        if l.strip():
            tokens = l.split()
            pdb_id = tokens[0].lower()
            chain_id = tokens[1]
            pfam_acc = tokens[3]
            pdb_key = (pdb_id, chain_id)
            pdb_chain_to_pfam_mapping[pdb_id] = pdb_chain_to_pfam_mapping.get(pdb_id, {})
            pdb_chain_to_pfam_mapping[pdb_id][chain_id] = pdb_chain_to_pfam_mapping[pdb_id].get(chain_id, set())
            pdb_chain_to_pfam_mapping[pdb_id][chain_id].add(pfam_acc)
            pfam_to_pdb_chain_mapping[pfam_acc] = pfam_to_pdb_chain_mapping.get(pfam_acc, set())
            pfam_to_pdb_chain_mapping[pfam_acc].add(pdb_key)

    self.pdb_chain_to_pfam_mapping = pdb_chain_to_pfam_mapping
    self.pfam_to_pdb_chain_mapping = pfam_to_pdb_chain_mapping
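# Usage sketch for the two mappings built above, assuming this constructor is
# the Pfam class imported elsewhere in this codebase (klab.bio.pfam.Pfam).
# The PDB id and Pfam accession below are illustrative.
def _example_pfam_lookup():
    p = Pfam()
    accessions = p.pdb_chain_to_pfam_mapping.get('1cbw', {}).get('A', set()) # Pfam accessions for chain A (PDB keys are lower-case)
    chains = p.pfam_to_pdb_chain_mapping.get('PF00089', set())              # (pdb_id, chain_id) pairs for a Pfam accession
    return accessions, chains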
def download_fasta(pdb_id, dest_dir, silent = True, filename = None):
    assert(os.path.exists(dest_dir))
    lower_case_filename = os.path.join(dest_dir, '{0}.fasta'.format(pdb_id.lower()))
    upper_case_filename = os.path.join(dest_dir, '{0}.fasta'.format(pdb_id.upper()))
    if filename:
        requested_filename = os.path.join(dest_dir, filename)
        if os.path.exists(requested_filename):
            return read_file(requested_filename)
    if os.path.exists(lower_case_filename):
        return read_file(lower_case_filename)
    elif os.path.exists(upper_case_filename):
        return read_file(upper_case_filename)
    else:
        contents = retrieve_fasta(pdb_id, silent = silent)
        write_file(os.path.join(dest_dir, filename or '{0}.fasta'.format(pdb_id)), contents)
        return contents
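# Usage sketch: download_fasta asserts that dest_dir exists rather than
# creating it, so the (illustrative) cache directory is created first.
def _example_download_fasta(dest_dir = '/tmp/fasta_cache'):
    import os
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    return download_fasta('1CBW', dest_dir) # cached as 1CBW.fasta on the first call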
def compute(self):
    tmp_dir = self.tmp_dir
    if not self.read_only:
        pdb_object = self.pdb.clone()
    else:
        pdb_object = self.pdb # in general, we should not be modifying the structure in this class
    input_filepath = write_temp_file(tmp_dir, pdb_object.get_content(), ftype = 'w', prefix = 'dssp_')
    output_filepath = write_temp_file(tmp_dir, '', ftype = 'w', prefix = 'dssp_')
    try:
        p = _Popen('.', shlex.split('mkdssp -i {input_filepath} -o {output_filepath}'.format(**locals())))
        if p.errorcode:
            if p.stderr.find('empty protein, or no valid complete residues') != -1:
                raise MissingAtomException(p.stdout)
            else:
                raise Exception('An error occurred while calling DSSP:\n%s' % p.stderr)
        self.dssp_output = read_file(output_filepath)
        self.dssp = self.parse_output()
    except MissingAtomException as e:
        os.remove(input_filepath)
        os.remove(output_filepath)
        raise
    except Exception as e:
        os.remove(input_filepath)
        os.remove(output_filepath)
        raise colortext.Exception('%s\n%s' % (str(e), traceback.format_exc()))
    os.remove(input_filepath)
    os.remove(output_filepath)
def update_public_datasets():
    dsets = ['alascan-gpk.json', 'curatedprotherm.json', 'guerois.json', 'kellogg.json', 'potapov.json']
    source_path = '../rawdata/'
    dest_path = '/home/oconchus/t14benchmarking/ddg/input/json/'
    for dset in dsets:
        assert(os.path.exists(os.path.join(source_path, dset)))
        assert(os.path.exists(os.path.join(dest_path, dset)))
    for dset in dsets:
        print(dset)
        source_set = json.loads(read_file(os.path.join(source_path, dset)))
        dest_set = json.loads(read_file(os.path.join(dest_path, dset)))
        assert(len(source_set['data']) == len(dest_set['data']))
        for x in range(len(source_set['data'])):
            assert(dest_set['data'][x]['RecordID'] == source_set['data'][x]['RecordID'])
            dest_set['data'][x]['DerivedMutation'] = source_set['data'][x]['DerivedMutation']
        write_file(os.path.join(dest_path, dset) + '.new', json.dumps(dest_set, indent=4, sort_keys=True))
def compute(self, chain_id):
    tmp_dir = self.tmp_dir
    pdb_object = self.pdb.clone() # we are immediately modifying the PDB file by stripping chains so we need to make a copy
    pdb_object.strip_to_chains(chain_id)
    input_filepath = write_temp_file(tmp_dir, pdb_object.get_content(), ftype = 'w', prefix = 'dssp_')
    output_filepath = write_temp_file(tmp_dir, '', ftype = 'w', prefix = 'dssp_')
    try:
        p = _Popen('.', shlex.split('mkdssp -i {input_filepath} -o {output_filepath}'.format(**locals())))
        if p.errorcode:
            if p.stderr.find('empty protein, or no valid complete residues') != -1:
                raise MissingAtomException(p.stdout)
            else:
                raise Exception('An error occurred while calling DSSP:\n%s' % p.stderr)
        self.dssp_output[chain_id] = read_file(output_filepath)
        self.dssp[chain_id] = self.parse_output(chain_id)
    except MissingAtomException as e:
        os.remove(input_filepath)
        os.remove(output_filepath)
        raise
    except Exception as e:
        os.remove(input_filepath)
        os.remove(output_filepath)
        raise colortext.Exception('%s\n%s' % (str(e), traceback.format_exc()))
    os.remove(input_filepath)
    os.remove(output_filepath)
def _runRScript(r_script_filename, cwd = '.', remove_output = True):
    # Note: despite its name, r_script_filename holds the R script *contents*;
    # they are written out to a temporary file below.
    # Reset to new current working directory
    tmp_dir = False
    if cwd is None:
        tmp_dir = True
        cwd = tempfile.mkdtemp(prefix = '%s-%s-%s_' % (time.strftime("%y%m%d"), getpass.getuser(), 'plot-working-dir'))
    rscriptname = write_temp_file(cwd, r_script_filename)
    p = subprocess.Popen(["R", "CMD", "BATCH", rscriptname], cwd = cwd)
    while True:
        time.sleep(0.3)
        errcode = p.poll()
        if errcode is not None:
            break
    rout = "%s.Rout" % rscriptname
    os.remove(rscriptname)

    rout_contents = None
    if os.path.exists(rout):
        rout_contents = read_file(rout)
        os.remove(rout)

    if errcode != 0:
        print(rout_contents)
        raise Exception("The R script failed with error code %d." % errcode)

    if tmp_dir and remove_output:
        shutil.rmtree(cwd)

    return rout_contents
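# Usage sketch: _runRScript takes the R source as a string (see the note in the
# function) and returns the .Rout log contents. This assumes R is installed and
# on the PATH; the script itself is a trivial example.
def _example_run_r():
    return _runRScript('x <- c(1, 2, 3)\nprint(mean(x))\n')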
def load_sequence_blast(self, sequence, cut_off, matrix, sequence_identity_cut_off):
    if self.cache_dir:
        filepath = self._get_blast_sequence_filepath(sequence, cut_off, matrix, sequence_identity_cut_off)
        if os.path.exists(filepath):
            for sequence_hits in json.loads(read_file(filepath)):
                if sequence_hits['sequence'] == sequence:
                    return sequence_hits
    return None
def download_xml(pdb_id, dest_dir, silent = True, filename = None, unzip = False):
    assert(os.path.exists(dest_dir))
    lower_case_gz_filename = os.path.join(dest_dir, '{0}.sifts.xml.gz'.format(pdb_id.lower()))
    upper_case_gz_filename = os.path.join(dest_dir, '{0}.sifts.xml.gz'.format(pdb_id.upper()))
    lower_case_filename = os.path.join(dest_dir, '{0}.sifts.xml'.format(pdb_id.lower()))
    upper_case_filename = os.path.join(dest_dir, '{0}.sifts.xml'.format(pdb_id.upper()))
    if filename:
        requested_filename = os.path.join(dest_dir, filename)
        if os.path.exists(requested_filename):
            return read_file(requested_filename)

    if unzip:
        if os.path.exists(lower_case_filename):
            contents = read_file(lower_case_filename)
        elif os.path.exists(upper_case_filename):
            contents = read_file(upper_case_filename)
        elif os.path.exists(lower_case_gz_filename):
            contents = read_gzip_in_memory(read_file(lower_case_gz_filename))
        elif os.path.exists(upper_case_gz_filename):
            contents = read_gzip_in_memory(read_file(upper_case_gz_filename))
        else:
            contents = retrieve_xml(pdb_id, silent = silent, unzip = True)
            write_file(os.path.join(dest_dir, filename or '{0}.sifts.xml'.format(pdb_id)), contents)
        return contents
    else:
        if os.path.exists(lower_case_gz_filename):
            contents = read_file(lower_case_gz_filename) # Note: read_file already unzips .gz files
        elif os.path.exists(upper_case_gz_filename):
            contents = read_file(upper_case_gz_filename) # Note: read_file already unzips .gz files
        else:
            gzip_contents = retrieve_xml(pdb_id, silent = silent, unzip = False)
            write_file(os.path.join(dest_dir, filename or '{0}.sifts.xml.gz'.format(pdb_id)), gzip_contents)
            contents = read_gzip_in_memory(gzip_contents)
        return contents
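# Usage sketch: with unzip = True the decompressed SIFTS XML is returned (and
# cached as .sifts.xml); with unzip = False the gzipped copy is cached instead,
# although the decompressed contents are still returned. The paths are illustrative.
def _example_download_xml(dest_dir = '/tmp/sifts_cache'):
    import os
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    return download_xml('1cbw', dest_dir, unzip = True)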
def correlation_coefficient_gplot(inputfname, output_filename, filetype, experiment_field = "Experimental", title = ''):
    '''File suffix: pearsons_r_gplot
       Description: Pearson's r
       Filename: ggplot_pearsons.R
       Priority: 1
    '''
    script_path = os.path.abspath(os.path.dirname(inspect.getsourcefile(sys.modules[__name__])))
    # The R script is a template: '%' interpolation fills in the function arguments
    # (inputfname, output_filename, etc.) wherever the template references them
    r_script_filename = read_file(os.path.join(script_path, "ggplot_pearsons.R")) % vars()
    return RInterface._runRScript(r_script_filename)
def sendgmail2(self, subject, recipients, plaintext, htmltext=None, cc=None, debug=False, useMIMEMultipart=True, gmail_account = '*****@*****.**', pw_filepath = None):
    '''For this function to work, the password for the gmail user must be colocated with this file or passed in.'''
    smtpserver = smtplib.SMTP("smtp.gmail.com", 587)
    smtpserver.ehlo()
    smtpserver.starttls()
    smtpserver.ehlo()
    if pw_filepath:
        smtpserver.login(gmail_account, read_file(pw_filepath))
    else:
        smtpserver.login(gmail_account, read_file('pw'))
    for recipient in recipients:
        header = 'To:' + recipient + '\n' + 'From: ' + gmail_account + '\n' + 'Subject:' + subject + '\n'
        if htmltext:
            msg = header + '\n ' + htmltext + '\n\n'
        else:
            msg = header + '\n ' + plaintext + '\n\n'
        smtpserver.sendmail(gmail_account, recipient, msg)
    smtpserver.close()
def generate_schema_diagram(self, output_filepath = None, show_fk_only = False):
    if self.num_tables == 0:
        raise EmptyDiagramException('No tables in schema.')
    tempfiles = self._generate_schema_diagram(show_fk_only)
    self.schema_diagram = read_file(tempfiles[1])
    for fname in tempfiles:
        if os.path.exists(fname):
            os.remove(fname)
    if output_filepath:
        write_file(output_filepath, self.schema_diagram)
def load():
    global sys_settings
    if not sys_settings:
        settings_file = os.path.splitext(os.path.abspath(__file__))[0] + '.json'
        if not os.path.exists(settings_file):
            create_template(settings_file)
            colortext.warning('\nThe settings file {0} needs to be configured. Exiting.\n'.format(settings_file))
            sys.exit(1)
        d = json.loads(read_file(settings_file))
        sys_settings = NestedBunch(d)
    return sys_settings
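# Usage sketch: load() reads a JSON file named after this module (e.g.
# settings.json alongside settings.py) and wraps it in a NestedBunch so that
# nested keys can be read as attributes. The key names below are illustrative,
# not the real template's schema.
#
#   settings = load()
#   # for a file containing {"database": {"host": "localhost"}}:
#   host = settings.database.host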
def retrieve_data_from_rcsb(cls, ligand_code, pdb_id = None, silent = True, cached_dir = None):
    '''Retrieve a file from the RCSB.'''
    if not silent:
        colortext.printf("Retrieving data from RCSB")

    if cached_dir:
        assert(os.path.exists(cached_dir))

    ligand_info_path, ligand_info, pdb_ligand_info, pdb_ligand_info_path = None, None, None, None
    if cached_dir:
        ligand_info_path = os.path.join(cached_dir, '{0}.cif'.format(ligand_code))
        if os.path.exists(ligand_info_path):
            ligand_info = read_file(ligand_info_path)
    if not ligand_info:
        ligand_info = retrieve_ligand_cif(ligand_code)
        if cached_dir:
            write_file(ligand_info_path, ligand_info)

    # Parse .cif
    l = cls(ligand_code)
    l.parse_cif(ligand_info)
    l.pdb_id = pdb_id or l.pdb_id
    has_pdb_id = l.pdb_id and (len(l.pdb_id) == 4) and (l.pdb_id != '?') # the last case is unnecessary and will be short-cut but I included it to show possible values

    # Parse PDB XML
    if has_pdb_id:
        if cached_dir:
            pdb_ligand_info_path = os.path.join(cached_dir, '{0}.pdb.ligandinfo'.format(l.pdb_id.lower()))
            if os.path.exists(pdb_ligand_info_path):
                pdb_ligand_info = read_file(pdb_ligand_info_path)
            else:
                pdb_ligand_info = retrieve_pdb_ligand_info(l.pdb_id)
                write_file(pdb_ligand_info_path, pdb_ligand_info)
        else:
            pdb_ligand_info = retrieve_pdb_ligand_info(l.pdb_id)
    if pdb_ligand_info:
        l.parse_pdb_ligand_info(pdb_ligand_info)

    # Retrieve the diagram image
    l.get_diagram()

    return l
def save_sequence_blast(self, sequence, cut_off, matrix, sequence_identity_cut_off, data):
    assert(data['sequence'] == sequence)
    sequence_data = [data] # put the new hit at the start of the file
    if self.cache_dir:
        filepath = self._get_blast_sequence_filepath(sequence, cut_off, matrix, sequence_identity_cut_off)
        if os.path.exists(filepath):
            for sequence_hits in json.loads(read_file(filepath)):
                if sequence_hits['sequence'] != sequence:
                    sequence_data.append(sequence_hits)
        write_file(filepath, json.dumps(sequence_data))
        return True
    return False
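# Usage sketch pairing the save/load halves of this JSON-file BLAST cache.
# The object, cut-off, matrix name and result payload are all illustrative;
# the cache key is (sequence, cut_off, matrix, sequence_identity_cut_off).
def _example_blast_cache(blast_api, sequence):
    hits = blast_api.load_sequence_blast(sequence, 1e-4, 'BLOSUM62', 70.0)
    if hits is None:
        hits = dict(sequence = sequence, hits = []) # placeholder result payload
        blast_api.save_sequence_blast(sequence, 1e-4, 'BLOSUM62', 70.0, hits)
    return hits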
def dump_pdbs():
    pdbs = json.loads(read_file('../rawdata/pdbs.json'))
    # Sanity check
    for pdb_id, v in sorted(pdbs.iteritems()):
        records = ddGdb.execute_select('SELECT ID FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))
        assert(len(records) == 1)
    # Dump
    for pdb_id, v in sorted(pdbs.iteritems()):
        content = ddGdb.execute_select('SELECT Content FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))[0]['Content']
        write_file('../rawdata/%s.pdb' % pdb_id, content)
def __init__(self, UniProtAC, XML = None, cache_dir = None, silent = True):
    if cache_dir and not(os.path.exists(cache_dir)):
        raise Exception("The cache directory %s does not exist." % cache_dir)

    self.UniProtAC = UniProtAC
    self.silent = silent

    # Get XML
    if XML is None:
        protein_xml = None
        cached_filepath = None
        if cache_dir:
            cached_filepath = os.path.join(cache_dir, '%s.xml' % UniProtAC)
        if cached_filepath and os.path.exists(cached_filepath):
            protein_xml = read_file(cached_filepath)
        else:
            if not silent:
                colortext.write("Retrieving %s\n" % UniProtAC, "cyan")
            url = 'http://www.uniprot.org/uniprot/%s.xml' % UniProtAC
            protein_xml = http_get(url)
            if not(protein_xml.strip()):
                raise EmptyUniProtACXMLException('The file %s is empty.' % UniProtAC)
            if cached_filepath:
                write_file(cached_filepath, protein_xml)
        self.XML = protein_xml
    else:
        # protein_xml and cached_filepath must also be defined in this branch
        # for the DOM parsing below
        protein_xml = XML
        cached_filepath = None
        self.XML = XML

    self.recommended_name = None
    self.submitted_names = []
    self.alternative_names = []

    # Get DOM
    try:
        self._dom = parseString(protein_xml)
    except:
        if cached_filepath:
            raise Exception("The UniProtAC XML for '%s' was invalid. The cached file is located at %s. Check this file - if it is not valid XML then delete the file and rerun the script." % (UniProtAC, cached_filepath))
        else:
            raise Exception("The UniProtAC XML for '%s' was invalid." % UniProtAC)
    main_tags = self._dom.getElementsByTagName("uniprot")
    assert(len(main_tags) == 1)
    entry_tags = main_tags[0].getElementsByTagName("entry")
    assert(len(entry_tags) == 1)
    self.entry_tag = entry_tags[0]

    self._parse_evidence_tag()
    self._parse_sequence_tag()
    self._parse_protein_tag()
    self._parse_organism_tag()
    self._parse_subsections()
    self._parse_PDB_mapping()
def get_schema(self, host, username, passwd, database_name):
    try:
        outfile, outfilename = open_temp_file('/tmp', "w")
        p = subprocess.Popen(shlex.split("mysqldump -h %s -u %s -p%s --skip-add-drop-table --no-data %s" % (host, username, passwd, database_name)), stdout=outfile)
        p.wait()
        outfile.close()
        contents = read_file(outfilename)
        os.remove(outfilename)
        return contents
    except Exception as e:
        if os.path.exists(outfilename):
            os.remove(outfilename)
        raise
def __init__(self, settings = {}, isInnoDB=True, numTries=32, host=None, db=None, user=None, passwd=None, port=3306, unix_socket="/var/lib/mysql/mysql.sock", passwdfile=None, use_utf=False):
    self.db = db
    self.host = host
    self.original_schema = []
    if not(os.path.exists(unix_socket)):
        unix_socket = '/var/run/mysqld/mysqld.sock' # Ubuntu hack
    if not passwd and passwdfile:
        if os.path.exists(passwdfile):
            passwd = read_file(passwdfile).strip()
        else:
            passwd = getpass.getpass("Enter password to connect to MySQL database:")
    dbinterface = MySQLDatabaseInterface(settings, isInnoDB = isInnoDB, numTries = numTries, host = host, db = db, user = user, passwd = passwd, port = port, unix_socket = unix_socket, use_locking = False)

    # Get the DB schema, normalizing for sqlt-diagram
    db_schema = []
    self.num_tables = 0
    try:
        for t in sorted(dbinterface.TableNames):
            creation_string = dbinterface.execute_select('SHOW CREATE TABLE `%s`' % t)
            assert(len(creation_string) == 1)
            if creation_string[0].get('Create Table') is None: # e.g. for views
                continue
            self.num_tables += 1
            creation_string = '%s;' % creation_string[0]['Create Table'].strip()
            self.original_schema.append(creation_string)

            # Fix input for sqlt-diagram (it is fussy)
            creation_string = creation_string.replace("default ''", "")
            creation_string = creation_string.replace("DEFAULT ''", "")
            creation_string = creation_string.replace("DEFERRABLE INITIALLY DEFERRED", "") # sqlt-diagram doesn't like this syntax for MySQL
            creation_string = creation_string.replace("AUTOINCREMENT", "") # sqlt-diagram doesn't like this syntax for MySQL
            creation_string = creation_string.replace("auto_increment", "") # sqlt-diagram doesn't like this syntax for MySQL
            # re.sub's fourth positional argument is count, not flags, so re.DOTALL
            # must be passed by keyword
            creation_string = re.sub("COMMENT.*'.*'", "", creation_string, flags = re.DOTALL) # sqlt-diagram doesn't like this syntax for MySQL
            creation_string = re.sub("CONSTRAINT.*?CHECK.*?,", "", creation_string, flags = re.DOTALL) # sqlt-diagram doesn't like this syntax for MySQL
            creation_string = re.sub("CONSTRAINT.*?CHECK.*?[)][)]", ")", creation_string, flags = re.DOTALL) # sqlt-diagram doesn't like this syntax for MySQL
            creation_string = re.sub(" AUTO_INCREMENT=\d+", "", creation_string, flags = re.DOTALL)
            creation_string = creation_string.replace("''", "")
            creation_string = creation_string.replace('tg_', 'auth_')
            db_schema.append(creation_string)
    except:
        raise
    db_schema = '\n\n'.join(db_schema)
    self.db_schema = db_schema
    self.mysqldump_schema = self.get_schema(host, user, passwd, db)
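# A short self-contained illustration of the re.sub pitfall noted above: the
# fourth positional argument of re.sub is count, not flags, so passing
# re.DOTALL (== 16) positionally silently limits the number of substitutions
# instead of letting '.' match newlines.
def _example_re_sub_flags():
    import re
    assert re.sub('a.', '', 'a\nb', re.DOTALL) == 'a\nb'      # no match: '.' does not cross the newline
    assert re.sub('a.', '', 'a\nb', flags = re.DOTALL) == 'b' # intended behavior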
def dump_data(prediction_set, outfile):
    ddG_connection = db_api.ddG()
    ddGdb = ddgdbapi.ddGDatabase()
    userdata_set = 'AllValidPGPK'
    cached_pdb_details = json.loads(read_file('cached_pdb_details.json'))
    analysis_breakdown = ddG_connection.get_predictionset_data(prediction_set, userdata_set, cached_pdb_details = cached_pdb_details)
    test_data = dict(
        amino_acids = analysis_breakdown.amino_acids,
        pdb_details = analysis_breakdown.pdb_details,
        predictions = analysis_breakdown.predictions,
        #single_mutation_GP_predictions = analysis_breakdown.single_mutation_GP_predictions,
        #single_mutation_no_GP_predictions = analysis_breakdown.single_mutation_no_GP_predictions,
        #multiple_mutation_predictions = analysis_breakdown.multiple_mutation_predictions,
        analysis_datasets = analysis_breakdown.analysis_datasets,
    )
    write_file(outfile, json.dumps(test_data))
def retrieve(cls, pdb_id, cache_dir = None):
    '''Creates a PDB object by using a cached copy of the file if it exists or by retrieving the file from the RCSB.'''

    # Check to see whether we have a cached copy
    pdb_id = pdb_id.upper()
    if cache_dir:
        filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
        if os.path.exists(filename):
            return cls(read_file(filename))

    # Get a copy from the RCSB
    contents = rcsb.retrieve_pdb(pdb_id)

    # Create a cached copy if appropriate
    if cache_dir:
        write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), contents)

    # Return the object
    return cls(contents)
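# Usage sketch, assuming this classmethod belongs to the PDB class used
# elsewhere in this codebase; the caching behavior mirrors the FASTA helper above.
def _example_pdb_retrieve(cache_dir = '/tmp/pdb_cache'):
    import os
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    return PDB.retrieve('1CBW', cache_dir = cache_dir)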
def run_r_script(r_script_filename, cwd = '.'):
    '''This function was adapted from the covariation benchmark.'''
    p = subprocess.Popen(["R", "CMD", "BATCH", r_script_filename], cwd = cwd)
    while True:
        time.sleep(0.3)
        errcode = p.poll()
        if errcode is not None:
            break
    rout = "{0}out".format(r_script_filename) # e.g. script.R -> script.Rout
    rout_contents = None
    if os.path.exists(rout):
        rout_contents = read_file(rout)
        os.remove(rout)
    rdata_file = os.path.join(os.path.split(r_script_filename)[0], '.RData')
    if os.path.exists(rdata_file):
        os.remove(rdata_file)
    if errcode != 0:
        print(rout_contents)
        raise Exception("The R script failed with error code %d." % errcode)
    return rout_contents
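# Usage sketch: unlike _runRScript above, run_r_script takes a path to an R
# script on disk. R CMD BATCH writes the .Rout log next to the script in the
# working directory, so the example script is written there. R must be on the PATH.
def _example_run_r_script(script_path = 'example.R'):
    write_file(script_path, 'print(summary(c(1, 2, 3)))\n')
    return run_r_script(script_path)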
def import_structures():
    setup()
    ppi_api = get_ppi_api()
    complex_definitions = json.loads(read_file('tinas_complexes.json'))
    for tina_pdb_id, complex_structure_definition_pair in sorted(complex_definitions.iteritems()):
        #if tina_pdb_id != '1WA52':
        #    continue
        colortext.warning(tina_pdb_id)
        del complex_structure_definition_pair['Structure']['file_path']
        complex_structure_definition_pair['Structure']['pdb_object'] = tina_pdb_objects[tina_pdb_id]
        pdb_set = ppi_api.add_complex_structure_pair(complex_structure_definition_pair, keywords = ['GSP1'], force = True, trust_database_content = False, allow_missing_params_files = False, debug = False)
        if pdb_set['success'] == False:
            print(pdb_set['error'])
            if 'possible_matches' in pdb_set:
                for d in pdb_set['possible_matches']:
                    colortext.warning(d['ID'])
                    print('{0}, {1}, {2}'.format(d['LName'].encode('utf-8').strip(), d['LShortName'].encode('utf-8').strip(), d['LHTMLName'].encode('utf-8').strip()))
                    print('{0}, {1}, {2}'.format(d['RName'].encode('utf-8').strip(), d['RShortName'].encode('utf-8').strip(), d['RHTMLName'].encode('utf-8').strip()))
    create_project_pdb_records()
def _get_XML(self):
    uparc_xml = None
    cached_filepath = None
    if self.cache_dir:
        cached_filepath = os.path.join(self.cache_dir, '%s.xml' % self.UniParcID)
    if cached_filepath and os.path.exists(cached_filepath):
        uparc_xml = read_file(cached_filepath)
    else:
        if not self.silent:
            colortext.write("Retrieving %s\n" % self.UniParcID, "cyan")
        url = 'http://www.uniprot.org/uniparc/%s.xml' % self.UniParcID
        uparc_xml = http_get(url)
        if cached_filepath:
            write_file(cached_filepath, uparc_xml)

    self.XML = uparc_xml

    # Get DOM
    self._dom = parseString(uparc_xml)
    main_tags = self._dom.getElementsByTagName("uniparc")
    assert(len(main_tags) == 1)
    entry_tags = main_tags[0].getElementsByTagName("entry")
    assert(len(entry_tags) == 1)
    self.entry_tag = entry_tags[0]
def test_abacus_graph():
    '''This function can be deleted. It was added to test the abacus graph with different numbers of datapoints.'''
    import os
    import json
    if not(os.path.exists('results_cache.txt')):
        results = ddG_connection.get_flattened_prediction_results('FPP biosensor: protocol 16')
        for r in results:
            r['TimeTaken'] = r['TimeTaken'].total_seconds() # timedelta objects are not JSON serializable
        write_file('results_cache.txt', json.dumps(results), 'w')
    results = json.loads(read_file('results_cache.txt'))
    try:
        ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_3.png', cached_results = results, num_datapoints = 3)
    except:
        pass
    for n in (5, 8, 10, 12, 20, 50, 127, 255):
        ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_{0}.png'.format(n), cached_results = results, num_datapoints = n)
def run(self):
    # Create input files
    self._create_temp_directory()
    self._create_input_files()
    self._create_script()
    write_file(self._filepath('script.pml'), self.script)

    # Run PyMOL
    #colortext.message(self.visualization_pymol + ' -c ' + self._filepath('script.pml'))
    po = tprocess.Popen(self.outdir, [self.visualization_pymol, '-c', self._filepath('script.pml')])
    #colortext.message(po.stdout)
    #colortext.warning(po.errorcode)
    #colortext.error(po.stderr)
    self.stdout = po.stdout
    self.stderr = po.stderr
    self.return_code = po.errorcode
    if self.return_code != 0:
        raise Exception('Error: %s' % str(self.stderr))

    pse_path = self._filepath('session.pse')
    if os.path.exists(pse_path):
        self.PSE = read_file(pse_path, binary = True)
def from_file(pymol_name, pdb_filename, residues_of_interest = []):
    return PDBContainer(pymol_name, read_file(pdb_filename), residues_of_interest)
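# Usage sketch: wraps an on-disk PDB file as a PDBContainer for the PyMOL
# session builder above. The name, path and residue list are illustrative.
#
#   container = from_file('scaffold', '/tmp/pdb_cache/1CBW.pdb', residues_of_interest = ['A 12'])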
# The following imports are assumed for the script below (they were not part
# of this excerpt): pprint and the PDB class.
import pprint
from klab.bio.pdb import PDB # assumed module path

from klab.bio.basics import ChainMutation
from klab.fs.fsio import read_file, write_temp_file, open_temp_file, write_file
from klab.bio.pfam import Pfam
from klab.bio.dssp import MonomerDSSP, ComplexDSSP, MissingAtomException
from klab.bio.ligand import Ligand, PDBLigand
from klab.bio.pdbtm import PDBTM
from klab.db.sqlalchemy_interface import get_single_record_from_query, get_or_create_in_transaction

from kddg.api.schema import test_schema_against_database_instance
from kddg.api.schema import PDBFile, PDBChain, PDBMolecule, PDBMoleculeChain, PDBResidue, LigandDescriptor, LigandIdentifier, LigandSynonym, PDBLigand
from kddg.api.schema import Ligand as DBLigand
#from kddg.api.schema import Publication, PublicationAuthor, PublicationIdentifier
from kddg.api.layers import *
from kddg.api.db import ddG, PartialDataException, SanityCheckException
import kddg.api.dbi as dbi

rosetta_scripts_path = '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease'
rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database'

p = PDB(read_file('/kortemmelab/data/kyleb/ddg_numbering_for_shane/24548-data/1CBW_FGHI.pdb'))
#p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path)
p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path, extra_command_flags = '-ignore_zero_occupancy false -ignore_unrecognized_res')
pprint.pprint(p.get_atom_sequence_to_rosetta_map())
pprint.pprint(p.rosetta_sequences)

from kddg.api.ppi import get_interface as get_ppi_interface
ppi_api = get_ppi_interface(read_file('../misc/ddgdb.pw'), rosetta_scripts_path = '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease', rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database')
content = ppi_api.DDG_db.execute_select('SELECT Content FROM PDBFile WHERE ID="1CBW"')[0]['Content']
print(content)
write_file('/tmp/ddginterface/1CBW_FGHI_db.pdb', content)
def prepare_structures(file_filter, output_directory, loop_definitions, require_filter=True, create_partial_structures=False, expected_min_loop_length=None, expected_max_loop_length=None, remove_hetatm=False):
    search_radius = 10.0
    if not (os.path.exists(output_directory)):
        os.mkdir(output_directory)

    # Iterate through the dataset cases
    for pdb_file in sorted(glob.glob(file_filter)):
        pdb_prefix = os.path.splitext(os.path.split(pdb_file)[1])[0].lower()

        # Read the benchmark loop definition
        if not loop_definitions.get(pdb_prefix):
            raise Exception('The loop definition for {0} is missing.'.format(pdb_prefix))
        loop_definition = loop_definitions[pdb_prefix]
        loops = [PDBSection(loop_definition['chainID'], loop_definition['StartResidueID'], loop_definition['EndResidueID'], Sequence=loop_definition['Sequence'])]

        # Only process files that passed the benchmark criteria
        if require_filter and not loop_definition['PassedFilter']:
            continue

        # Read in the PDB content, removing HETATM lines if requested
        pdb_content = read_file(pdb_file)
        if remove_hetatm:
            new_content = []
            for l in pdb_content.split('\n'):
                if not l.startswith('HETATM'):
                    new_content.append(l)
            pdb_content = '\n'.join(new_content)

        # Remove the loops and surrounding sidechain atoms from the structure
        b = Bonsai(pdb_content)
        bonsai, cutting, PSE_file, PSE_script, FASTA_file = b.prune_loop_for_kic(loops, search_radius, expected_min_loop_length=expected_min_loop_length, expected_max_loop_length=expected_max_loop_length, generate_pymol_session=True)

        # Create a PyMOL session file for visual inspection
        write_file(os.path.join(output_directory, '{0}.pse'.format(pdb_prefix)), PSE_file)

        # Create the new PDB file with the loop and surrounding sidechains removed
        write_file(os.path.join(output_directory, '{0}.pdb'.format(pdb_prefix)), bonsai)
        if create_partial_structures:
            write_file(os.path.join(output_directory, '{0}_missing_loop_and_surrounding_sidechains.pdb'.format(pdb_prefix)), bonsai)
            write_file(os.path.join(output_directory, '{0}_loop_and_surrounding_sidechains.pdb'.format(pdb_prefix)), cutting)

        # Create the FASTA file containing the loop sequence. This will be used along with the loop_definitions.json file
        # to add the residues back into the Rosetta structure
        write_file(os.path.join(output_directory, '{0}.fasta'.format(pdb_prefix)), FASTA_file)

        sys.stdout.write('.')
        sys.stdout.flush()
    print('')
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the prediction results e.g. get_kic_run_details
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results e.g. the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
    '''

    # Sanity check
    assert(top_x <= expectn)

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subangstrom = {}
    top_x_percent_subangstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangstrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:
        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert(os.path.exists(rcsb_reference_pdb))
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert(os.path.exists(rosetta_reference_pdb))
        assert(len(pdb_id) == 4)
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert(len(loop_sets['LoopSet']) == 1)

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB files
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            loop_prediction = loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe
        # It is more efficient to do this after truncation if truncating by score but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing) then arbitrary subsets can be chosen
        # in the loop above
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_loop_prediction_sets[pdb_id] = loop_prediction_set[:top_x]
        median_scoring_structures[pdb_id] = loop_prediction_set[int(expectn / 2)]

        # Determine the lowest-/best-scoring structure
        best_scoring_structures[pdb_id] = loop_prediction_set[0]
        best_score = best_scoring_structures[pdb_id].score
        worst_scoring_structures[pdb_id] = loop_prediction_set[-1]
        worst_score = worst_scoring_structures[pdb_id].score
        assert(top_x_loop_prediction_sets[pdb_id][0] == best_scoring_structures[pdb_id])

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_loop_prediction_sets[pdb_id])
        colortext.warning('Top1 structure')
        print(best_scoring_structures[pdb_id])
        colortext.warning('Median (by score) structure')
        print(median_scoring_structures[pdb_id])
        colortext.warning('Lowest-scoring structures')
        print(worst_scoring_structures[pdb_id])

        # Create values for TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangstrom_by_top_x[top_x_var] = percent_subangstrom_by_top_x.get(top_x_var, {})
            percent_subangstrom_by_top_x[top_x_var][pdb_id] = percent_subangstrom
        total_percent_subangstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subangstrom[pdb_id] = 100 * top_x_loop_prediction_sets[pdb_id].fraction_with_rmsd_lt(1.0)
        colortext.warning('Percentage of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subangstrom[pdb_id]))
        colortext.warning('Percentage of sub-angstrom cases in the Top{0} structures: {1}'.format(top_x, top_x_percent_subangstrom[pdb_id]))

        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        top_1_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_score = best_scoring_structures[pdb_id].score
        for s in top_x_loop_prediction_sets[pdb_id]:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert(top_x_score <= worst_score)
        assert(top_x_rmsd <= top_1_rmsd)

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subangstrom[pdb_id], top_x_percent_subangstrom[pdb_id], best_score, top_x_score, median_scoring_structures[pdb_id].score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a row of median percent subangstrom values for each TopX value
    for top_x_var, values_by_pdb in sorted(percent_subangstrom_by_top_x.iteritems()):
        assert(sorted(values_by_pdb.keys()) == sorted(pdb_ids))
        median_value = sorted(values_by_pdb.values())[len(pdb_ids) / 2]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{0}\t{1}'.format(top_x_var, median_value))

    write_file('{0}analysis.csv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}analysis.tsv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))
# Iterate through the dataset cases
colortext.message('Adding loop residues back to the pruned structures in {0}.'.format(pruned_structure_directory))
file_filter = os.path.join(pruned_structure_directory, '*.pdb')
for pdb_file in sorted(glob.glob(file_filter)):
    pdb_prefix = os.path.splitext(os.path.split(pdb_file)[1])[0].lower()
    file_prefix = os.path.splitext(pdb_file)[0]
    fasta_file = file_prefix + '.fasta'
    loop_file = file_prefix + '.loop.json'
    assert(os.path.exists(fasta_file))
    assert(os.path.exists(loop_file))

    # Convert the FASTA headers back into PDB residue IDs
    fasta_contents = read_file(fasta_file)
    headers = [l for l in fasta_contents.split('\n') if l.startswith('>')]
    assert(len(headers) == 1)
    header = headers[0]
    pdb_residue_ids = [PDB.ChainResidueID2String(l[0], l[1:]) for l in header[header.find('Residues ') + 9:].split(';')]

    # Add the missing backbone atoms back into the PDB file
    spackler = Spackler.from_filepath(pdb_file)
    new_pdb_content = spackler.add_backbone_atoms_linearly_from_loop_filepaths(loop_file, fasta_file, pdb_residue_ids)
    # The excerpt is truncated at this write_file call; the destination path is
    # assumed here (the new content presumably replaces the pruned structure)
    write_file(pdb_file, new_pdb_content)