def match(self, other):
    '''Print a report of which PDBs/chains in this collection have identical sequences to those in another collection.

    This is a noisy terminal-printing function at present since there is no
    need to make it a proper API function.

    ``self`` and ``other`` map PDB IDs to {chain: sequence} dicts — presumably
    FASTA containers (TODO: confirm against the class definition, which is not
    visible here). ``self.unique_sequences`` maps a sequence to a display color.
    '''
    colortext.message("FASTA Match")
    for frompdbID, fromchains in sorted(self.iteritems()):
        # matched_pdbs: target PDB ID -> set of our chains whose sequence matched some chain of that PDB.
        # matched_chains: our chain -> list of (target PDB ID, target chain) with an identical sequence.
        matched_pdbs = {}
        matched_chains = {}
        for fromchain, fromsequence in fromchains.iteritems():
            for topdbID, tochains in other.iteritems():
                for tochain, tosequence in tochains.iteritems():
                    if fromsequence == tosequence:
                        matched_pdbs[topdbID] = matched_pdbs.get(topdbID, set())
                        matched_pdbs[topdbID].add(fromchain)
                        matched_chains[fromchain] = matched_chains.get(fromchain, [])
                        matched_chains[fromchain].append((topdbID, tochain))
        foundmatches = []
        colortext.printf(" %s" % frompdbID, color="silver")
        for mpdbID, mchains in matched_pdbs.iteritems():
            # A "full" match requires every chain of the source PDB to have matched.
            if mchains == set(fromchains.keys()):
                foundmatches.append(mpdbID)
                colortext.printf(" PDB %s matched PDB %s on all chains" % (mpdbID, frompdbID), color="white")
        if foundmatches:
            # Show each chain's sequence and the chains it matched in the fully-matched PDBs.
            for fromchain, fromsequence in fromchains.iteritems():
                colortext.printf(" %s" % (fromchain), color = "silver")
                colortext.printf(" %s" % (fromsequence), color = self.unique_sequences[fromsequence])
                mstr = []
                for mchain in matched_chains[fromchain]:
                    if mchain[0] in foundmatches:
                        mstr.append("%s chain %s" % (mchain[0], mchain[1]))
                colortext.printf(" Matches: %s" % ", ".join(mstr))
        else:
            colortext.error(" No matches found.")
def runLizsSet(PredictionSet, ProtocolID):
    '''Queue predictions for every experiment in Liz Kellogg's data set.

    Deliberately guarded: the first statement raises so the function cannot be
    run by accident. Remove the raise to actually queue the jobs.
    '''
    raise colortext.Exception("Do you really want to run this?")
    colortext.printf("\nAdding Liz's data set to %s prediction set." % PredictionSet, "lightgreen")
    keep_hetatm_lines = False
    FilterTester.openDB()
    # Restrict the experiment set to records attributed to the LizKellogg source.
    source_filter = ExperimentFilter()
    source_filter.setSource(ExperimentFilter.LizKellogg)
    result_set = ExperimentResultSet(ddGdb)
    result_set.addFilter(source_filter)
    FilterTester.test(result_set)
    experiment_ids = sorted(list(result_set.getFilteredIDs()))
    colortext.message("\nThe number of unique experiments is %d.\n" % len(experiment_ids))
    ddg_connection = db.ddG()
    ticker = 0
    for experiment_id in experiment_ids:
        ddg_connection.addPrediction(experiment_id, PredictionSet, ProtocolID, keep_hetatm_lines, StoreOutput = True)
        # Emit one progress dot per ten queued predictions.
        ticker += 1
        if ticker >= 10:
            colortext.write(".")
            colortext.flush()
            ticker = 0
    print("")
def addLinsJobs(PredictionSet, ProtocolID):
    '''Queue predictions for Lin's mutations (DummySource experiments on PDB 3K0NB_lin).

    Deliberately guarded: the first statement raises so the function cannot be
    run by accident. Remove the raise to actually queue the jobs.
    '''
    raise colortext.Exception("Do you really want to run this?")
    colortext.printf("\nAdding Lin's mutations to %s prediction set." % PredictionSet, "lightgreen")
    keep_hetatm_lines = False
    FilterTester.openDB()
    # First narrow to the DummySource experiments...
    source_filter = ExperimentFilter()
    source_filter.setSource(ExperimentFilter.DummySource)
    result_set = ExperimentResultSet(ddGdb)
    result_set.addFilter(source_filter)
    # ...then intersect with the single structure of interest.
    structure_set = StructureResultSet(ddGdb, 'WHERE PDB_ID="3K0NB_lin"')
    result_set = ExperimentResultSet.fromIDs(ddGdb, result_set.getFilteredIDs()).filterBySet(structure_set)
    FilterTester.test(result_set)
    experiment_ids = sorted(list(result_set.getFilteredIDs()))
    colortext.message("\nThe number of unique experiments is %d.\n" % len(experiment_ids))
    ddg_connection = db.ddG()
    ticker = 0
    for experiment_id in experiment_ids:
        ddg_connection.addPrediction(experiment_id, PredictionSet, ProtocolID, keep_hetatm_lines, StoreOutput = True)
        # Emit one progress dot per ten queued predictions.
        ticker += 1
        if ticker >= 10:
            colortext.write(".")
            colortext.flush()
            ticker = 0
    print("")
def showAllEligibleProTherm(PredictionSet, ProtocolID, KeepHETATMLines):
    '''Count (and optionally queue) the ProTherm experiments eligible for prediction.

    Eligibility: single mutation, single chain, standard deviation of the
    experimental measurements <= 1.0, X-ray structure with resolution <= 2.1A.
    NOTE(review): the sys.exit(0) below makes the queuing loop unreachable —
    this function currently only *shows* the count; confirm before removing.
    '''
    #inserter = JobInserter()
    colortext.printf("\nAdding ProTherm mutations to %s prediction set." % PredictionSet, "lightgreen")
    #ddGdb = dbi.ddGDatabase()
    MAX_RESOLUTION = 2.1
    MAX_NUMRES_PROTHERM = 350
    MAX_STANDARD_DEVIATION = 1.0
    FilterTester.openDB()
    # Disabled alternative: apply all filters to one result set in a single pass.
    if False:
        t1 = time.time()
        er1 = ExperimentResultSet(ddGdb)
        er1.addFilter(ExperimentFilter.OnSource(ExperimentFilter.ProTherm))
        er1.addFilter(ExperimentFilter.NumberOfMutations(1, 1))
        er1.addFilter(ExperimentFilter.NumberOfChains(1, 1))
        er1.addFilter(ExperimentFilter.StandardDeviation(None, MAX_STANDARD_DEVIATION))
        er1.addFilter(StructureFilter.Resolution(None, MAX_RESOLUTION))
        er1.addFilter(StructureFilter.Techniques(StructureFilter.XRay))
        FilterTester.test(er1)
        t2 = time.time()
        print(t2 - t1)
    # This method usually takes around 65% of the time as the method above
    t1 = time.time()
    ef1 = ExperimentFilter()
    ef1.setSource(ExperimentFilter.ProTherm)
    er1 = ExperimentResultSet(ddGdb)
    er1.addFilter(ExperimentFilter.OnSource(ExperimentFilter.ProTherm))
    FilterTester.test(er1)
    # Configure one experiment filter and one structure filter, then apply both.
    ef1.setNumberOfMutations(1, 1)
    ef1.setNumberOfChains(1, 1)
    ef1.setStandardDeviation(None, MAX_STANDARD_DEVIATION)
    sf1 = StructureFilter()
    sf1.setResolution(None, MAX_RESOLUTION)
    sf1.setTechniques(StructureFilter.XRay)
    er1 = ExperimentResultSet(ddGdb)
    er1.addFilter(ef1)
    er1.addFilter(sf1)
    FilterTester.test(er1)
    t2 = time.time()
    print(t2 - t1)
    experimentIDs = sorted(list(er1.getFilteredIDs()))
    colortext.message("\nThe number of unique ProTherm experiments with:\n\t- one mutation;\n\t- structures solved by X-ray diffraction and with <= %d residues;\n\t- a maximum standard deviation in experimental results of <= %0.2f;\n\t- and a resolution of <= %0.2f Angstroms.\nis %d.\n" % (MAX_NUMRES_PROTHERM, MAX_STANDARD_DEVIATION, MAX_RESOLUTION, len(experimentIDs)))
    ddG_connection = db.ddG()
    count = 0
    # Intentional early exit: everything below is currently dead code.
    sys.exit(0)
    print("")
    for experimentID in experimentIDs:
        ddG_connection.addPrediction(experimentID, PredictionSet, ProtocolID, KeepHETATMLines, StoreOutput = True)
        count += 1
        if count >= 10:
            colortext.write(".")
            colortext.flush()
            count = 0
    print("")
def plot(self, table_name, RFunction, output_filename = None, filetype = "pdf"):
    '''Run an R plotting function over one analysis table and collect its output.

    :param table_name: key into self.analysis_tables; the table is exported to
        a temporary CSV file which is passed to RFunction.
    :param RFunction: callable(input_csv, output_file, filetype) returning raw
        R output, which is parsed for key/value return values.
    :param output_filename: if given, the plot is written there and used as the
        result; otherwise a temp file is created, read back, and wrapped in an
        AnalysisObject.
    :raises Exception: if there are no tables, the named table is missing, or
        it has only one data point (at least two are required).

    NOTE(review): gplot is computed but never returned in this block — confirm
    whether a trailing ``return gplot`` was lost.
    '''
    if (not self.analysis_tables) or (not table_name):
        raise Exception("There are no analysis tables to plot.")
    if not table_name in self.analysis_tables.keys():
        raise Exception("The analysis table '%s' does not exist." % table_name)
    R_return_values = {}
    gplot = None
    analysis_table = self.analysis_tables[table_name]
    if self.quiet_level >= 3:
        print(table_name)
        print(RFunction)
    if len(analysis_table.points) == 1:
        raise Exception("The analysis table %s set only has one data point. At least two points are required." % table_name)
    else:
        # Export the table to CSV; this temp file is always deleted on exit or error.
        inputfname = self.CreateCSVFile(table_name)
        if self.quiet_level >= 3:
            print(inputfname)
        try:
            if self.quiet_level >= 2:
                colortext.printf("Running %s." % RFunction)
                if output_filename:
                    colortext.printf("Saving graph as %s with filename %s." % (filetype, output_filename))
            output_fname = output_filename
            if not output_fname:
                output_fname = rosettahelper.writeTempFile(".", "")
            R_output = RFunction(inputfname, output_fname, filetype)
            R_return_values = RUtilities.parse_R_output(R_output)
            colortext.message(table_name)
            print(" %s" % str(RFunction))
            for k, v in sorted(R_return_values.iteritems()):
                print(" %s: %s" % (str(k), str(v)))
            if not output_filename:
                # No explicit destination: read the temp plot back into memory
                # and look up a human-readable description for this RFunction.
                contents = rosettahelper.readBinaryFile(output_fname)
                delete_file(output_fname)
                description = None
                for file_suffix, details in RFunctions.iteritems():
                    if details[1] == RFunction:
                        description = details[0]
                assert(description)
                gplot = AnalysisObject(table_name, description, filetype, contents)
            else:
                gplot = output_filename
        except Exception, e:
            import traceback
            colortext.error(traceback.format_exc())
            delete_file(inputfname)
            # NOTE(review): re-wrapping as Exception(e) loses the original type/traceback.
            raise Exception(e)
        delete_file(inputfname)
def retrieve_file_from_EBI(resource, silent = True):
    '''Retrieve a file from the EBI FTP server (ftp.ebi.ac.uk).

    Retries up to 10 times with a 3 second pause between attempts since the
    server can be flaky. Returns the resource contents on success; returns
    None if every attempt fails (preserved historical behavior — callers
    should check for None rather than expect an exception).

    :param resource: path of the resource on ftp.ebi.ac.uk.
    :param silent: when False, print a progress message.
    '''
    # Fix: the docstring previously claimed this retrieved from the RCSB
    # (copy-paste from retrieve_file_from_RCSB); it actually targets EBI.
    if not silent:
        colortext.printf("Retrieving %s from EBI" % os.path.split(resource)[1], color = "aqua")
    attempts = 10
    while attempts > 0:
        try:
            return get_insecure_resource("ftp.ebi.ac.uk", resource)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
            print('FAILED, RETRYING')
            attempts -= 1
            time.sleep(3)
def retrieve_data_from_rcsb(cls, ligand_code, pdb_id = None, silent = True, cached_dir = None):
    '''Build a ligand object from RCSB data, using an optional local cache.

    Fetches the ligand's .cif definition (and, when a 4-character PDB ID is
    known, the PDB's ligand info XML), parses both into a new instance of
    ``cls``, retrieves the ligand diagram, and returns the instance.

    :param ligand_code: RCSB ligand (chemical component) code.
    :param pdb_id: optional PDB ID overriding the one found in the .cif data.
    :param silent: when False, print a progress message.
    :param cached_dir: existing directory used to cache downloads; files found
        there are read instead of re-downloaded.
    '''
    if not silent:
        colortext.printf("Retrieving data from RCSB")
    if cached_dir:
        assert(os.path.exists(cached_dir))
    ligand_info_path, ligand_info, pdb_ligand_info, pdb_ligand_info_path = None, None, None, None
    # Try the cache first for the ligand .cif; fall back to a download and
    # populate the cache on a miss.
    if cached_dir:
        ligand_info_path = os.path.join(cached_dir, '{0}.cif'.format(ligand_code))
        if os.path.exists(ligand_info_path):
            ligand_info = read_file(ligand_info_path)
    if not ligand_info:
        ligand_info = retrieve_ligand_cif(ligand_code)
        if cached_dir:
            write_file(ligand_info_path, ligand_info)
    # Parse .cif
    l = cls(ligand_code)
    l.parse_cif(ligand_info)
    l.pdb_id = pdb_id or l.pdb_id
    has_pdb_id = l.pdb_id and (len(l.pdb_id) == 4) and (l.pdb_id != '?') # the last case is unnecessary and will be short-cut but I included it to show possible values
    # Parse PDB XML (only possible when we have a real 4-character PDB ID).
    if has_pdb_id:
        if cached_dir:
            pdb_ligand_info_path = os.path.join(cached_dir, '{0}.pdb.ligandinfo'.format(l.pdb_id.lower()))
            if os.path.exists(pdb_ligand_info_path):
                pdb_ligand_info = read_file(pdb_ligand_info_path)
            else:
                pdb_ligand_info = retrieve_pdb_ligand_info(l.pdb_id)
                write_file(pdb_ligand_info_path, pdb_ligand_info)
        else:
            pdb_ligand_info = retrieve_pdb_ligand_info(l.pdb_id)
    if pdb_ligand_info:
        l.parse_pdb_ligand_info(pdb_ligand_info)
    # Retrieve the diagram image
    l.get_diagram()
    return l
def test_pdbml_speed():
    '''Benchmark the old minidom-based PDBML parser against the new SAX parser.

    For each test case, times both parsers and then checks that they produce
    identical atom-to-SEQRES sequence maps, printing "passed" or "failed".
    '''
    test_cases = [
        '1WSY',
        '1YGV',
        '487D',
        '1HIO',
        '1H38',
        '3ZKB',
    ]
    for test_case in test_cases:
        print("\n")
        colortext.message("Creating PDBML object for %s" % test_case)
        #PDBML.retrieve(test_case, cache_dir = cache_dir)
        print("")
        colortext.printf("Using the old minidom class", color = 'cyan')
        t1 = time.clock()
        p_minidom = PDBML_slow.retrieve(test_case, cache_dir = cache_dir)
        t2 = time.clock()
        colortext.message("Done in %0.2fs!" % (t2 - t1))
        print("")
        colortext.printf("Using the new sax class", color = 'cyan')
        t1 = time.clock()
        p_sax = PDBML.retrieve(test_case, cache_dir = cache_dir)
        t2 = time.clock()
        colortext.message("Done in %0.2fs!" % (t2 - t1))
        colortext.write("\nEquality test: ", color = 'cyan')
        try:
            assert(p_minidom.atom_to_seqres_sequence_maps.keys() == p_sax.atom_to_seqres_sequence_maps.keys())
            for c, s_1 in p_minidom.atom_to_seqres_sequence_maps.iteritems():
                s_2 = p_sax.atom_to_seqres_sequence_maps[c]
                assert(str(s_1) == str(s_2))
            colortext.message("passed\n")
        except Exception:
            # Fix: was a bare except, which would report Ctrl-C/SystemExit as a
            # test failure; narrowed so only real comparison errors count.
            colortext.error("failed\n")
def _print_lines(helplines):
    '''Print each help entry in its associated color via colortext.printf.

    Each entry is indexable as (text, color, ...); only the first two
    positions are used.
    '''
    for entry in helplines:
        colortext.printf(entry[0], color=entry[1])
def test_ddg_pdb_ids():
    '''Exercise ResidueRelatrix residue mapping over the DDG database's PDB files.

    Runs the residue relation machinery on each PDB ID (subject to the
    ``test_these`` allow-list and the ``fix_later`` skip-list), collecting and
    reporting any SpecificException failures at the end.

    Fix in this revision: the ``except SpecificException`` handler referenced
    ``e`` without binding it (``except SpecificException:``), so any failure
    raised a NameError instead of being recorded; the exception is now bound
    with ``as e``.
    '''
    # Test set - 845 PDB IDs. A small number required manual intervention but most are parsed and mapped automatically. 5 needed to use the SIFTS mappings.
    ddG_pdb_ids = ['107L','108L','109L','110L','111L','112L','113L','114L','115L','118L','119L','120L','122L','123L','125L','126L','127L','128L','129L','130L','131L','137L','149L','150L','151L','160L','161L','162L','163L','164L','165L','168L','169L','171L','172L','173L','190L','191L','192L','195L','196L','1A23','1A2I','1A2P','1A3Y','1A43','1A4Y','1A53','1A5E','1A70','1A7A','1A7H','1A7V','1AAL','1AAR','1AAZ','1ABE','1ACB','1ADO','1ADW','1AG2','1AG4','1AG6','1AIE','1AIN','1AJ3','1AJQ','1AKK','1AKM','1AM7','1AMQ','1ANF','1ANK','1ANT','1AO6','1AON','1AOZ','1APC','1APL','1APS','1AQH','1AR1','1ARR','1ATJ','1ATN','1AU1','1AUT','1AV1','1AVR','1AX1','1AXB','1AYE','1AYF','1AZP','1B0O','1B26','1B5M','1B8J','1BAH','1BAN','1BAO','1BCX','1BD8','1BET','1BF4','1BFM','1BGD','1BGL','1BJP','1BKE','1BKS','1BLC','1BMC','1BNI','1BNL','1BNS','1BNZ','1BOY','1BP2','1BPI','1BPL','1BPR','1BPT','1BRF','1BRG','1BRH','1BRI','1BRJ','1BRK','1BSA','1BSB','1BSC','1BSD','1BSE','1BSR','1BTA','1BTI','1BTM','1BUJ','1BVC','1BVU','1BZO','1C0L','1C17','1C2R','1C52','1C53','1C5G','1C6P','1C9O','1CAH','1CBW','1CDC','1CEA','1CEY','1CHK','1CHO','1CHP','1CLW','1CM7','1CMB','1CMS','1COA','1COK','1COL','1CPM','1CSP','1CTS','1CUN','1CUS','1CVW','1CX1','1CX8','1CYC','1CYO','1D0X','1D1G','1DAQ','1DDN','1DE3','1DEC','1DEQ','1DFO','1DFX','1DHN','1DIL','1DIV','1DJU','1DKG','1DKT','1DLC','1DM0','1DO9','1DPM','1DTD','1DTO','1DVC','1DVF','1DVV','1DXX','1DYA','1DYB','1DYC','1DYD','1DYE','1DYF','1DYG','1DYJ','1E21','1E6K','1E6L','1E6M','1E6N','1EDH','1EFC','1EG1','1EHK','1EKG','1EL1','1ELV','1EMV','1EQ1','1ERU','1ESF','1ETE','1EVQ','1EW4','1EXG','1EZA','1F88','1FAJ','1FAN','1FC1','1FEP','1FGA','1FKB','1FKJ','1FLV','1FMK','1FMM','1FNF','1FR2','1FRD','1FTG','1FTT','1FXA','1G6N','1G6V','1G6W','1GA0','1GAD','1GAL','1GAY','1GAZ','1GB0','1GB2','1GB3','1GB7',
                   '1GBX','1GD1','1GF8','1GF9','1GFA','1GFE','1GFG','1GFH','1GFJ','1GFK','1GFL','1GFR','1GFT','1GFU','1GFV','1GKG','1GLH','1GLM','1GOB','1GPC','1GQ2','1GRL','1GRX','1GSD','1GTM','1GTX','1GUY','1GXE','1H09','1H0C','1H2I','1H7M','1H8V','1HA4','1HCD','1HEM','1HEN','1HEO','1HEP','1HEQ','1HER','1HEV','1HFY','1HFZ','1HGH','1HGU','1HIB','1HIC','1HIO','1HIX','1HK0','1HME','1HML','1HNG','1HNL','1HOR','1HQK','1HTI','1HUE','1HXN','1HYN','1HYW','1HZ6','1I4N','1I5T','1IAR','1IC2','1IDS','1IFB','1IFC','1IGS','1IGV','1IHB','1IMQ','1INQ','1INU','1IO2','1IOB','1IOF','1IOJ','1IR3','1IRL','1IRO','1ISK','1IX0','1J0X','1J4S','1J7N','1JAE','1JBK','1JHN','1JIW','1JJI','1JKB','1JNK','1JTD','1JTG','1JTK','1K23','1K3B','1K40','1K9Q','1KA6','1KBP','1KDN','1KDU','1KDX','1KEV','1KFD','1KFW','1KJ1','1KKJ','1KTQ','1KUM','1KVA','1KVB','1KVC','1L00','1L02','1L03','1L04','1L05','1L06','1L07','1L08','1L09','1L10','1L11','1L12','1L13','1L14','1L15','1L16','1L17','1L18','1L19','1L20','1L21','1L22','1L23','1L24','1L33','1L34','1L36','1L37','1L38','1L40','1L41','1L42','1L43','1L44','1L45','1L46','1L47','1L48','1L49','1L50','1L51','1L52','1L53','1L54','1L55','1L56','1L57','1L59','1L60','1L61','1L62','1L63','1L65','1L66','1L67','1L68','1L69','1L70','1L71','1L72','1L73','1L74','1L75','1L76','1L77','1L85','1L86','1L87','1L88','1L89','1L90','1L91','1L92','1L93','1L94','1L95','1L96','1L97','1L98','1L99','1LAV','1LAW','1LBI','1LFO','1LHH','1LHI','1LHJ','1LHK','1LHL','1LHM','1LHP','1LLI','1LMB','1LOZ','1LPS','1LRA','1LRE','1LRP','1LS4','1LSN','1LUC','1LVE','1LYE','1LYF','1LYG','1LYH','1LYI','1LYJ','1LZ1','1M7T','1MAX','1MBD','1MBG','1MCP','1MGR','1MJC','1MLD','1MSI','1MUL','1MX2','1MX4','1MX6','1MYK','1MYL','1N02','1N0J','1NAG','1NM1','1NZI','1OA2','1OA3','1OCC','1OH0','1OIA','1OKI','1OLR','1OMU','1ONC','1OPD','1ORC','1OSA','1OSI','1OTR','1OUA','1OUB','1OUC','1OUD','1OUE','1OUF','1OUG','1OUH','1OUI','1OUJ','1OVA','1P2M','1P2N','1P2O','1P2P','1P2Q','1P3J','1PAH','1PBA','1PCA','1PDO','1PGA','1PHP','1PII','1PIN','1PK2',
                   '1PMC','1POH','1PPI','1PPN','1PPP','1PQN','1PRE','1PRR','1Q5Y','1QEZ','1QGV','1QHE','1QJP','1QK1','1QLP','1QLX','1QM4','1QND','1QQR','1QQV','1QT6','1QT7','1QU0','1QU7','1QUW','1R2R','1RBN','1RBP','1RBR','1RBT','1RBU','1RBV','1RCB','1RDA','1RDB','1RDC','1REX','1RGC','1RGG','1RH1','1RHD','1RHG','1RIL','1RIS','1RN1','1ROP','1RRO','1RTB','1RTP','1RX4','1S0W','1SAK','1SAP','1SCE','1SEE','1SFP','1SHF','1SHG','1SHK','1SMD','1SPD','1SPH','1SSO','1STF','1STN','1SUP','1SYC','1SYD','1SYE','1SYG','1T3A','1T7C','1T8L','1T8M','1T8N','1T8O','1TBR','1TCA','1TCY','1TEN','1TFE','1TGN','1THQ','1TI5','1TIN','1TIT','1TLA','1TML','1TMY','1TOF','1TPE','1TPK','1TTG','1TUP','1TUR','1U5P','1UBQ','1UCU','1UOX','1URK','1UW3','1UWO','1UZC','1V6S','1VAR','1VFB','1VIE','1VQA','1VQB','1VQC','1VQD','1VQE','1VQF','1VQG','1VQH','1VQI','1VQJ','1W3D','1W4E','1W4H','1W99','1WIT','1WLG','1WPW','1WQ5','1WQM','1WQN','1WQO','1WQP','1WQQ','1WQR','1WRP','1WSY','1XAS','1XY1','1Y4Y','1Y51','1YAL','1YAM','1YAN','1YAO','1YAP','1YAQ','1YCC','1YEA','1YGV','1YHB','1YMB','1YNR','1YPA','1YPB','1YPC','1YPI','1Z1I','1ZNJ','200L','206L','216L','217L','219L','221L','224L','227L','230L','232L','233L','235L','236L','237L','238L','239L','240L','241L','242L','243L','244L','246L','247L','253L','254L','255L','2A01','2A36','2ABD','2AC0','2ACE','2ACY','2ADA','2AFG','2AIT','2AKY','2ASI','2ATC','2B4Z','2BBM','2BQA','2BQB','2BQC','2BQD','2BQE','2BQF','2BQG','2BQH','2BQI','2BQJ','2BQK','2BQM','2BQN','2BQO','2BRD','2CBR','2CHF','2CI2','2CPP','2CRK','2CRO','2DQJ','2DRI','2EQL','2FAL','2FHA','2FX5','2G3P','2GA5','2GSR','2GZI','2HEA','2HEB','2HEC','2HED','2HEE','2HEF','2HIP','2HMB','2HPR','2IFB','2IMM','2L3Y','2L78','2LZM','2MBP','2MLT','2NUL','2OCJ','2PDD','2PEC','2PEL','2PRD','2Q98','2RBI','2RN2','2RN4','2SNM','2SOD','2TMA','2TRT','2TRX','2TS1','2WSY','2ZAJ','2ZTA','3BCI','3BCK','3BD2','3BLS','3CHY','3D2A','3ECA','3FIS','3HHR','3K0NA_lin','3K0NB_lin','3K0On_lin','3MBP','3PGK','3PRO','3PSG','3SSI','3TIM','3VUB','451C','487D','4BLM',
                   '4CPA','4GCR','4LYZ','4SGB','4TLN','4TMS','5AZU','5CPV','5CRO','5MDH','5PEP','6TAA','7AHL','7PTI','8PTI','8TIM','9INS','9PCY',]
    print(len(ddG_pdb_ids))
    # IDs to be skipped for now; the commented queries locate their dependent records.
    fix_later = set([
        # SELECT * FROM `Experiment` WHERE `PDBFileID` IN ('1OLR')
        # SELECT * FROM `DataSetDDG` WHERE `PDBFileID` IN ('1OLR')
        # SELECT * FROM `UserDataSetExperiment` WHERE `PDBFileID` IN ('1OLR')
        # SELECT * FROM `UserAnalysisSet` WHERE `PDB_ID` IN ('1OLR')
    ])
    failed_cases = []
    # Per-PDB overrides for (starting_clustal_cut_off, min_clustal_cut_off, acceptable_sequence_percentage_match).
    specific_cut_offs = {
        '1AR1' : (78, 76, 73.00),   # Chain C has a Clustal Omega match at 77%
        '1BF4' : (80, 77, 87.00),   # Chain A has a Clustal Omega match at 79%
        '1MCP' : (100, 98, 50.00),  # Chain H has a Clustal Omega match at 100% but only half the chain
        '2ZAJ' : (75, 72, 70.00),   #
        '1CPM' : (73, 71, 70.00),   #
    }
    # NOTE(review): this set is currently unused in this function, and
    # ('2IMM') is just the string '2IMM' (not a tuple) — kept as-is.
    to_be_hardcoded = {
        # Special case: 1N02. This needs to be handled manually.
        # DBREF 1N02 A 1 3 UNP P81180 CVN_NOSEL 1 3
        # DBREF 1N02 A 4 49 UNP P81180 CVN_NOSEL 54 99
        # DBREF 1N02 A 50 54 UNP P81180 CVN_NOSEL 49 53
        # DBREF 1N02 A 55 99 UNP P81180 CVN_NOSEL 4 48
        # DBREF 1N02 A 100 101 UNP P81180 CVN_NOSEL 100 101
        '1N02',
        ('2IMM'), # No PDB <-> UniProt mapping
    }
    # When non-empty, restricts the run to just these IDs.
    test_these = [ '1KJ1' ]
    colortext.message('Testing %d PDB files for the DDG database.' % len(ddG_pdb_ids))
    #start_x = 0
    start_x = 0
    for x in range(start_x, len(ddG_pdb_ids)):
        ddG_pdb_id = ddG_pdb_ids[x]
        if test_these and ddG_pdb_id not in test_these:
            continue
        if ddG_pdb_id not in fix_later:
            colortext.warning('Testing PDB file number %d: %s' % (x, ddG_pdb_id))
            starting_clustal_cut_off = 100
            min_clustal_cut_off = 71
            acceptable_sequence_percentage_match = 80.0
            if specific_cut_offs.get(ddG_pdb_id):
                starting_clustal_cut_off, min_clustal_cut_off, acceptable_sequence_percentage_match = specific_cut_offs[ddG_pdb_id]
            try:
                rr = ResidueRelatrix(ddG_pdb_id, rosetta_scripts_path, rosetta_database_path, starting_clustal_cut_off = starting_clustal_cut_off, min_clustal_cut_off = min_clustal_cut_off, acceptable_sequence_percentage_match = acceptable_sequence_percentage_match, cache_dir = '/home/oconchus/temp')
            except SpecificException as e:
                # Fix: 'e' was previously unbound here, so str(e) raised NameError.
                failed_cases.append((x, ddG_pdb_id, str(e)))
        else:
            colortext.warning('SKIPPING PDB file number %d: %s' % (x, ddG_pdb_id))
    if failed_cases:
        colortext.error('Failed cases:')
        # Alternate warning/cyan colors per entry for readability.
        fcc = 0
        for f in failed_cases:
            if fcc == 0:
                colortext.warning(str(f))
            else:
                colortext.printf(str(f), color = 'cyan')
            fcc = (fcc + 1) % 2
    print("failed_cases", failed_cases)
shutil.rmtree(tmpdir) total_time_in_secs += t.sum() average_time_taken = float(total_time_in_secs)/float(cases_computed or 1) estimate_remaining_time = number_of_cases_left * average_time_taken t.stop() colortext.printf("**Profile**", 'orange') print(t) colortext.message("Time taken for this case: %0.2fs." % t.sum()) colortext.message("Average time taken per case: %0.2fs." % average_time_taken) colortext.message("Estimated time remaining: %dh%dm%ds." % (int(estimate_remaining_time/3600), int((estimate_remaining_time/60) % 60), estimate_remaining_time % 60)) print("\n") #exF.close() colortext.printf("\nDone.", 'lightgreen') if failed_cases: colortext.error("Failed cases:\n[%s]" % ",".join(map(str, failed_cases))) #main(FixedIDs = [38766, 39738, 40379, 40381] + range(40610, 40611)) #main(FixedIDs = [39044]) #main(FixedIDs = [48898,49870,50948,51058,51059,52247,53633,53711]) convert_scores_to_json() print('here') FixedIDs = [76633] FixedIDs = [] main(FixedIDs = FixedIDs, radii = [8.0])
def classify_failures(prediction_set):
    '''Triage the 'failed' predictions of a prediction set by inspecting their job archives.

    For each Prediction marked failed in the database, examines the job's zip
    archive under results_root: a job is considered to have truly failed only
    if a stderr file (*.cmd.e*) is present. Truly-failed jobs are then bucketed
    by their stderr contents into: Hessian ("HESSIN") failures, wildtype
    residue mismatches, and unknown failures; jobs with no output are listed
    separately. Prints a detailed report; returns nothing.
    '''
    ddGdb = ddgdbapi.ddGDatabase()
    results_root = '/kortemmelab/shared/DDG/jobs'
    UserDataSetExperimentIDs = {}
    results = ddGdb.execute_select('''SELECT ID, ExperimentID FROM Prediction WHERE PredictionSet = %s AND STATUS = 'failed' ''', parameters=(prediction_set,))
    reported_failures = [r['ID'] for r in results]
    for r in results:
        UserDataSetExperimentIDs[r['ID']] = r['ExperimentID']
    # Pass 1: decide, from the archive contents, which reported failures actually failed.
    actually_failed = []
    did_not_fail = []
    for PredictionID in reported_failures:
        zipfile_path = os.path.join(results_root, '%d.zip' % PredictionID)
        #try:
        z = zipfile.ZipFile(zipfile_path, 'r')
        #except:
        # colortext.error('MISSING FILE FOR %d' % PredictionID)
        # continue
        file_list = z.namelist()
        found_stdout = 0
        found_stderr = 0
        for f in file_list:
            if f.find('.cmd.o') != -1:
                found_stdout = 1
            elif f.find('.cmd.e') != -1:
                found_stderr = 1
        # A stderr file should never exist without a matching stdout file.
        assert(found_stdout >= found_stderr)
        if found_stderr:
            assert(found_stderr == 1)
            colortext.error("Job #%d actually failed" % PredictionID)
            actually_failed.append(PredictionID)
        else:
            colortext.warning("Job #%d had not failed by the time it was terminated." % PredictionID)
            did_not_fail.append(PredictionID)
    colortext.message("*** Report ***")
    print('%d jobs were marked as failed.' % len(reported_failures))
    colortext.warning('%d jobs were marked as failed but had not failed.' % len(did_not_fail))
    colortext.error('%d jobs were marked as failed and did fail.\n' % len(actually_failed))
    # Collect PDB metadata (resolution, techniques, chains, total job count) for the failed jobs.
    pdb_details = {}
    failed_job_pdb_files = {}
    for failed_job in actually_failed:
        PDBFileID = ddGdb.execute_select("SELECT UserDataSetExperiment.PDBFileID AS PDBFileID FROM Prediction INNER JOIN UserDataSetExperiment ON UserDataSetExperimentID=UserDataSetExperiment.ID WHERE Prediction.ID=%s", parameters=(failed_job,))[0]['PDBFileID']
        pdb_details[PDBFileID] = True
        failed_job_pdb_files[failed_job] = PDBFileID
    for pdb_id in pdb_details.keys():
        pdb_details[pdb_id] = ddGdb.execute_select("SELECT Resolution, Techniques FROM PDBFile WHERE ID=%s", parameters=(pdb_id,))[0]
        pdb_details[pdb_id]['Chains'] = [r['Chain'] for r in ddGdb.execute_select("SELECT Chain FROM PDBChain WHERE PDBFileID=%s ORDER BY Chain", parameters=(pdb_id,))]
        pdb_details[pdb_id]['TotalJobs'] = ddGdb.execute_select("SELECT Count(ID) AS TotalJobs FROM UserDataSetExperiment WHERE PDBFileID=%s", parameters=(pdb_id,))[0]['TotalJobs']
    # Pass 2: classify each truly-failed job by its stderr contents.
    hosts = {}
    failed_by_hessin = {}
    failed_by_residue_mismatch = {}
    failed_for_another_reason = {}
    missing_output = {}
    mutfiles = {}
    count = 1
    for failed_job in actually_failed:
        mutfile = None
        colortext.message('Failed job %d of %d' % (count, failed_job))
        zipfile_path = os.path.join(results_root, '%d.zip' % failed_job)
        found_output = False
        pdb_id = failed_job_pdb_files[failed_job]
        if os.path.exists(zipfile_path):
            z = zipfile.ZipFile(zipfile_path, 'r')
            file_list = z.namelist()
            for f in file_list:
                if f.find('.cmd.e') != -1:
                    found_output = True
                    stderr_contents = z.open(f, 'r').read()
                    stdout_contents = z.open(f.replace('.cmd.e', '.cmd.o'), 'r').read()
                    # The execution host is recorded between <host>...</host> tags in stdout.
                    hosts[failed_job] = stdout_contents[stdout_contents.find('<host>') + 6:stdout_contents.find('</host>')].strip()
                    if stderr_contents.find('HESSIN for (i,i):') != -1:
                        assert(stderr_contents.find('G for (i):') != -1)
                        print(stderr_contents[:120])
                        failed_by_hessin[pdb_id] = failed_by_hessin.get(pdb_id, [])
                        failed_by_hessin[pdb_id].append(failed_job)
                        colortext.error('HESSIN: %s' % pdb_id)
                    elif stderr_contents.find('ERROR: pose.residue(resnum).name1() == wt') != -1:
                        failed_by_residue_mismatch[pdb_id] = failed_by_residue_mismatch.get(pdb_id, [])
                        failed_by_residue_mismatch[pdb_id].append(failed_job)
                        colortext.error('MISMATCH')
                    else:
                        # Unrecognized failure: offer to show the stderr interactively.
                        failed_for_another_reason[pdb_id] = failed_for_another_reason.get(pdb_id, [])
                        failed_for_another_reason[pdb_id].append(failed_job)
                        colortext.error('UNKNOWN')
                        see_errors = ask_yes_no("Do you want to see the stderr files for prediction %d?" % failed_job, default_value=False)
                        if see_errors:
                            colortext.warning(f)
                            print(stderr_contents[:300])
                            print("")
                if f.find('.mutfile') != -1:
                    # Each archive is expected to contain at most one mutfile.
                    assert(mutfile == None)
                    mutfile = z.open(f, 'r').read()
                    mutfiles[failed_job] = mutfile
        if not found_output:
            missing_output[pdb_id] = missing_output.get(pdb_id, [])
            missing_output[pdb_id].append(failed_job)
        count += 1
    # Final report, grouped by failure category and PDB.
    colortext.message("*** Report ***")
    if missing_output:
        colortext.warning("Missing output: %d jobs" % sum([len(v) for k, v in missing_output.iteritems()]))
        for k, v in sorted(missing_output.iteritems()):
            print('%s: %d jobs - %s' % (k, len(v), ', '.join(map(str, sorted(v)))))
    if failed_by_hessin:
        colortext.warning("Failed Hessin: %d jobs" % sum([len(v) for k, v in failed_by_hessin.iteritems()]))
        for k, v in sorted(failed_by_hessin.iteritems()):
            if pdb_details[k]['Resolution'] != None:
                print('%s, %0.2fA, %s.' % (k, pdb_details[k]['Resolution'], pdb_details[k]['Techniques'].title()))
            else:
                print('%s, %s.' % (k, pdb_details[k]['Techniques'].title()))
            print('%d/%d jobs failed - %s\n' % (len(v), pdb_details[k]['TotalJobs'], ', '.join(map(str, sorted(v)))))
            for failed_id in sorted(v):
                mutations = ddGdb.execute_select("SELECT Prediction.ExperimentID, ExperimentMutation.* FROM Prediction INNER JOIN ExperimentMutation ON Prediction.ExperimentID=ExperimentMutation.ExperimentID WHERE Prediction.ID=%s", parameters=(failed_id,))
                mut_str = ', '.join([('%s %s%s%s' % (m['Chain'], m['WildTypeAA'], m['ResidueID'], m['MutantAA'])) for m in mutations])
                colortext.printf('%d: %s, experiment #%d. Host = %s' % (failed_id, mut_str, mutations[0]['ExperimentID'], hosts[failed_id]), 'orange')
            print('')
        print('')
    if failed_by_residue_mismatch:
        colortext.warning("Failed due to residue mismatch: %d jobs" % sum([len(v) for k, v in failed_by_residue_mismatch.iteritems()]))
        for k, v in sorted(failed_by_residue_mismatch.iteritems()):
            if pdb_details[k]['Resolution'] != None:
                colortext.printf('%s, %0.2fA, %s.' % (k, pdb_details[k]['Resolution'], pdb_details[k]['Techniques'].title()), 'cyan')
            else:
                colortext.printf('%s, %s.' % (k, pdb_details[k]['Techniques'].title()), 'cyan')
            print('%d/%d jobs failed - %s\n' % (len(v), pdb_details[k]['TotalJobs'], ', '.join(map(str, sorted(v)))))
            for failed_id in sorted(v):
                mutations = ddGdb.execute_select("SELECT ExperimentMutation.* FROM Prediction INNER JOIN ExperimentMutation ON Prediction.ExperimentID=ExperimentMutation.ExperimentID WHERE Prediction.ID=%s", parameters=(failed_id,))
                mut_str = ', '.join([('%s %s%s%s' % (m['Chain'], m['WildTypeAA'], m['ResidueID'], m['MutantAA'])) for m in mutations])
                colortext.printf('%d: %s' % (failed_id, mut_str), 'orange')
                print(mutfiles[failed_id])
                print('')
            print('')
    if failed_for_another_reason:
        colortext.warning("Failed for an unknown reason: %d jobs" % sum([len(v) for k, v in failed_for_another_reason.iteritems()]))
        for k, v in sorted(failed_for_another_reason.iteritems()):
            if pdb_details[k]['Resolution'] != None:
                print('%s, %0.2fA, %s.' % (k, pdb_details[k]['Resolution'], pdb_details[k]['Techniques'].title()))
            else:
                print('%s, %s.' % (k, pdb_details[k]['Techniques'].title()))
            print('%d/%d jobs failed - %s\n' % (len(v), pdb_details[k]['TotalJobs'], ', '.join(map(str, sorted(v)))))
    print('%d jobs were marked as failed.' % len(reported_failures))
    colortext.warning('%d jobs were marked as failed but had not failed.' % len(did_not_fail))
    colortext.error('%d jobs were marked as failed and did fail.\n' % len(actually_failed))
def retrieve_file_from_RCSB(http_connection, resource, silent = True):
    '''Retrieve a file from the RCSB.

    :param http_connection: object exposing get(resource) -> file contents.
    :param resource: path of the resource on the RCSB server.
    :param silent: when False, print a progress message naming the file.
    '''
    if not silent:
        filename = os.path.split(resource)[1]
        colortext.printf("Retrieving %s from RCSB" % filename, color = "aqua")
    return http_connection.get(resource)
# Smoke test for the klab.colortext console-color helpers: renders every named
# color plain and with each effect on a light blue background, then exercises
# the rainbow, printf, and bar helpers.
from klab import colortext

chars = 'A'
count = 0
for color_name, color_data in colortext.colors.iteritems():
    colortext.write(color_name, color_name)
    for text_effect in colortext.EFFECTS_:
        colortext.write(color_name, color = color_name, bgcolor = 'lightblue', effect = text_effect)
    print("")
colortext.rainbowprint("Rainbow test")
colortext.printf("\ntest1", color = 'red')
colortext.printf("test2")
colortext.bar('blue', 9, suffix = "\n")
def get_data_for_small_large_diagram_for_website():
    '''Compute small-to-large / large-to-small mutation statistics per dataset for the website diagram.

    Classifies every single-mutation record (and multi-mutation records whose
    mutations all share one class) in four benchmark datasets as SL (small to
    large by van der Waals volume), LS (large to small), or XX (equal volume),
    then prints the raw counts and the percentages relative to the counted
    records. Records whose mutation classes disagree are skipped and tallied
    separately.
    '''
    # NOTE(review): 'd' is loaded but never used in this block — confirm whether
    # the JSON input is still needed.
    d = json.loads(read_file('r57471_analysis_input.json'))
    ddG_connection = db_api.ddG()
    amino_acids = ddG_connection.get_amino_acids_for_analysis()
    # Map each of the 20 amino acids to its van der Waals volume.
    amino_acid_volumes = {}
    for aa, details in amino_acids.iteritems():
        amino_acid_volumes[aa] = details['van_der_Waals_volume']
    assert(len(amino_acid_volumes) == 20)
    ddGdb = ddgdbapi.ddGDatabase()
    datasets = ['CuratedProTherm_2014/12/04', 'Guerois_10.1016/S0022-2836(02)00442-4_2002/07/05', 'Kellogg_10.1002/prot.22921_2010/12/03', 'Potapov_10.1093/protein/gzp030_2009/09/01']
    multiple_mutations = dict.fromkeys(datasets, 0)
    net_counts = dict.fromkeys(datasets, 0)
    SL_counts = {}
    for dataset in datasets:
        SL_counts[dataset] = {'SL': 0, 'LS': 0, 'XX': 0}
    for dataset in datasets:
        records = ddGdb.execute_select('SELECT * FROM DataSetDDG WHERE DataSetID=%s', parameters=(dataset,))
        print('%d records in %s' % (len(records), dataset))
        for r in records:
            # Each DataSetDDG record should map back to exactly one Experiment.
            experiment_ids = set([s['ExperimentID'] for s in ddGdb.execute_select(''' SELECT ExperimentID FROM DataSetDDGSource INNER JOIN ExperimentAssayDDG ON DataSetDDGSource.ExperimentAssayID=ExperimentAssayDDG.ExperimentAssayID AND DataSetDDGSource.Type=ExperimentAssayDDG.Type INNER JOIN ExperimentAssay ON ExperimentAssayDDG.ExperimentAssayID=ExperimentAssay.ID WHERE DataSetDDGID=%s''', parameters = (r['ID'],))])
            if not len(experiment_ids) == 1:
                colortext.warning('Duplicate record in %s: Dataset record #%d, ExperimentIDs=%s.' % (dataset, r['ID'], ', '.join(map(str, sorted(experiment_ids)))))
                continue
            experiment_id = experiment_ids.pop()
            mutations = ddGdb.execute_select('''SELECT * FROM ExperimentMutation WHERE ExperimentID=%s''', parameters=(experiment_id,))
            if len(mutations) > 1:
                # Multi-mutation record: only count it if every mutation falls
                # into the same SL/LS/XX class; otherwise tally it as skipped.
                mutation_classes = set()
                error = False
                for mutation in mutations:
                    wt, mut = mutation['WildTypeAA'], mutation['MutantAA']
                    if r['MutationIsReversed']:
                        # Note: For reverse mutations, we need to switch the order since we only store the forward mutation
                        wt, mut = mutation['MutantAA'], mutation['WildTypeAA']
                    if wt == mut:
                        colortext.warning('Error in %s: Record mutating %s to %s in Experiment #%d.' % (dataset, wt, mut, experiment_id))
                        error = True
                    elif amino_acid_volumes[wt] < amino_acid_volumes[mut]:
                        mutation_classes.add('SL')
                    elif amino_acid_volumes[wt] > amino_acid_volumes[mut]:
                        mutation_classes.add('LS')
                    else:
                        assert(amino_acid_volumes[wt] == amino_acid_volumes[mut])
                        mutation_classes.add('XX')
                if not(error) and len(mutation_classes) == 1:
                    colortext.printf('Multiple mutation case allowed since both mutations have the same class.', 'cyan')
                    SL_counts[dataset][mutation_classes.pop()] += 1
                else:
                    multiple_mutations[dataset] += 1
                    continue # skip multiple mutations
            else:
                assert(len(mutations) == 1)
                mutation = mutations[0]
                wt, mut = mutation['WildTypeAA'], mutation['MutantAA']
                if r['MutationIsReversed']:
                    # Note: For reverse mutations, we need to switch the order since we only store the forward mutation
                    wt, mut = mutation['MutantAA'], mutation['WildTypeAA']
                if wt == mut:
                    colortext.warning('Error in %s: Record mutating %s to %s in Experiment #%d.' % (dataset, wt, mut, experiment_id))
                    continue
                elif amino_acid_volumes[wt] < amino_acid_volumes[mut]:
                    SL_counts[dataset]['SL'] += 1
                elif amino_acid_volumes[wt] > amino_acid_volumes[mut]:
                    SL_counts[dataset]['LS'] += 1
                else:
                    assert(amino_acid_volumes[wt] == amino_acid_volumes[mut])
                    SL_counts[dataset]['XX'] += 1
            net_counts[dataset] += 1
    #GASCPDTNVEQHMLIKFYRW
    colortext.message('\nRecords with multiple mutations that were skipped.')
    pprint.pprint(multiple_mutations)
    colortext.message('\nNet SL, LS, and XX counts for the datasets.')
    pprint.pprint(SL_counts)
    # Convert the raw counts to percentages of the counted records in place.
    for dataset, details in SL_counts.iteritems():
        for type, type_total in details.iteritems():
            details[type] = 100 * (float(type_total)/float(net_counts[dataset]))
    colortext.message('\nNet SL, LS, and XX percentages for the datasets.')
    pprint.pprint(SL_counts)
def determine_structure_scores(DDG_api, skip_if_we_have_pairs = 50):
    '''Computes and stores structure scores for every Prediction in each completed prediction set.

       Two passes per prediction set:
         Pass 1 sanity-checks that each Prediction's pickled ddG scores and JSON Scores agree
         and that Noah's scores are present.
         Pass 2 stores the ensemble (whole-Prediction) DDG scores and the per-structure
         ddg_monomer scores into PredictionStructureScore.

       DDG_api              : ddG API object exposing ddGDB, get_ddg_monomer_scores_per_structure
                              and determine_best_pair.
       skip_if_we_have_pairs: if not None, skip the expensive per-structure scoring for a
                              Prediction when exactly this many WildType/Mutant score pairs
                              already exist (avoids extracting the job archive again).
    '''
    pp = pprint.PrettyPrinter(indent=4)
    ddGdb = DDG_api.ddGDB
    ddGdb_utf = ddgdbapi.ddGDatabase(use_utf = True)

    # Get the list of completed prediction sets
    completed_prediction_sets = get_completed_prediction_sets(DDG_api)
    print(completed_prediction_sets)

    # Create the mapping from the old score types to the ScoreMethod record IDs
    ScoreMethodMap = {}
    results = ddGdb_utf.execute('SELECT * FROM ScoreMethod')
    for r in results:
        if r['MethodName'] == 'Global' and r['MethodType'] == 'Protocol 16':
            ScoreMethodMap[("kellogg", "total")] = r['ID']
        if r['Authors'] == 'Noah Ollikainen':
            if r['MethodName'] == 'Local' and r['MethodType'] == 'Position' and r['Parameters'] == u'8Å radius':
                ScoreMethodMap[("noah_8,0A", "positional")] = r['ID']
            if r['MethodName'] == 'Local' and r['MethodType'] == 'Position (2-body)' and r['Parameters'] == u'8Å radius':
                ScoreMethodMap[("noah_8,0A", "positional_twoscore")] = r['ID']
            if r['MethodName'] == 'Global' and r['MethodType'] == 'By residue' and r['Parameters'] == u'8Å radius':
                ScoreMethodMap[("noah_8,0A", "total")] = r['ID']

    # For each completed prediction set, determine the structure scores
    for prediction_set in completed_prediction_sets:
        predictions = ddGdb.execute('SELECT ID, ddG, Scores, status, ScoreVersion FROM Prediction WHERE PredictionSet=%s ORDER BY ID', parameters=(prediction_set,))
        num_predictions = len(predictions)

        # Pass #1: Iterate over all Predictions and make sure that they gave completed and contain all the scores we expect
        colortext.message('Prediction set: %s' % prediction_set)
        colortext.warning('Checking that all data exists...')
        for prediction in predictions:
            PredictionID = prediction['ID']
            # NOTE(review): debugging leftover - this guard restricts the sanity check to one
            # hard-coded Prediction, skipping the check for everything else. Remove it to
            # validate the whole prediction set.
            if PredictionID != 72856:
                continue
            global_scores = pickle.loads(prediction['ddG'])
            assert(global_scores)
            assert(prediction['ScoreVersion'] == 0.23)
            if not prediction['Scores']:
                raise Exception("This prediction needs to be scored with Noah's method.")
            gs2 = json.loads(prediction['Scores'])
            # At least one score type key must be a 'noah*' score
            if True not in set([k.find('noah') != -1 for k in gs2['data'].keys()]):
                raise Exception("This prediction needs to be scored with Noah's method.")
            # The JSON copy of the kellogg scores must agree with the pickled copy
            assert (gs2['data']['kellogg'] == global_scores['data']['kellogg'])

        # Pass #2: Iterate over all completed Predictions with null StructureScores.
        # For each Prediction, determine and store the structure scores
        count = 0
        for prediction in predictions:
            count += 1
            PredictionID = prediction['ID']
            colortext.message('%s: %d of %d (Prediction #%d)' % (prediction_set, count, num_predictions, PredictionID))
            if prediction['status'] == 'failed':
                colortext.error('Skipping failed prediction %d.' % PredictionID)
                continue
            if prediction['status'] == 'queued':
                colortext.warning('Skipping queued prediction %d.' % PredictionID)
                continue
            if prediction['status'] == 'postponed':
                colortext.printf('Skipping postponed prediction %d.' % PredictionID, 'cyan')
                continue

            # Store the ensemble scores
            try:
                global_scores = json.loads(prediction['Scores'])['data']
            except Exception:
                # Bug fix: was a bare except, which also swallowed KeyboardInterrupt/SystemExit.
                raise colortext.Exception("Failed reading the Scores field's JSON object. The Prediction Status is %(status)s. The Scores field is: '%(Scores)s'." % prediction)
            for score_type, inner_data in global_scores.iteritems():
                for inner_score_type, data in inner_data.iteritems():
                    components = {}
                    if score_type == 'kellogg' and inner_score_type == 'total':
                        components = data['components']
                        ddG = data['ddG']
                    elif score_type == 'noah_8,0A' and inner_score_type == 'positional':
                        ddG = data['ddG']
                    elif score_type == 'noah_8,0A' and inner_score_type == 'positional_twoscore':
                        ddG = data['ddG']
                    elif score_type == 'noah_8,0A' and inner_score_type == 'total':
                        ddG = data['ddG']
                    else:
                        # Unhandled score types are skipped. (Bug fix: the original had an
                        # unreachable 'raise Exception(...)' immediately after this 'continue'.)
                        continue
                    ScoreMethodID = ScoreMethodMap[(score_type, inner_score_type)]
                    new_record = dict(
                        PredictionID = PredictionID,
                        ScoreMethodID = ScoreMethodID,
                        ScoreType = 'DDG',
                        StructureID = -1, # This score is for the Prediction rather than a structure
                        DDG = ddG,
                    )
                    # The score component keys must not clash with the fixed record keys
                    assert(not(set(components.keys()).intersection(set(new_record.keys()))))
                    new_record.update(components)
                    ddGdb.insertDictIfNew('PredictionStructureScore', new_record, ['PredictionID', 'ScoreMethodID', 'ScoreType', 'StructureID'])

            if skip_if_we_have_pairs is not None:
                # Skip this case if we have a certain number of existing records (much quicker since we do not have to extract the binary)
                num_wt = ddGdb.execute_select("SELECT COUNT(ID) AS NumRecords FROM PredictionStructureScore WHERE PredictionID=%s AND ScoreType='WildType'", parameters=(PredictionID,))[0]['NumRecords']
                num_mut = ddGdb.execute_select("SELECT COUNT(ID) AS NumRecords FROM PredictionStructureScore WHERE PredictionID=%s AND ScoreType='Mutant'", parameters=(PredictionID,))[0]['NumRecords']
                print(num_wt, num_mut)
                if num_wt == num_mut and num_mut == skip_if_we_have_pairs:
                    continue

            # Store the ddg_monomer scores for each structure
            grouped_scores = DDG_api.get_ddg_monomer_scores_per_structure(PredictionID)
            for structure_id, wt_scores in sorted(grouped_scores['WildType'].iteritems()):
                new_record = dict(
                    PredictionID = PredictionID,
                    ScoreMethodID = ScoreMethodMap[("kellogg", "total")],
                    ScoreType = 'WildType',
                    StructureID = structure_id,
                    DDG = None,
                )
                new_record.update(wt_scores)
                ddGdb.insertDictIfNew('PredictionStructureScore', new_record, ['PredictionID', 'ScoreMethodID', 'ScoreType', 'StructureID'])
            for structure_id, mut_scores in sorted(grouped_scores['Mutant'].iteritems()):
                # (Renamed the loop variable: the original reused 'wt_scores' for mutant scores.)
                new_record = dict(
                    PredictionID = PredictionID,
                    ScoreMethodID = ScoreMethodMap[("kellogg", "total")],
                    ScoreType = 'Mutant',
                    StructureID = structure_id,
                    DDG = None,
                )
                new_record.update(mut_scores)
                ddGdb.insertDictIfNew('PredictionStructureScore', new_record, ['PredictionID', 'ScoreMethodID', 'ScoreType', 'StructureID'])

            # Test to make sure that we can pick a best pair of structures (for generating a PyMOL session)
            assert(DDG_api.determine_best_pair(PredictionID) is not None)
def plot(self, table_name, RFunction, output_filename=None, filetype="pdf"):
    '''Plots the named analysis table using the given R function.

       table_name     : key into self.analysis_tables. The table's points are expected to be
                        a list of dicts each of which has the keys ExperimentID and ddG.
       RFunction      : callable taking (input_csv_path, output_path, filetype) and returning
                        the raw R output to be parsed by RUtilities.parse_R_output.
       output_filename: if given, the graph is written to that file and the filename is
                        returned; otherwise the graph is rendered to a temporary file and
                        returned wrapped in an AnalysisObject.
       filetype       : image format passed through to the R function (default "pdf").

       Raises an Exception if the table does not exist or has fewer than two data points.
    '''
    if (not self.analysis_tables) or (not table_name):
        raise Exception("There are no analysis tables to plot.")
    if table_name not in self.analysis_tables:
        raise Exception("The analysis table '%s' does not exist." % table_name)

    R_return_values = {}
    gplot = None
    analysis_table = self.analysis_tables[table_name]
    if self.quiet_level >= 3:
        print(table_name)
        print(RFunction)
    # Bug fix: the original only rejected tables with exactly one point, so an empty
    # table slipped through to the R call.
    if len(analysis_table.points) < 2:
        raise Exception("The analysis table %s only has %d data point(s). At least two points are required." % (table_name, len(analysis_table.points)))

    inputfname = self.CreateCSVFile(table_name)
    if self.quiet_level >= 3:
        print(inputfname)
    try:
        if self.quiet_level >= 2:
            colortext.printf("Running %s." % RFunction)
            if output_filename:
                colortext.printf("Saving graph as %s with filename %s." % (filetype, output_filename))
        output_fname = output_filename
        if not output_fname:
            # No target file requested - render to a temporary file instead
            output_fname = rosettahelper.writeTempFile(".", "")
        R_output = RFunction(inputfname, output_fname, filetype)
        R_return_values = RUtilities.parse_R_output(R_output)
        colortext.message(table_name)
        print(" %s" % str(RFunction))
        for k, v in sorted(R_return_values.iteritems()):
            print(" %s: %s" % (str(k), str(v)))
        if not output_filename:
            contents = rosettahelper.readBinaryFile(output_fname)
            delete_file(output_fname)
            # Look up the human-readable description registered for this R function
            description = None
            for file_suffix, details in RFunctions.iteritems():
                if details[1] == RFunction:
                    description = details[0]
            assert (description)
            gplot = AnalysisObject(table_name, description, filetype, contents)
        else:
            gplot = output_filename
    except Exception:
        import traceback
        colortext.error(traceback.format_exc())
        delete_file(inputfname)
        # Bug fix: re-raise the original exception; the old 'raise Exception(e)' discarded
        # both the exception type and the traceback.
        raise
    delete_file(inputfname)
    # Bug fix: the original computed gplot but fell off the end, always returning None.
    return gplot
def main(FixedIDs = [], radii = [6.0, 7.0, 8.0, 9.0]): max_processors = get_number_of_processors() rescore_process_file = "/tmp/klab_rescore.txt" parser = OptionParser() parser.add_option("-n", "--numprocesses", default=1, type='int', dest="num_processes", help="The number of processes used for the rescoring. The cases are split according to this number.", metavar="NUM_PROCESSES") parser.add_option("-p", "--process", default=1, type='int', dest="process", help="The ID of this process. This should be an integer between 1 and the number of processes used for the rescoring.", metavar="PROCESS_ID") parser.add_option("-d", "--delete", action="store_true", dest="delete", help="Delete the process tracking file %s." % rescore_process_file) parser.add_option("-s", "--set", type='string', dest="prediction_set", help="The prediction set to rescore.") (options, args) = parser.parse_args() if options.delete and os.path.exists(rescore_process_file): print("Removing %s." % rescore_process_file) os.remove(rescore_process_file) num_processes = options.num_processes prediction_set = options.prediction_set process_id = options.process for i in FixedIDs: assert(type(i) == type(1)) # SELECT * FROM `Prediction` WHERE `PredictionSet`= 'RosCon2013_P16_score12prime' AND Status='done' LIMIT 1 # Check prediction set if not prediction_set: raise colortext.Exception("A prediction set must be specified.") else: if FixedIDs: results = ddGdb.execute("SELECT DISTINCT PredictionSet FROM Prediction WHERE ID IN (%s)" % ",".join(map(str, FixedIDs))) if len(results) != 1: raise colortext.Exception("Error: The fixed IDs cover %d different prediction sets." % len(results)) else: results = ddGdb.execute("SELECT ID FROM PredictionSet WHERE ID=%s", parameters=(prediction_set,)) if not results: raise colortext.Exception("The prediction set '%s' does not exist in the database." 
% prediction_set) if num_processes < 1: raise colortext.Exception("At least 1 processor must be used.") if num_processes > max_processors: raise colortext.Exception("Only %d processors/cores were detected. Cannot run with %d processes." % (max_processors, num_processes)) if num_processes > (max_processors * 0.75): colortext.warning("Warning: Using %d processors/cores out of %d which is %0.2f%% of the total available." % (num_processes, max_processors, (100.0*float(num_processes)/float(max_processors)))) if not(1 <= process_id <= min(max_processors, num_processes)): raise colortext.Exception("The process ID %d must be between 1 and the number of processes, %d." % (process_id, num_processes)) if os.path.exists(rescore_process_file): lines = readFileLines(rescore_process_file) idx = lines[0].find("numprocesses") if idx == -1: raise Exception("Badly formatted %s." % rescore_process_file) existing_num_processes = int(lines[0][idx+len("numprocesses"):]) if existing_num_processes != num_processes: raise colortext.Exception("You specified the number of processes to be %d but %s already specifies it as %d." % (num_processes, rescore_process_file, existing_num_processes)) for line in [line for line in lines[1:] if line.strip()]: idx = line.find("process") if idx == -1: raise colortext.Exception("Badly formatted %s. Line is '%s'." % (rescore_process_file, line)) existing_process = int(line[idx+len('process'):]) if process_id == existing_process: raise colortext.Exception("Process %d is already logged as running. Check if this is so and edit %s." 
% (process_id, rescore_process_file)) F = open(rescore_process_file, 'a') F.write("process %d\n" % process_id) F.close() else: F = open(rescore_process_file, 'w') F.write("numprocesses %d\n" % num_processes) F.write("process %d\n" % process_id) F.close() output_dir = os.path.join('rescoring', str(process_id)) if not(os.path.exists(output_dir)): os.makedirs(output_dir) abs_output_dir = os.path.abspath(os.path.join(os.getcwd(), output_dir)) print("Running process in %s.\n" % abs_output_dir) ReallyFixedIDs = False results = ddGdb.execute("SELECT ID, ExperimentID, Scores FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion <> %s", parameters=(prediction_set, float(current_score_revision),)) if not(FixedIDs) and results: raise WrongScoreRevisionException("Score versions found which are not %s. Need to update table structure." % current_score_revision) else: # Hacky way to run multiple processes if ReallyFixedIDs: num_to_score = len(remaining_unscored) num_for_this_to_score = num_to_score / num_processes IDs_to_score = remaining_unscored[(process_id-1) * num_for_this_to_score : (process_id) * num_for_this_to_score] results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s)" % (",".join(map(str, IDs_to_score)))) elif FixedIDs: results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s) AND MOD(ID,%s)=%s" % (",".join(map(str, FixedIDs)), num_processes,process_id-1)) else: results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion=%s AND MOD(ID,%s)=%s", parameters=(prediction_set, float(current_score_revision),num_processes,process_id-1)) count = 0 cases_computed = 0 total_time_in_secs = 0 number_of_cases_left = len(results) * len(radii) failed_cases = [] colortext.printf("Rescoring %d predictions over %d radii...\n" % (len(results), len(radii)), 
'lightgreen') for r in results: t = Timer() t.add('Preamble') inner_count = 0 mutations = ddGdb.execute('SELECT * FROM ExperimentMutation WHERE ExperimentID=%s', parameters=(r['ExperimentID'],)) mutation_str = ', '.join(['%s %s%s%s' % (m['Chain'], m['WildTypeAA'], m['ResidueID'], m['MutantAA']) for m in mutations]) extracted_data = False details = ddGdb.execute_select('SELECT Prediction.ID, PDBFileID, Chain FROM Prediction INNER JOIN Experiment ON Prediction.ExperimentID=Experiment.ID INNER JOIN ExperimentChain ON Prediction.ExperimentID=ExperimentChain.ExperimentID WHERE Prediction.ID=%s', parameters=(r['ID'],)) details = ddGdb.execute_select('SELECT Prediction.ID, PDBFileID, Chain FROM Prediction INNER JOIN Experiment ON Prediction.ExperimentID=Experiment.ID INNER JOIN ExperimentChain ON Prediction.ExperimentID=ExperimentChain.ExperimentID WHERE Prediction.ID=%s', parameters=(r['ID'],)) colortext.message("Prediction: %d, %s chain %s. Mutations: %s. Experiment ID #%d. UserDataSetExperimentID #%d." 
% (details[0]['ID'], details[0]['PDBFileID'], details[0]['Chain'], mutation_str, r['ExperimentID'], r['UserDataSetExperimentID'])) experiment_pdbID = ddGdb.execute('SELECT PDBFileID FROM Experiment WHERE ID=%s', parameters=(r['ExperimentID'],))[0]['PDBFileID'] print('Experiment PDB file ID = %s' % experiment_pdbID) pdbID = ddGdb.execute('SELECT UserDataSetExperiment.PDBFileID FROM Prediction INNER JOIN UserDataSetExperiment ON UserDataSetExperimentID=UserDataSetExperiment.ID WHERE Prediction.ID=%s', parameters=(r['ID'],))[0]['PDBFileID'] print('UserDataSetExperiment PDB file ID = %s' % pdbID) count += 1 if True:#len(mutations) == 1: timestart = time.time() #mutation = mutations[0] dbchains = sorted(set([mutation['Chain'] for mutation in mutations])) # todo: note: assuming monomeric structures here assert(len(dbchains) == 1) dbchain = dbchains[0] #mutantaa = mutation['MutantAA'] ddG_dict = json.loads(r['Scores']) kellogg_ddG = ddG_dict['data']['kellogg']['total']['ddG'] #assert(ddG_dict['version'] == current_score_revision) all_done = True for radius in radii: score_name = ('noah_%0.1fA' % radius).replace(".", ",") if not(ddG_dict['data'].get(score_name)): all_done = False else: cases_computed += 1 number_of_cases_left -= 1 if all_done: print('Prediction %d: done.' 
% r["ID"]) continue # Extract data t.add('Grab data') #archivefile = None #prediction_data_path = ddGdb.execute('SELECT Value FROM _DBCONSTANTS WHERE VariableName="PredictionDataPath"')[0]['Value'] #job_data_path = os.path.join(prediction_data_path, '%d.zip' % r['ID']) #print(job_data_path) #assert(os.path.exists(job_data_path)) #archivefile = readBinaryFile(job_data_path) archivefile = DDG_interface.getData(r['ID']) zipfilename = os.path.join(output_dir, "%d.zip" % r['ID']) F = open(zipfilename, "wb") F.write(archivefile) F.close() t.add('Extract data') zipped_content = zipfile.ZipFile(zipfilename, 'r', zipfile.ZIP_DEFLATED) tmpdir = None repacked_files = [] mutant_files = [] rosetta_resids = [] try: tmpdir = makeTemp755Directory(output_dir) highestIndex = -1 foundResfile = False foundMutfile = False presumed_mutation = None for fname in sorted(zipped_content.namelist()): if fname.endswith(".pdb"): if fname.startswith("%s/mut_" % r['ID']) or fname.startswith("%s/repacked_" % r['ID']): structnum = int(fname[fname.rindex('_')+1:-4]) if fname.startswith("%s/mut_" % r['ID']): if presumed_mutation: assert(presumed_mutation == os.path.split(fname)[1].split('_')[1]) else: presumed_mutation = os.path.split(fname)[1].split('_')[1] newfname = 'mutant_%02d' % structnum if fname.startswith("%s/repacked_" % r['ID']): newfname = 'repacked_%02d' % structnum highestIndex = max(highestIndex, structnum) newfilepath = os.path.join(tmpdir, newfname) writeFile(newfilepath, zipped_content.read(fname)) if fname.startswith("%s/mut_" % r['ID']): mutant_files.append(newfilepath) if fname.startswith("%s/repacked_" % r['ID']): repacked_files.append(newfilepath) #elif fname.startswith("%s/%s-%s" % (r['ID'],r['ExperimentID'],pdbID)) or fname.startswith("%s/repacked_" % r['ID']): # writeFile(os.path.join(tmpdir, '%s.pdb' % pdbID), zipped_content.read(fname)) if fname.startswith("%s/%s-%s.resfile" % (r['ID'],r['ExperimentID'],experiment_pdbID)): raise Exception('This case needs to be updated 
(see the mutfile section below). We mainly use mutfiles now so I did not update this section.') foundResfile = True lines = zipped_content.read(fname).split("\n") assert(len(lines) == 3) assert(lines[0] == "NATAA") assert(lines[1] == "start") resfile_mutation = lines[2].split(" ") assert(len(resfile_mutation) == 4) rosetta_resid = resfile_mutation[0] rosetta_chain = resfile_mutation[1] rosetta_mutaa = resfile_mutation[3] assert(mutantaa == rosetta_mutaa) assert(dbchain == rosetta_chain) assert(resfile_mutation[2] == 'PIKAA') assert(len(rosetta_mutaa) == 1) if fname.startswith("%s/%s-%s.mutfile" % (r['ID'],r['ExperimentID'],experiment_pdbID)): foundMutfile = True lines = zipped_content.read(fname).split("\n") assert(lines[0].startswith('total ')) num_mutations = int(lines[0][6:]) assert(lines[1] == str(num_mutations)) # todo: note: assuming monomeric structures here rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],)) assert(len(rosetta_chain) == 1) rosetta_chain = rosetta_chain[0]['Chain'] resfile_mutations = lines[2:] for resfile_mutation in resfile_mutations: resfile_mutation = resfile_mutation.split(" ") assert(len(resfile_mutation) == 3) rosetta_resids.append(resfile_mutation[1]) rosetta_mutaa = resfile_mutation[2] assert(dbchain == rosetta_chain) assert(len(rosetta_mutaa) == 1) # Make sure the wtaa->mutantaa types match the structures assert(not(foundResfile)) if not foundMutfile: raise Exception('This case needs to be updated (see the mutfile section below). 
This was added as a hack for cases where I did not store the mutfile so I did not update this section.') input_files = ddGdb.execute_select('SELECT InputFiles FROM Prediction WHERE ID=%s', parameters=(r['ID'],)) assert(len(input_files) == 1) lines = pickle.loads(input_files[0]['InputFiles'])['MUTFILE'].split("\n") #lines = regenerate_mutfile(r['ID']).split("\n") assert(len(lines) == 3) assert(lines[0] == "total 1") assert(lines[1] == "1") resfile_mutation = lines[2].split(" ") assert(len(resfile_mutation) == 3) rosetta_resid = resfile_mutation[1] rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],)) assert(len(rosetta_chain) == 1) rosetta_chain = rosetta_chain[0]['Chain'] rosetta_mutaa = resfile_mutation[2] assert(dbchain == rosetta_chain) assert(len(rosetta_mutaa) == 1) assert("%s%s%s" % (resfile_mutation[0], resfile_mutation[1], resfile_mutation[2]) == presumed_mutation) fullresids = [] for rosetta_resid in rosetta_resids: fullresid = None if rosetta_resid.isdigit(): fullresid = '%s%s%s ' % (rosetta_chain, (4-len(rosetta_resid)) * ' ', rosetta_resid) else: assert(False) fullresid = '%s%s%s' % (rosetta_chain, (5-len(rosetta_resid)) * ' ', rosetta_resid) fullresids.append(fullresid) resultst1 = ddGdb.execute_select("SELECT ExperimentID, UserDataSetExperimentID FROM Prediction WHERE ID=%s", parameters = (r['ID'],)) assert(len(resultst1) == 1) ExperimentIDt1 = resultst1[0]['ExperimentID'] UserDataSetExperimentIDt1 = resultst1[0]['UserDataSetExperimentID'] if UserDataSetExperimentIDt1: resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM UserDataSetExperiment WHERE ID=%s", parameters = (UserDataSetExperimentIDt1,)) else: resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM Experiment WHERE ID=%s", parameters = (ExperimentIDt1,)) assert(len(resultst2) == 1) prediction_PDB_ID = resultst2[0]['PDBFileID'] if False and prediction_PDB_ID not in ['1TEN', '1AYE', '1H7M'] + ['1A2P', '1BNI', '1STN']: for 
fullresid in fullresids: wtaa = None for m in mutations: # Hack for ub_RPN13 if prediction_PDB_ID == 'ub_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109): wtaa = m['WildTypeAA'] # Hack for ub_RPN13_yeast elif prediction_PDB_ID == 'uby_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109): wtaa = m['WildTypeAA'] # Hack for ub_OTU elif prediction_PDB_ID == 'ub_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172): wtaa = m['WildTypeAA'] # Hack for ub_OTU_yeast elif prediction_PDB_ID == 'uby_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172): wtaa = m['WildTypeAA'] # Hack for ub_UQcon elif prediction_PDB_ID == 'ub_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) + 213): # starts at 501 wtaa = m['WildTypeAA'] # Hack for uby_UQcon elif prediction_PDB_ID == 'uby_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 287): wtaa = m['WildTypeAA'] elif m['Chain'] == fullresid[0] and m['ResidueID'] == fullresid[1:].strip(): wtaa = m['WildTypeAA'] if (wtaa == None): colortext.error(prediction_PDB_ID) colortext.error('wtaa == None') colortext.error('fullresid = %s' % str(fullresid)) colortext.error(str(mutations)) colortext.warning([rosetta_resid.strip() for rosetta_resid in rosetta_resids]) #sys.exit(0) assert(wtaa != None) assert(PDB.from_filepath(repacked_files[0]).get_residue_id_to_type_map()[fullresid] == wtaa) #assert(PDB(mutant_files[0]).get_residue_id_to_type_map()[fullresid] == mutantaa) for radius in radii: score_name = ('noah_%0.1fA' % radius).replace(".", ",") if ddG_dict['data'].get(score_name): print('Radius %0.1f: done.' % radius) continue cases_computed += 1 number_of_cases_left -= 1 t.add('Radius %0.3f: repacked' % radius) colortext.printf("Prediction ID: %d. Calculating radius %0.1f. 
Calculation #%d of %d." % (r['ID'], radius, cases_computed, len(results) * len(radii)), 'orange') repacked_score = NoahScore() repacked_score.calculate(repacked_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius) colortext.message("Repacked") print(repacked_score) t.add('Radius %0.3f: mutant' % radius) mutant_score = NoahScore() mutant_score.calculate(mutant_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius) colortext.printf("Mutant", color = 'cyan') print(mutant_score) t.add('Radius %0.3f: postamble' % radius) colortext.printf("ddG", color = 'lightpurple') ddg_score = repacked_score.ddg(mutant_score) print(ddg_score) colortext.printf("Liz's ddG", color = 'yellow') print("Total score: %0.3f" % kellogg_ddG) ddG_dict['version'] = '0.23' if ddG_dict['version'] == '0.1': ddG_dict['version'] = '0.21' ddG_dict['data'] = { 'kellogg' : { 'total' : ddG_dict['data'], }, 'noah': { 'total' : {'ddG' : ddg_score.total}, 'positional' : {'ddG' : ddg_score.positional}, 'positional_twoscore' : {'ddG' : ddg_score.positional_twoscore}, }, } elif ddG_dict['version'] == '0.2': ddG_dict['version'] = '0.21' ddG_dict['data']['noah']['total']['ddG'] = ddg_score.total ddG_dict['data']['noah']['positional']['ddG'] = ddg_score.positional ddG_dict['data']['noah']['positional_twoscore']['ddG'] = ddg_score.positional_twoscore elif ddG_dict['version'] == '0.22': ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}} ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore elif ddG_dict['version'] == '0.23': ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}} ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total 
ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore jsonified_ddG = json.dumps(ddG_dict) ddGdb.execute('UPDATE Prediction SET Scores=%s WHERE ID=%s', parameters=(jsonified_ddG, r['ID'],)) t.add('Cleanup') shutil.rmtree(tmpdir) os.remove(zipfilename) except Exception, e: print("Exception! In prediction %d" % r['ID'], str(e)) failed_cases.append(r['ID']) import traceback print(traceback.format_exc()) if tmpdir: shutil.rmtree(tmpdir) total_time_in_secs += t.sum() average_time_taken = float(total_time_in_secs)/float(cases_computed or 1) estimate_remaining_time = number_of_cases_left * average_time_taken t.stop() colortext.printf("**Profile**", 'orange') print(t) colortext.message("Time taken for this case: %0.2fs." % t.sum()) colortext.message("Average time taken per case: %0.2fs." % average_time_taken) colortext.message("Estimated time remaining: %dh%dm%ds." % (int(estimate_remaining_time/3600), int((estimate_remaining_time/60) % 60), estimate_remaining_time % 60)) print("\n")