def addLinsJobs(PredictionSet, ProtocolID):
    '''Queue predictions for the DummySource experiments on the 3K0NB_lin structure.

    The function is disabled: it raises immediately, so none of the code below
    the guard runs until the guard is removed by hand.

    PredictionSet -- prediction set name; shown in the banner and passed to addPrediction.
    ProtocolID -- protocol identifier passed to addPrediction.
    '''
    # Safety guard against queueing these jobs by accident.
    raise colortext.Exception("Do you really want to run this?")

    colortext.printf("\nAdding Lin's mutations to %s prediction set." % PredictionSet, "lightgreen")
    keep_hetatm_lines = False
    FilterTester.openDB()

    # Restrict the experiments to the DummySource set
    source_results = ExperimentResultSet(ddGdb)
    source_filter = ExperimentFilter()
    source_filter.setSource(ExperimentFilter.DummySource)
    source_results.addFilter(source_filter)

    # Narrow those experiments down to the one structure of interest
    structures = StructureResultSet(ddGdb, 'WHERE PDB_ID="3K0NB_lin"')
    source_ids = source_results.getFilteredIDs()
    selected = ExperimentResultSet.fromIDs(ddGdb, source_ids).filterBySet(structures)
    FilterTester.test(selected)

    experiment_ids = sorted(selected.getFilteredIDs())
    colortext.message("\nThe number of unique experiments is %d.\n" % len(experiment_ids))

    connection = db.ddG()
    for position, experiment_id in enumerate(experiment_ids, 1):
        connection.addPrediction(experiment_id, PredictionSet, ProtocolID, keep_hetatm_lines, StoreOutput = True)
        # Emit one progress dot per ten queued predictions
        if position % 10 == 0:
            colortext.write(".")
            colortext.flush()
    print("")
def check_JSON_dataset(dataset_ID):
    '''Sanity-check the wildtype residues of every mutation in a JSON dataset.

    PDB IDs are substituted in the datasets, so this confirms that each
    mutation's wildtype amino acid still matches the residue found at that
    position in the (cached) PDB structure. It is a simple check - passing it
    does not prove that the mutations are correct.

    dataset_ID -- key into the module-level JSON_datasets mapping.

    Raises AssertionError on the first mismatching residue, after printing
    the offending record.
    '''
    records = JSON_datasets[dataset_ID]['data']

    colortext.message('Reading PDB IDs...')
    wanted_pdb_ids = set(entry['PDBFileID'] for entry in records)

    colortext.message('Loading %s PDBs...' % len(wanted_pdb_ids))
    for wanted_id in wanted_pdb_ids:
        if cached_pdbs.get(wanted_id):
            # Already loaded by an earlier call
            continue
        print('Reading %s' % wanted_id)
        colortext.write('.', 'yellow')
        sys.stdout.flush()
        rows = ddGdb.execute_select('SELECT Content FROM PDBFile WHERE ID=%s', parameters=(wanted_id,))
        cached_pdbs[wanted_id] = PDB(rows[0]['Content'])
    print('')

    checked = 0
    for entry in records:
        pdb_id = entry['PDBFileID']
        structure = cached_pdbs[pdb_id]
        for mutation in entry['Mutations']:
            chain_id = mutation['Chain']
            residue_id = mutation['ResidueID']
            residue_aa = mutation['WildTypeAA']
            padded_id = ChainResidueID2String(chain_id, residue_id)
            found_aa = structure.atom_sequences[chain_id][padded_id].ResidueAA
            if found_aa != residue_aa:
                # Report the offending record before the assertion fires
                print(pdb_id, chain_id, residue_id, residue_aa)
                print(found_aa, residue_aa)
            assert(found_aa == residue_aa)
        checked += 1
    print('Successfully checked %d datapoints.' % checked)
def runLizsSet(PredictionSet, ProtocolID):
    '''Queue predictions for every experiment whose source is LizKellogg.

    The function is disabled: it raises immediately, so none of the code below
    the guard runs until the guard is removed by hand.

    PredictionSet -- prediction set name; shown in the banner and passed to addPrediction.
    ProtocolID -- protocol identifier passed to addPrediction.
    '''
    # Safety guard against queueing these jobs by accident.
    raise colortext.Exception("Do you really want to run this?")

    colortext.printf("\nAdding Liz's data set to %s prediction set." % PredictionSet, "lightgreen")
    keep_hetatm_lines = False
    FilterTester.openDB()

    # Restrict the experiments to the LizKellogg set
    selected = ExperimentResultSet(ddGdb)
    source_filter = ExperimentFilter()
    source_filter.setSource(ExperimentFilter.LizKellogg)
    selected.addFilter(source_filter)
    FilterTester.test(selected)

    experiment_ids = sorted(selected.getFilteredIDs())
    colortext.message("\nThe number of unique experiments is %d.\n" % len(experiment_ids))

    connection = db.ddG()
    for position, experiment_id in enumerate(experiment_ids, 1):
        connection.addPrediction(experiment_id, PredictionSet, ProtocolID, keep_hetatm_lines, StoreOutput = True)
        # Emit one progress dot per ten queued predictions
        if position % 10 == 0:
            colortext.write(".")
            colortext.flush()
    print("")
def get_sqlalchemy_schema(self, restrict_to_tables = None):
    '''Print the MySQL definition of each table followed by generated SQLAlchemy class definitions.

    restrict_to_tables -- optional collection of table names; when empty or
                          None every table in self.tables is processed.

    Nothing is returned; all output goes to the terminal.
    '''
    colortext.warning(' *** MySQL schema ***')
    schema = []
    # Maps an import module to the set of type names that fields ask for;
    # field.to_sql_alchemy() is handed this dict for each field.
    typedefs = {'sqlalchemy.types' : set(), 'sqlalchemy.dialects.mysql' : set()}
    for tbl in self.tables:
        if restrict_to_tables and tbl not in restrict_to_tables:
            continue
        colortext.message(tbl)
        # Subscript inside the call so the statement means the same thing
        # whether print is a statement (Python 2) or a function (Python 3).
        print(self.db_interface.execute("SHOW CREATE TABLE %s" % tbl)[0]['Create Table'])
        print('')

        code = [
            "class %s(DeclarativeBase):" % tbl,
            " __tablename__ = '%s'\n" % tbl,
        ]
        for field in self.intermediate_schema[tbl]:
            code.append(' {0}'.format(field.to_sql_alchemy(typedefs)))
        code.append('\n')
        schema.extend(code)

    # Skip modules from which nothing was requested: "from x import " is not valid Python.
    imports = [
        'from %s import %s' % (module, ', '.join(sorted(types)))
        for module, types in sorted(typedefs.items())
        if types
    ]
    schema = imports + [''] + schema

    colortext.warning('*** SQLAlchemy class definitions ***')
    print('\n'.join(schema))
def count_num_residues_in_active_jobs():
    '''I wrote this function to try to narrow down into which jobs ran the longest as I suspect that this is due to
    long PDB chains.

    For every experiment with an active prediction, looks up the (PDB, chain) pair, measures the chain length from the
    stored FASTA record, and prints one row per chain with the number of active jobs using it, sorted by chain length.
    Nothing is returned.
    '''
    ddGdb = ddgdbapi.ddGDatabase()
    active_jobs = ddGdb.execute_select("SELECT DISTINCT ExperimentID FROM Prediction WHERE Status='active'")
    colortext.message("\n%d jobs are active" % len(active_jobs))

    from klab.bio.rcsb import parseFASTAs

    chains_in_active_jobs = {}   # (PDBFileID, Chain) -> [chain length, number of active jobs]
    PDB_chain_lengths = {}       # (PDBFileID, Chain) -> chain length, cached to avoid re-parsing the FASTA
    for active_job in active_jobs:
        # The parameters must be a tuple; the trailing comma matters.
        r = ddGdb.execute_select('SELECT PDBFileID, Chain FROM Experiment INNER JOIN ExperimentChain ON ExperimentID=Experiment.ID WHERE ExperimentID=%s', parameters=(active_job['ExperimentID'],))
        assert(len(r) == 1)
        r = r[0]

        key = (r['PDBFileID'], r['Chain'])
        if PDB_chain_lengths.get(key) is None:
            fasta = ddGdb.execute_select("SELECT FASTA FROM PDBFile WHERE ID=%s", parameters = (r['PDBFileID'],))
            assert(len(fasta) == 1)
            f = parseFASTAs(fasta[0]['FASTA'])
            PDB_chain_lengths[key] = len(f[r['PDBFileID']][r['Chain']])
        chain_length = PDB_chain_lengths[key]

        entry = chains_in_active_jobs.setdefault(key, [chain_length, 0])
        entry[1] += 1

    if chains_in_active_jobs:
        colortext.message("Chains in currently active jobs:\n")
        print("PDB\tChain\tChain SEQRES length\tJobs remaining")
        for k, v in sorted(chains_in_active_jobs.items(), key=lambda x: x[1][0]):
            print("%s\t %s\t%s\t%s" % (k[0], k[1], str(v[0]).center(19), str(v[1]).center(14)))
def match(self, other):
    '''Print, for each PDB in this collection, the PDBs in other whose chains carry identical sequences.

    This only writes to the terminal and returns nothing; it is not meant to
    be a proper API function.

    other -- a mapping shaped like self: PDB ID -> {chain ID -> sequence}.
    '''
    colortext.message("FASTA Match")
    for source_pdb, source_chains in sorted(self.iteritems()):
        pdb_hits = {}     # other PDB ID -> set of our chains that it matched
        chain_hits = {}   # our chain ID -> list of (other PDB ID, other chain ID)
        for source_chain, source_sequence in source_chains.iteritems():
            for target_pdb, target_chains in other.iteritems():
                for target_chain, target_sequence in target_chains.iteritems():
                    if source_sequence != target_sequence:
                        continue
                    pdb_hits.setdefault(target_pdb, set()).add(source_chain)
                    chain_hits.setdefault(source_chain, []).append((target_pdb, target_chain))

        colortext.printf(" %s" % source_pdb, color="silver")

        # A PDB only counts as a match when every one of our chains was matched by it
        full_matches = []
        for target_pdb, covered_chains in pdb_hits.iteritems():
            if covered_chains == set(source_chains.keys()):
                full_matches.append(target_pdb)
                colortext.printf(" PDB %s matched PDB %s on all chains" % (target_pdb, source_pdb), color="white")

        if not full_matches:
            colortext.error(" No matches found.")
            continue

        for source_chain, source_sequence in source_chains.iteritems():
            colortext.printf(" %s" % (source_chain), color = "silver")
            colortext.printf(" %s" % (source_sequence), color = self.unique_sequences[source_sequence])
            descriptions = [
                "%s chain %s" % (hit[0], hit[1])
                for hit in chain_hits[source_chain]
                if hit[0] in full_matches
            ]
            colortext.printf(" Matches: %s" % ", ".join(descriptions))
def main():
    '''Import each registered user's primers.tsv file from the import path stored in the database.

    Per-file problems are collected in a local list and echoed to the terminal
    as warnings; an exception raised while parsing one file does not stop the
    remaining files from being processed.
    '''
    # Set up the database session
    dbi = DatabaseInterface(can_email = True)
    tsession = dbi.get_session()

    # Create a map from usernames to the database IDs (typically initials)
    user_map = {}
    for u in tsession.query(Users):
        user_map[u.lab_username] = u.ID

    colortext.message('\nPrimers import script')
    colortext.pcyan('Database admin contacts: {0}'.format(', '.join(dbi.get_admin_contacts())))
    colortext.warning('Registered users: {0}\n'.format(', '.join(
        ['{0} ({1})'.format(v, k) for k, v in sorted(user_map.items(), key = lambda x: x[1])])))

    # NOTE(review): errors is filled in but not used further in the code visible here.
    errors = []

    # Read the import path from the database
    import_path = tsession.query(DBConstants).filter(DBConstants.Parameter == u'import_path').one().Value
    import_path_folders = sorted([d for d in os.listdir(import_path) if os.path.isdir(os.path.join(import_path, d))])
    for ipf in import_path_folders:
        # Only folders named after a registered user are considered
        if ipf not in user_map:
            continue
        user_id = user_map[ipf]
        primers_file = os.path.join(import_path, ipf, 'primers.tsv')
        if not os.path.exists(primers_file):
            continue

        case_errors = []
        try:
            parse(dbi, primers_file, user_id, case_errors)
            if case_errors:
                errors.append("Errors occurred processing '{0}':\n\t{1}".format(primers_file, '\n\t'.join(case_errors)))
                colortext.warning(errors[-1])
        except Exception as e:
            # Best effort: record the failure and carry on with the next user's file.
            errors.append("Errors occurred processing '{0}': {1}\n\t{2}\n{3}".format(primers_file, str(e), '\n\t'.join(case_errors), traceback.format_exc()))
            colortext.warning('Error: {0}\n{1}'.format(str(e), traceback.format_exc()))
def showAllEligibleProTherm(PredictionSet, ProtocolID, KeepHETATMLines):
    '''Report how many ProTherm experiments pass the eligibility filters, then exit the process.

    The filters applied are: source is ProTherm, exactly one mutation, exactly
    one chain, a maximum standard deviation, a maximum resolution, and the
    X-ray technique.

    PredictionSet -- prediction set name; shown in the banner and used by the
                     (currently unreachable) addPrediction loop.
    ProtocolID, KeepHETATMLines -- only used by the unreachable addPrediction loop.

    NOTE: sys.exit(0) is called after the summary is printed, so this function
    never returns and the loop which queues predictions never runs.
    '''
    #inserter = JobInserter()
    colortext.printf("\nAdding ProTherm mutations to %s prediction set." % PredictionSet, "lightgreen")
    #ddGdb = dbi.ddGDatabase()

    MAX_RESOLUTION = 2.1
    # NOTE(review): MAX_NUMRES_PROTHERM only appears in the summary message below; no filter in this function uses it.
    MAX_NUMRES_PROTHERM = 350
    MAX_STANDARD_DEVIATION = 1.0

    FilterTester.openDB()

    # Disabled alternative: add each filter to the result set individually (kept for timing comparisons).
    if False:
        t1 = time.time()
        er1 = ExperimentResultSet(ddGdb)
        er1.addFilter(ExperimentFilter.OnSource(ExperimentFilter.ProTherm))
        er1.addFilter(ExperimentFilter.NumberOfMutations(1, 1))
        er1.addFilter(ExperimentFilter.NumberOfChains(1, 1))
        er1.addFilter(ExperimentFilter.StandardDeviation(None, MAX_STANDARD_DEVIATION))
        er1.addFilter(StructureFilter.Resolution(None, MAX_RESOLUTION))
        er1.addFilter(StructureFilter.Techniques(StructureFilter.XRay))
        FilterTester.test(er1)
        t2 = time.time()
        print(t2 - t1)

    # This method usually takes around 65% of the time as the method above
    t1 = time.time()
    ef1 = ExperimentFilter()
    ef1.setSource(ExperimentFilter.ProTherm)
    # First a result set filtered on the source alone...
    er1 = ExperimentResultSet(ddGdb)
    er1.addFilter(ExperimentFilter.OnSource(ExperimentFilter.ProTherm))
    FilterTester.test(er1)
    ef1.setNumberOfMutations(1, 1)
    ef1.setNumberOfChains(1, 1)
    ef1.setStandardDeviation(None, MAX_STANDARD_DEVIATION)
    sf1 = StructureFilter()
    sf1.setResolution(None, MAX_RESOLUTION)
    sf1.setTechniques(StructureFilter.XRay)
    # ...then a fresh result set with the combined experiment and structure filters.
    er1 = ExperimentResultSet(ddGdb)
    er1.addFilter(ef1)
    er1.addFilter(sf1)
    FilterTester.test(er1)
    t2 = time.time()
    print(t2 - t1)

    experimentIDs = sorted(list(er1.getFilteredIDs()))
    colortext.message("\nThe number of unique ProTherm experiments with:\n\t- one mutation;\n\t- structures solved by X-ray diffraction and with <= %d residues;\n\t- a maximum standard deviation in experimental results of <= %0.2f;\n\t- and a resolution of <= %0.2f Angstroms.\nis %d.\n" % (MAX_NUMRES_PROTHERM, MAX_STANDARD_DEVIATION, MAX_RESOLUTION, len(experimentIDs)))
    ddG_connection = db.ddG()
    count = 0

    # Stop here: everything below is unreachable while this exit is in place.
    sys.exit(0)
    print("")
    for experimentID in experimentIDs:
        ddG_connection.addPrediction(experimentID, PredictionSet, ProtocolID, KeepHETATMLines, StoreOutput = True)
        count += 1
        # One progress dot per ten queued predictions
        if count >= 10:
            colortext.write(".")
            colortext.flush()
            count = 0
    print("")
def updateBirthdays(self, bdays):
    '''Add the birthdays in bdays which are not yet on the "main" calendar.

    The function is disabled: it raises immediately, so the code below the
    guard does not run until the guard is removed by hand.

    bdays -- mapping from a date-like object (with year, month, day) to a
             dict with "title" and "location" entries.
    '''
    # Guard: this code needs updating before it may be run again.
    raise Exception('update')

    existing_events = self.getEventsTable("main")
    for dt, details in sorted(bdays.iteritems()):
        title = details["title"]
        lookup_key = (datetime(dt.year, dt.month, dt.day), title)
        # Skip birthdays which are already on the calendar under the same title
        if existing_events.get(lookup_key) and str(existing_events[lookup_key]["title"]) == str(title):
            continue
        colortext.message("adding " + title)
        self.addBirthday(dt, title, details["location"])
def plot(self, table_name, RFunction, output_filename = None, filetype = "pdf"):
    '''Run an R plotting function over one analysis table.

    table_name -- key into self.analysis_tables.
    RFunction -- callable invoked as RFunction(input_csv, output_file, filetype);
                 its output is parsed with RUtilities.parse_R_output.
    output_filename -- where to save the graph. When omitted, a temporary file
                       is used, read back into an AnalysisObject, and deleted.
    filetype -- graph file type passed to RFunction.

    Raises Exception when there are no analysis tables, when table_name is
    unknown, when the table holds exactly one point, or (wrapping the
    original error) when anything fails while running RFunction.

    NOTE(review): the plot object and the parsed R values are computed but not
    returned in the code visible here.
    '''
    if (not self.analysis_tables) or (not table_name):
        raise Exception("There are no analysis tables to plot.")
    if table_name not in self.analysis_tables:
        raise Exception("The analysis table '%s' does not exist." % table_name)

    R_return_values = {}
    gplot = None
    analysis_table = self.analysis_tables[table_name]

    if self.quiet_level >= 3:
        print(table_name)
        print(RFunction)

    if len(analysis_table.points) == 1:
        raise Exception("The analysis table %s set only has one data point. At least two points are required." % table_name)

    inputfname = self.CreateCSVFile(table_name)
    if self.quiet_level >= 3:
        print(inputfname)
    try:
        if self.quiet_level >= 2:
            colortext.printf("Running %s." % RFunction)
            if output_filename:
                colortext.printf("Saving graph as %s with filename %s." % (filetype, output_filename))

        # Fall back to a temporary file when the caller did not ask for a saved graph
        output_fname = output_filename
        if not output_fname:
            output_fname = rosettahelper.writeTempFile(".", "")

        R_output = RFunction(inputfname, output_fname, filetype)
        R_return_values = RUtilities.parse_R_output(R_output)

        colortext.message(table_name)
        print(" %s" % str(RFunction))
        for k, v in sorted(R_return_values.items()):
            print(" %s: %s" % (str(k), str(v)))

        if not output_filename:
            contents = rosettahelper.readBinaryFile(output_fname)
            delete_file(output_fname)
            # Look up the human-readable description registered for this R function
            description = None
            for file_suffix, details in RFunctions.items():
                if details[1] == RFunction:
                    description = details[0]
            assert(description)
            gplot = AnalysisObject(table_name, description, filetype, contents)
        else:
            gplot = output_filename
    except Exception as e:
        import traceback
        colortext.error(traceback.format_exc())
        raise Exception(e)
    finally:
        # The intermediate CSV file is removed whether or not the run succeeded
        delete_file(inputfname)
def fix_1AYE_InputFiles(prediction_set):
    '''This is a once-off function which should only be run once per prediction set as each run changes the mutfile and
    this change should only occur once.

    For every 1AYE prediction in prediction_set, the residue number (second token) of each mutation line in the stored
    mutfile is decremented by one. The function refuses to proceed while any of those predictions is active or queued.

    NOTE(review): the re-pickled input files are built but not written back to the database in the code visible here.
    '''
    import pickle

    ddGdb = ddgdbapi.ddGDatabase()
    records = ddGdb.execute_select('''
        SELECT Prediction.ID AS PredictionID, Status
        FROM Prediction
        INNER JOIN UserDataSetExperiment ON UserDataSetExperiment.ID=Prediction.UserDataSetExperimentID
        WHERE PredictionSet=%s AND PDBFileID='1AYE' ''', parameters=(prediction_set,))
    BadPredictions = sorted(set((r['PredictionID'], r['Status']) for r in records))
    BadPredictionIDs = sorted(set(r[0] for r in BadPredictions))
    print(BadPredictions)

    active_ids = [str(r[0]) for r in BadPredictions if r[1] == 'active']
    queued_ids = [str(r[0]) for r in BadPredictions if r[1] == 'queued']
    num_active = len(active_ids)
    num_queued = len(queued_ids)
    if active_ids or queued_ids:
        colortext.error("Cannot proceed - there are %d active jobs and %d queued in the list that need to be fixed up. Stop the DDG scheduler, remove the queued constraint, and rerun this function. " % (num_active, num_queued))
        if num_active:
            print("%d active jobs: %s" % (num_active, ", ".join(active_ids)))
        if num_queued:
            print("%d queued jobs: %s" % (num_queued, ", ".join(queued_ids)))
        return

    for PredictionID in BadPredictionIDs:
        r = ddGdb.execute_select("SELECT InputFiles FROM Prediction WHERE ID=%s", parameters=(PredictionID,))
        assert(len(r) == 1)
        r = r[0]

        # NOTE: pickle.loads is only safe here because the blob comes from our own database.
        InputFiles = pickle.loads(r['InputFiles'])
        # Compare as a list: on Python 3 dict.keys() is a view which never equals a list.
        assert(list(InputFiles.keys()) == ['MUTFILE'])
        mutfile = InputFiles['MUTFILE']

        colortext.message("\n%d" % PredictionID)
        colortext.warning('original')
        print(mutfile)

        # Expected layout: "total N", then "N", then N mutation lines
        lines = mutfile.split("\n")
        assert(lines[0].startswith('total'))
        num_muts = int(lines[0][5:])
        assert(lines[1] == str(num_muts))
        for x in range(2, num_muts + 2):
            tokens = lines[x].split()
            # Shift the residue number down by one
            tokens[1] = str(int(tokens[1]) - 1)
            lines[x] = " ".join(tokens)
        new_mutfile = "\n".join(lines)

        colortext.warning('fixed')
        print(new_mutfile)
        p = pickle.dumps({'MUTFILE' : new_mutfile})
def test_sequences(b, sequences):
    '''Run b.by_sequence over each sequence and print the hits.

    b -- object exposing by_sequence(sequence), which returns a collection of strings.
    sequences -- sized iterable of sequences to look up.

    A lookup which raises is reported as FAILED and recorded; the remaining
    sequences are still processed.

    NOTE(review): failed_cases is collected but neither printed nor returned
    in the code visible here.
    '''
    failed_cases = []
    for c, sequence in enumerate(sequences, 1):
        try:
            colortext.message('\n{0}/{1}: {2}'.format(c, len(sequences), sequence))
            hits = b.by_sequence(sequence)
            if hits:
                colortext.warning('{0} hits: {1}'.format(len(hits), ','.join(hits)))
            else:
                colortext.warning('No hits')
        except Exception as e:
            colortext.error('FAILED')
            failed_cases.append((sequence, str(e), traceback.format_exc()))
def updateEvents(self, calendar_id, newEvents):
    '''Synchronize a calendar with newEvents.

    calendar_id -- calendar whose events table is read and modified.
    newEvents -- mapping keyed like self.getEventsTable(calendar_id), i.e. by a
                 (start date, title) pair, to dicts with "enddate", "location"
                 and "title" entries.

    An event is considered unchanged when both sides share the key, the end
    date and the title, and the calendar's location starts with the new
    location. Calendar events with no unchanged counterpart are removed
    (except those whose title contains "birthday"); new events with no
    unchanged counterpart are added.
    '''
    currentEvents = self.getEventsTable(calendar_id)

    # Events to remove: on the calendar but not (unchanged) in newEvents
    toRemove = []
    for startdateTitle, event in sorted(currentEvents.items()):
        if "birthday" in event["title"]:
            # Don't remove birthdays
            continue
        newEvent = newEvents.get(startdateTitle)
        if (newEvent
                and newEvent["enddate"] == event["enddate"]
                and event["location"].startswith(newEvent["location"])
                and str(newEvent["title"]) == str(event["title"])):
            # Don't remove events which are in both newEvents and the calendar
            continue
        toRemove.append(startdateTitle)

    # Events to add: in newEvents but not (unchanged) on the calendar
    toAdd = []
    for startdateTitle, event in sorted(newEvents.items()):
        currentEvent = currentEvents.get(startdateTitle)
        if (currentEvent
                and currentEvent["enddate"] == event["enddate"]
                and currentEvent["location"].startswith(event["location"])
                and str(currentEvent["title"]) == str(event["title"])):
            # Don't add events which are in both newEvents and the calendar
            continue
        toAdd.append(startdateTitle)

    if toRemove:
        colortext.error("Removing these %d events:" % len(toRemove))
        for dtTitle in toRemove:
            colortext.warning(dtTitle)
            self.removeEvent(calendar_id, currentEvents[dtTitle]["event"].id)

    if toAdd:
        colortext.message("Adding these %d events:" % len(toAdd))
        for dtTitle in toAdd:
            newEvent = newEvents[dtTitle]
            # The first element of the key is the start date
            self.addNewEvent(calendar_id, dtTitle[0], newEvent["enddate"], newEvent["location"], newEvent["title"])
def test_pdb_files(b, pdb_ids):
    '''Run b.by_pdb over each PDB ID and print the hits.

    b -- object exposing by_pdb(pdb_id), which returns a collection of strings.
    pdb_ids -- sized iterable of PDB IDs to look up.

    A lookup which raises is reported as FAILED and recorded; the remaining
    PDB IDs are still processed.

    NOTE(review): failed_cases is collected but neither printed nor returned
    in the code visible here.
    '''
    failed_cases = []
    for c, pdb_id in enumerate(pdb_ids, 1):
        try:
            colortext.message('\n{0}/{1}: {2}'.format(c, len(pdb_ids), pdb_id))
            hits = b.by_pdb(pdb_id)
            if hits:
                colortext.warning('{0} hits: {1}'.format(len(hits), ','.join(hits)))
            else:
                colortext.warning('No hits')
        except Exception as e:
            colortext.error('FAILED')
            failed_cases.append((pdb_id, str(e), traceback.format_exc()))
def CreateAnalysisTables(self):
    '''Build one AnalysisTable per analysis set and store them in self.analysis_tables.

    Each table pairs the experimental DDG of a user data set record with the
    predicted DDG for the same (ExperimentID, PDB_ID). Records without a
    prediction are counted and, when self.quiet_level >= 1, reported both in
    self.description and on the terminal.
    '''
    database = self.ddGdb
    predictions = PredictionScores(database, self.PredictionSet, self.ddG_score_type, score_cap = self.score_cap)
    predicted_scores = predictions.Predictions
    set_names = join(predictions.AnalysisSets, "', '")

    # Summary for the report; underscores in the prediction set name are backslash-escaped
    summary = "Analyzing %d predictions in PredictionSet '%s' for UserDataSet '%s'. " % (predictions.NumberOfPredictions, predictions.PredictionSet.replace("_", "\_"), predictions.UserDataSetName)
    if self.score_cap:
        summary += "Running analysis over the following analysis sets: '%s' with predicted scores capped at +-%0.2f." % (set_names, self.score_cap)
    else:
        summary += "Running analysis over the following analysis sets: '%s'." % (set_names)
    self.description.append(("black", summary))

    if self.quiet_level >= 1:
        colortext.message("Analyzing %d predictions in PredictionSet '%s' for UserDataSet '%s'." % (predictions.NumberOfPredictions, predictions.PredictionSet, predictions.UserDataSetName))
        colortext.message("Running analysis over the following analysis sets: '%s'." % (set_names))

    tables = {}
    for AnalysisSet in predictions.AnalysisSets:
        table = AnalysisTable()
        experiments = UserDataSetExperimentalScores(database, predictions.UserDataSetID, AnalysisSet)
        total_records = 0
        missing_records = 0
        for section, sectiondata in sorted(experiments.iteritems()):
            for recordnumber, record_data in sorted(sectiondata.iteritems()):
                total_records += 1
                PDB_ID = record_data["PDB_ID"]
                ExperimentID = record_data["ExperimentID"]
                ExperimentalDDG = record_data["ExperimentalDDG"]
                if not (predicted_scores.get(ExperimentID) and predicted_scores[ExperimentID].get(PDB_ID)):
                    # No prediction exists for this record
                    missing_records += 1
                    continue
                PredictedDDG = predicted_scores[ExperimentID][PDB_ID]["PredictedDDG"]
                table.add(AnalysisPoint(ExperimentalDDG, PredictedDDG, ExperimentID = ExperimentID, PDB_ID = PDB_ID, section = section, recordnumber = recordnumber))

        if missing_records > 0 and self.quiet_level >= 1:
            warning = "Missing %d predictions out of %d records for analysis set %s." % (missing_records, total_records, AnalysisSet)
            self.description.append(("Bittersweet", warning))
            colortext.warning(warning)
        tables[AnalysisSet] = table

    self.analysis_tables = tables
def print_existing_experimental_data():
    '''Print the stored complex details and the requested mutation records for three PDB files.

    These PDB files existed in the database before the import so I am interested to see whether any of the experimental
    data matches the requested predictions.

    Raises AssertionError if a PDB ID maps to more than one complex, or if experimental mutagenesis records are found
    for the three hard-coded complex IDs.
    '''
    print('')
    ppi_api = get_ppi_api()
    for pdb_id in ['1A2K', '1K5D', '1I2M']:
        colortext.message(pdb_id)
        complex_ids = ppi_api.search_complexes_by_pdb_id(pdb_id)
        if complex_ids:
            assert(len(complex_ids) == 1)
            complex_id = complex_ids[0]
            colortext.warning('Complex #{0}'.format(complex_id))
            pprint.pprint(ppi_api.get_complex_details(complex_id))

        # Rows of the requested-mutations table whose 'pdb' value contains this PDB ID
        mutation_records = mutations_dataframe[mutations_dataframe['pdb'].str.contains(pdb_id)]
        # Lift the display limits so that the whole frame is printed
        with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
            print(mutation_records)

    # There is no experimental binding affinity data at present
    assert(not(ppi_api.DDG_db.execute_select('SELECT * FROM PPMutagenesisPDBMutation WHERE PPComplexID IN (202, 119, 176) ORDER BY PPComplexID, Chain, ResidueID, MutantAA')))
def remove_all_cancelled_events(self, calendar_ids = []):
    '''Report the cancelled events of each calendar.

    calendar_ids -- calendars to inspect; when empty, self.calendar_ids is used.

    NOTE: despite the name, the code below only prints. For a cancelled event
    with a recurringEventId it prints up to ten occurrences between 2010 and
    2015; other cancelled events are printed as-is. Nothing is deleted.
    '''
    for calendar_id in calendar_ids or self.calendar_ids:
        colortext.message('Removing cancelled events in %s' % calendar_id)
        listing = self.service.events().list(calendarId = self.configured_calendar_ids[calendar_id]).execute()
        print(len(listing['items']))
        for raw_event in listing['items']:
            bunch = DeepNonStrictNestedBunch(raw_event)
            if not (bunch.status == 'cancelled'):
                continue
            if not bunch.recurringEventId:
                colortext.warning(bunch)
                continue

            colortext.warning(bunch.recurringEventId)
            # Retrieve all occurrences of the recurring event within the timeframe
            window_start = datetime(year=2010, month=1, day=1, tzinfo=self.timezone).isoformat()
            window_end = datetime(year=2015, month=1, day=1, tzinfo=self.timezone).isoformat()
            for occurrence in self.get_recurring_events(calendar_id, bunch.id, window_start, window_end, maxResults = 10):
                print(occurrence)
def add_bidet(self):
    '''Attach a cake icon gadget to every Birthday event on the notices calendar.

    The function is disabled: it raises immediately, so the code below the
    guard does not run until the guard is removed by hand.
    '''
    # Guard: this code needs updating before it may be run again.
    raise Exception('update')

    main_calendar = GoogleCalendar.from_file('/admin/calendars.json', ['main'])
    notices_calendar = GoogleCalendar.from_file('/admin/calendars.json', ['notices'])
    timezone = main_calendar.timezone
    event_ids = set()
    seen_notices = set()
    for year in range(2014, 2017):
        colortext.message('\n\nTagging events in %d:\n' % year)

        # Query window: from January 1st of this year, 730 days (plus one when year % 4 == 0) minus one second
        extra_days = 1 if year % 4 == 0 else 0
        window_start = datetime(year=year, month=1, day=1, hour=0, minute=0, second=0, tzinfo=timezone)
        window_end = window_start + timedelta(days = 730 + extra_days, seconds = -1)
        start_time, end_time = window_start.isoformat(), window_end.isoformat()

        notices = notices_calendar.get_events(start_time, end_time, ignore_cancelled = True, get_recurring_events_as_instances = False)
        for n in notices:
            # The yearly windows overlap, so each notice is handled only once
            if n.id in seen_notices:
                continue
            seen_notices.add(n.id)

            if not (n.extendedProperties.shared and n.extendedProperties.shared.event_type):
                continue
            event_type = n.extendedProperties.shared['event_type']
            if event_type != 'Birthday':
                continue

            print(n.summary, n.id)
            print(n.start)
            notices_id = main_calendar.configured_calendar_ids["notices"]
            event_body = main_calendar.service.events().get(calendarId = notices_id, eventId=n.id).execute()
            event_body['gadget'] = {
                'display' : 'icon',
                'iconLink' : 'https://guybrush.ucsf.edu/images/cake.png',
                'title' : n.summary,
            }
            created_event = main_calendar.service.events().update(calendarId = main_calendar.configured_calendar_ids["notices"], eventId = n.id, body = event_body).execute()
def addNewEvent(self, calendar_id, startdate, enddate, location, title):
    '''Insert an event into a calendar, retrying on failure.

    calendar_id -- key into self.configured_calendar_ids.
    startdate, enddate -- objects with isoformat(), e.g. datetimes.
    location -- short location; a fixed address suffix is appended to it.
    title -- used as both the summary and the description of the event.

    The insert is attempted up to three times, two seconds apart. Failures are
    logged, not raised: after the last failed attempt the function returns
    without having added the event.
    '''
    colortext.message("\nAdding %s on %s at %s" % (title, startdate, location))
    start_time = startdate.isoformat()
    end_time = enddate.isoformat()

    # Expand the short location into a full address
    loc = location
    if loc.startswith("Tahoe"):
        loc = "%s, 10 minutes outside Truckee, CA @ 39.328455,-120.184078" % loc
    else:
        if location.startswith("BH "):
            loc = "%s, Byers Hall" % loc
        # NOTE(review): the "removeEvent/" prefix in this address looks like a search-and-replace accident - confirm.
        loc = "%s, removeEvent/Mission Bay, San Francisco, CA @ 37.767952,-122.392214" % loc

    event_body = {
        "start" : {
            "timeZone" : self.timezone_string,
            "dateTime" : start_time,
        },
        "end" : {
            "timeZone" : self.timezone_string,
            "dateTime" : end_time,
        },
        "location" : loc,
        "summary" : title,
        "description" : title,
    }

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            self.service.events().insert(
                calendarId = self.configured_calendar_ids[calendar_id],
                body = event_body).execute()
            break
        except Exception as e:
            colortext.error("An error occurred:")
            colortext.error(traceback.format_exc())
            colortext.error(e)
            # Only announce and wait for a retry when one is actually coming
            if attempt < max_attempts:
                colortext.error("Trying again.")
                time.sleep(2)
def test_pdbml_speed():
    '''Time the minidom-based and SAX-based PDBML parsers on a few PDB IDs and compare their results.

    For each test case both parsers are timed, and their
    atom_to_seqres_sequence_maps are compared chain by chain via str().
    The outcome is printed; nothing is returned or raised.
    '''
    # time.clock was removed in Python 3.8; prefer perf_counter where it exists.
    timer = getattr(time, 'perf_counter', None) or time.clock

    test_cases = [
        '1WSY',
        '1YGV',
        '487D',
        '1HIO',
        '1H38',
        '3ZKB',
    ]
    for test_case in test_cases:
        print("\n")

        colortext.message("Creating PDBML object for %s" % test_case)
        print("")

        colortext.printf("Using the old minidom class", color = 'cyan')
        t1 = timer()
        p_minidom = PDBML_slow.retrieve(test_case, cache_dir = cache_dir)
        t2 = timer()
        colortext.message("Done in %0.2fs!" % (t2 - t1))
        print("")

        colortext.printf("Using the new sax class", color = 'cyan')
        t1 = timer()
        p_sax = PDBML.retrieve(test_case, cache_dir = cache_dir)
        t2 = timer()
        colortext.message("Done in %0.2fs!" % (t2 - t1))

        colortext.write("\nEquality test: ", color = 'cyan')
        try:
            minidom_maps = p_minidom.atom_to_seqres_sequence_maps
            sax_maps = p_sax.atom_to_seqres_sequence_maps
            # Compare the chain IDs as sets so that dict ordering cannot cause a spurious failure
            same = set(minidom_maps.keys()) == set(sax_maps.keys()) and all(
                str(s_1) == str(sax_maps[c]) for c, s_1 in minidom_maps.items())
        except Exception:
            # Any error while comparing counts as a failed comparison
            same = False
        if same:
            colortext.message("passed\n")
        else:
            colortext.error("failed\n")
def determine_structure_scores(DDG_api, skip_if_we_have_pairs = 50):
    '''Copy prediction scores into the PredictionStructureScore table for every completed prediction set.

    For each prediction, one 'DDG' row is inserted per recognised score method (StructureID -1), followed by one
    'WildType' and one 'Mutant' row per structure returned by DDG_api.get_ddg_monomer_scores_per_structure. Rows are
    inserted with insertDictIfNew.

    DDG_api -- API object providing ddGDB, get_ddg_monomer_scores_per_structure and determine_best_pair.
    skip_if_we_have_pairs -- when not None, the per-structure step is skipped for predictions which already have
                             exactly this many WildType rows and the same number of Mutant rows.

    Raises Exception/AssertionError when a prediction checked in pass #1 lacks the expected scores, and
    colortext.Exception when a prediction's Scores field cannot be read as JSON.
    '''
    # NOTE(review): pp is not used in this function.
    pp = pprint.PrettyPrinter(indent=4)

    ddGdb = DDG_api.ddGDB
    ddGdb_utf = ddgdbapi.ddGDatabase(use_utf = True)

    # Get the list of completed prediction set
    completed_prediction_sets = get_completed_prediction_sets(DDG_api)
    print(completed_prediction_sets)

    # Create the mapping from the old score types to the ScoreMethod record IDs
    ScoreMethodMap = {}
    results = ddGdb_utf.execute('SELECT * FROM ScoreMethod')
    for r in results:
        if r['MethodName'] == 'Global' and r['MethodType'] == 'Protocol 16':
            ScoreMethodMap[("kellogg", "total")] = r['ID']
        if r['Authors'] == 'Noah Ollikainen':
            if r['MethodName'] == 'Local' and r['MethodType'] == 'Position' and r['Parameters'] == u'8Å radius':
                ScoreMethodMap[("noah_8,0A", "positional")] = r['ID']
            if r['MethodName'] == 'Local' and r['MethodType'] == 'Position (2-body)' and r['Parameters'] == u'8Å radius':
                ScoreMethodMap[("noah_8,0A", "positional_twoscore")] = r['ID']
            if r['MethodName'] == 'Global' and r['MethodType'] == 'By residue' and r['Parameters'] == u'8Å radius':
                ScoreMethodMap[("noah_8,0A", "total")] = r['ID']

    # For each completed prediction set, determine the structure scores
    for prediction_set in completed_prediction_sets:
        #if prediction_set not in ['Ubiquitin scan: UQ_con_yeast p16']:
        #    continue

        predictions = ddGdb.execute('SELECT ID, ddG, Scores, status, ScoreVersion FROM Prediction WHERE PredictionSet=%s ORDER BY ID', parameters=(prediction_set,))
        num_predictions = len(predictions)

        # Pass #1: Iterate over all Predictions and make sure that they gave completed and contain all the scores we expect
        colortext.message('Prediction set: %s' % prediction_set)
        colortext.warning('Checking that all data exists...')
        for prediction in predictions:
            #assert(prediction['status'] == 'done')
            PredictionID = prediction['ID']
            # NOTE(review): this filter restricts the checks in pass #1 to prediction 72856 only - looks like a debugging leftover.
            if PredictionID != 72856:
                continue
            # NOTE: pickle.loads is only safe here because the blob comes from our own database.
            global_scores = pickle.loads(prediction['ddG'])
            assert(global_scores)
            assert(prediction['ScoreVersion'] == 0.23)
            if not prediction['Scores']:
                raise Exception("This prediction needs to be scored with Noah's method.")
            gs2 = json.loads(prediction['Scores'])
            # At least one key of the JSON scores must mention 'noah'
            if True not in set([k.find('noah') != -1 for k in gs2['data'].keys()]):
                raise Exception("This prediction needs to be scored with Noah's method.")
            # The JSON scores must agree with the pickled scores on the kellogg data
            assert (gs2['data']['kellogg'] == global_scores['data']['kellogg'])

        # Pass #2: Iterate over all completed Predictions with null StructureScores.
        # For each Prediction, determine and store the structure scores
        count = 0
        for prediction in predictions:
            count += 1
            PredictionID = prediction['ID']
            colortext.message('%s: %d of %d (Prediction #%d)' % (prediction_set, count, num_predictions, PredictionID))

            # NOTE(review): the formatting of the disabled debugging filter below was ambiguous in the source; the
            # continue is treated as part of the commented-out filter - confirm.
            #if PredictionID != 72856:
            #if PredictionID < 73045: continue
            if prediction['status'] == 'failed':
                colortext.error('Skipping failed prediction %d.' % PredictionID)
                continue
            if prediction['status'] == 'queued':
                colortext.warning('Skipping queued prediction %d.' % PredictionID)
                continue
            if prediction['status'] == 'postponed':
                colortext.printf('Skipping postponed prediction %d.' % PredictionID, 'cyan')
                continue

            # Store the ensemble scores
            try:
                global_scores = json.loads(prediction['Scores'])['data']
            except:
                raise colortext.Exception("Failed reading the Scores field's JSON object. The Prediction Status is %(status)s. The Scores field is: '%(Scores)s'." % prediction)
            for score_type, inner_data in global_scores.iteritems():
                for inner_score_type, data in inner_data.iteritems():
                    # Only the kellogg total carries per-component scores
                    components = {}
                    if score_type == 'kellogg' and inner_score_type == 'total':
                        components = data['components']
                        ddG = data['ddG']
                    elif score_type == 'noah_8,0A' and inner_score_type == 'positional':
                        ddG = data['ddG']
                    elif score_type == 'noah_8,0A' and inner_score_type == 'positional_twoscore':
                        ddG = data['ddG']
                    elif score_type == 'noah_8,0A' and inner_score_type == 'total':
                        ddG = data['ddG']
                    else:
                        # Unrecognised score types are skipped; the raise below is unreachable.
                        continue
                        raise Exception('Unhandled score types: "%s", "%s".' % (score_type, inner_score_type))

                    ScoreMethodID = ScoreMethodMap[(score_type, inner_score_type)]
                    new_record = dict(
                        PredictionID = PredictionID,
                        ScoreMethodID = ScoreMethodID,
                        ScoreType = 'DDG',
                        StructureID = -1, # This score is for the Prediction rather than a structure
                        DDG = ddG,
                    )
                    # The component names must not clash with the fixed fields of the record
                    assert(not(set(components.keys()).intersection(set(new_record.keys()))))
                    new_record.update(components)
                    ddGdb.insertDictIfNew('PredictionStructureScore', new_record, ['PredictionID', 'ScoreMethodID', 'ScoreType', 'StructureID'])

            if skip_if_we_have_pairs != None:
                # Skip this case if we have a certain number of existing records (much quicker since we do not have to extract the binary)
                num_wt = ddGdb.execute_select("SELECT COUNT(ID) AS NumRecords FROM PredictionStructureScore WHERE PredictionID=%s AND ScoreType='WildType'", parameters=(PredictionID,))[0]['NumRecords']
                num_mut = ddGdb.execute_select("SELECT COUNT(ID) AS NumRecords FROM PredictionStructureScore WHERE PredictionID=%s AND ScoreType='Mutant'", parameters=(PredictionID,))[0]['NumRecords']
                print(num_wt, num_mut)
                if num_wt == num_mut and num_mut == skip_if_we_have_pairs:
                    continue

            # Store the ddg_monomer scores for each structure
            grouped_scores = DDG_api.get_ddg_monomer_scores_per_structure(PredictionID)
            for structure_id, wt_scores in sorted(grouped_scores['WildType'].iteritems()):
                new_record = dict(
                    PredictionID = PredictionID,
                    ScoreMethodID = ScoreMethodMap[("kellogg", "total")],
                    ScoreType = 'WildType',
                    StructureID = structure_id,
                    DDG = None,
                )
                new_record.update(wt_scores)
                ddGdb.insertDictIfNew('PredictionStructureScore', new_record, ['PredictionID', 'ScoreMethodID', 'ScoreType', 'StructureID'])
            # The loop variable is still called wt_scores here although it holds the mutant scores
            for structure_id, wt_scores in sorted(grouped_scores['Mutant'].iteritems()):
                new_record = dict(
                    PredictionID = PredictionID,
                    ScoreMethodID = ScoreMethodMap[("kellogg", "total")],
                    ScoreType = 'Mutant',
                    StructureID = structure_id,
                    DDG = None,
                )
                new_record.update(wt_scores)
                ddGdb.insertDictIfNew('PredictionStructureScore', new_record, ['PredictionID', 'ScoreMethodID', 'ScoreType', 'StructureID'])

            # Test to make sure that we can pick a best pair of structures (for generating a PyMOL session)
            assert(DDG_api.determine_best_pair(PredictionID) != None)
def _create_input_files(self):
    '''Write the PDB contents of the scaffold, the model, and (when set) the crystal structure to disk.

    Each file is written to the path returned by self._filepath for its fixed filename. The crystal
    structure is optional and is only written when self.Crystal is truthy.
    '''
    colortext.message('self.outdir: ' + self.outdir)

    # The scaffold and the model are always written, in this order.
    for pdb_filename, attribute_name in (('scaffold.pdb', 'Scaffold'), ('model.pdb', 'Model')):
        write_file(self._filepath(pdb_filename), getattr(self, attribute_name).pdb_contents)

    if not self.Crystal:
        return
    write_file(self._filepath('crystal.pdb'), self.Crystal.pdb_contents)
rosetta_scripts_binary = sorted(release_binaries)[0] elif other_binaries: rosetta_scripts_binary = sorted(other_binaries)[0] if not rosetta_scripts_binary: raise colortext.Exception( 'No RosettaScripts binary could be located in {0}.'.format( rosetta_binary_path)) rosetta_scripts_binary = rosetta_scripts_binary[1] for c in cases: pruned_structure_directory = c[0] output_directory = c[1] # Iterate through the dataset cases colortext.message( 'Adding loop residues back to the pruned structures in {0}.'. format(pruned_structure_directory)) file_filter = os.path.join(pruned_structure_directory, '*.pdb') for pdb_file in sorted(glob.glob(file_filter)): pdb_prefix = os.path.splitext( os.path.split(pdb_file)[1])[0].lower() file_prefix = os.path.splitext(pdb_file)[0] fasta_file = file_prefix + '.fasta' loop_file = file_prefix + '.loop.json' assert (os.path.exists(fasta_file)) assert (os.path.exists(loop_file)) # Convert the FASTA headers back into PDB residue IDs fasta_contents = read_file(fasta_file) headers = [ l for l in fasta_contents.split('\n') if l.startswith('>')
pdb_chain_to_pfam_mapping[pdb_id][chain_id].add(pfam_acc) pfam_to_pdb_chain_mapping[pfam_acc] = pfam_to_pdb_chain_mapping.get(pfam_acc, set()) pfam_to_pdb_chain_mapping[pfam_acc].add(pdb_key) self.pdb_chain_to_pfam_mapping = pdb_chain_to_pfam_mapping self.pfam_to_pdb_chain_mapping = pfam_to_pdb_chain_mapping def get_pfam_accession_numbers_from_pdb_id(self, pdb_id): '''Note: an alternative is to use the RCSB API e.g. http://www.rcsb.org/pdb/rest/hmmer?structureId=1cdg.''' pdb_id = pdb_id.lower() if self.pdb_chain_to_pfam_mapping.get(pdb_id): return self.pdb_chain_to_pfam_mapping[pdb_id].copy() def get_pfam_accession_numbers_from_pdb_chain(self, pdb_id, chain): '''Note: an alternative is to use the RCSB API e.g. http://www.rcsb.org/pdb/rest/hmmer?structureId=1cdg.''' return self.pdb_chain_to_pfam_mapping.get(pdb_id.lower(), {}).get(chain) def get_pdb_chains_from_pfam_accession_number(self, pfam_acc): return self.pfam_to_pdb_chain_mapping.get(pfam_acc) if __name__ == '__main__': pfam_api = Pfam() colortext.warning(pfam_api.get_pfam_accession_numbers_from_pdb_chain('1TVA', 'A')) colortext.warning(pfam_api.get_pfam_accession_numbers_from_pdb_chain('1CDG', 'A')) colortext.warning(pfam_api.get_pfam_accession_numbers_from_pdb_id('1A2c')) colortext.message(pfam_api.get_pdb_chains_from_pfam_accession_number('PF14716'))
if __name__ == '__main__': import pprint gc = GoogleCalendar.from_file('test.json', ['main', 'rosetta_dev', 'regular_meetings', 'vacations']) tests = ['events'] #'admin' # acl if 'acl' in tests: gc.get_calendar_users('main') # calendarList if 'calendarList' in tests: gc.get_calendars() v = gc.get_calendar('vacations') colortext.message('Description: %s' % v.description) colortext.warning('Role: %s' % v.accessRole) colortext.warning('Time zone: %s' % v.timeZone) # colors if 'colors' in tests: gc.get_colors() # events if 'events' in tests: for evnt in gc.get_upcoming_events_within_the_current_month(): pass #print(evnt.datetime_o, evnt.description, evnt.location) colortext.warning('***') for evnt in gc.get_events_within_a_given_month(2014, 12):
def CreateAnalysisTables(self):
    '''Build one AnalysisTable per analysis set and store the tables in self.analysis_tables.

    For every analysis set of the prediction set, each experimental record is paired with the
    predicted score stored under the same ExperimentID and PDB_ID. Records without a predicted score
    are counted as missing. A summary of the run is appended to self.description, and progress
    messages are written with colortext when self.quiet_level >= 1.
    '''
    database = self.ddGdb
    predictions = PredictionScores(database, self.PredictionSet, self.ddG_score_type, score_cap=self.score_cap)
    predicted_scores = predictions.Predictions

    # Summary for the report. The underscore in the prediction set name is escaped in this copy only.
    summary = "Analyzing %d predictions in PredictionSet '%s' for UserDataSet '%s'. " % (
        predictions.NumberOfPredictions,
        predictions.PredictionSet.replace("_", "\\_"),
        predictions.UserDataSetName)
    if self.score_cap:
        summary += "Running analysis over the following analysis sets: '%s' with predicted scores capped at +-%0.2f." % (
            join(predictions.AnalysisSets, "', '"), self.score_cap)
    else:
        summary += "Running analysis over the following analysis sets: '%s'." % (
            join(predictions.AnalysisSets, "', '"))
    self.description.append(("black", summary))

    if self.quiet_level >= 1:
        colortext.message(
            "Analyzing %d predictions in PredictionSet '%s' for UserDataSet '%s'."
            % (predictions.NumberOfPredictions, predictions.PredictionSet, predictions.UserDataSetName))
        colortext.message(
            "Running analysis over the following analysis sets: '%s'."
            % (join(predictions.AnalysisSets, "', '")))

    # Analyze the data for each analysis set
    tables_by_set = {}
    for AnalysisSet in predictions.AnalysisSets:
        table = AnalysisTable()
        experiments = UserDataSetExperimentalScores(database, predictions.UserDataSetID, AnalysisSet)
        num_records = 0
        num_missing = 0
        for section, section_records in sorted(experiments.iteritems()):
            for recordnumber, record in sorted(section_records.iteritems()):
                num_records += 1
                pdb_id = record["PDB_ID"]
                experiment_id = record["ExperimentID"]
                experimental_ddg = record["ExperimentalDDG"]

                # A record only contributes a point when a prediction exists for this experiment and PDB.
                scores_for_experiment = predicted_scores.get(experiment_id)
                if not (scores_for_experiment and scores_for_experiment.get(pdb_id)):
                    num_missing += 1
                    continue

                table.add(
                    AnalysisPoint(experimental_ddg,
                                  scores_for_experiment[pdb_id]["PredictedDDG"],
                                  ExperimentID=experiment_id,
                                  PDB_ID=pdb_id,
                                  section=section,
                                  recordnumber=recordnumber))

        if num_missing > 0 and self.quiet_level >= 1:
            missing_message = "Missing %d predictions out of %d records for analysis set %s." % (
                num_missing, num_records, AnalysisSet)
            self.description.append(("Bittersweet", missing_message))
            colortext.warning(missing_message)

        tables_by_set[AnalysisSet] = table

    self.analysis_tables = tables_by_set
def plot(self, table_name, RFunction, output_filename=None, filetype="pdf"):
    '''Run an R plotting function over the analysis table named table_name.

    The table is exported with self.CreateCSVFile and RFunction is called as
    RFunction(input_csv, output_file, filetype). When output_filename is not given, the plot is
    written to a temporary file which is read back into an AnalysisObject and then deleted.

    table_name      -- key into self.analysis_tables.
    RFunction       -- callable producing the plot; it is also looked up in RFunctions for a description.
    output_filename -- where to save the plot, or None to keep the plot contents in memory.
    filetype        -- passed through to RFunction and to the AnalysisObject.

    Raises Exception when there are no tables, the table does not exist, the table has fewer than two
    points, or when running the R function fails (the original traceback is logged first).
    '''
    if (not self.analysis_tables) or (not table_name):
        raise Exception("There are no analysis tables to plot.")
    if table_name not in self.analysis_tables:
        raise Exception("The analysis table '%s' does not exist." % table_name)

    R_return_values = {}
    gplot = None
    analysis_table = self.analysis_tables[table_name]

    if self.quiet_level >= 3:
        print(table_name)
        print(RFunction)

    # At least two points are required. The previous check (== 1) let empty tables through to R.
    num_points = len(analysis_table.points)
    if num_points == 1:
        raise Exception(
            "The analysis table %s set only has one data point. At least two points are required."
            % table_name)
    if num_points == 0:
        raise Exception(
            "The analysis table %s set has no data points. At least two points are required."
            % table_name)

    inputfname = self.CreateCSVFile(table_name)
    if self.quiet_level >= 3:
        print(inputfname)
    try:
        if self.quiet_level >= 2:
            colortext.printf("Running %s." % RFunction)
            if output_filename:
                colortext.printf(
                    "Saving graph as %s with filename %s." % (filetype, output_filename))

        # Fall back to a temporary file when the caller does not want the plot saved.
        output_fname = output_filename
        if not output_fname:
            output_fname = rosettahelper.writeTempFile(".", "")
        R_output = RFunction(inputfname, output_fname, filetype)
        R_return_values = RUtilities.parse_R_output(R_output)

        colortext.message(table_name)
        print(" %s" % str(RFunction))
        for k, v in sorted(R_return_values.iteritems()):
            print(" %s: %s" % (str(k), str(v)))

        if not output_filename:
            contents = rosettahelper.readBinaryFile(output_fname)
            delete_file(output_fname)
            # Find the description registered for this R function (the last match wins).
            description = None
            for file_suffix, details in RFunctions.iteritems():
                if details[1] == RFunction:
                    description = details[0]
            assert (description)
            gplot = AnalysisObject(table_name, description, filetype, contents)
        else:
            gplot = output_filename
    except Exception as e:
        import traceback
        colortext.error(traceback.format_exc())
        raise Exception(e)
    finally:
        # The exported CSV is only needed while R runs; remove it on success and on failure.
        delete_file(inputfname)
    # NOTE(review): R_return_values and gplot are computed but nothing is returned in the visible
    # code - confirm whether a return statement is missing here.
def main(prediction_ids = None, memory_free='3.0G', cfg = None):
    '''Create the job directory, input files, and run file for a minimize_with_cst run.

    When prediction_ids is None, the prediction set named by the configuration is created and
    populated if it does not exist yet, and every prediction in the set is prepared. An existing job
    directory for the same user and prediction set is reused, and predictions which already have
    .pdb output are skipped. When prediction_ids is given, only those predictions are prepared, in a
    fresh "-rerun" job directory (an existing one is deleted first).

    prediction_ids -- iterable of prediction IDs, or None to use all predictions of the prediction set.
    memory_free    -- stored as settings['mem_free'].
    cfg            -- configuration object/module. When None, the module named by sys.argv[1] is imported.

    Raises Exception when cfg is None and no configuration module is named on the command line.
    '''
    # This uses the version of Rosetta from your cluster template settings file
    settings = parse_settings.get_dict()
    rosetta_scripts_path = settings['local_rosetta_installation_path'] + '/source/bin/' + 'rosetta_scripts' + settings['local_rosetta_binary_type']
    ppi_api = get_interface_with_config_file(rosetta_scripts_path = rosetta_scripts_path, rosetta_database_path = '/home/kyleb/rosetta/working_branches/alascan/database')

    t1, t2 = None, None

    # The configuration has to be loaded before any of its settings are read. Previously the default
    # cfg=None was dereferenced below (cfg.prediction_set_id), which always raised an AttributeError.
    if cfg is None:
        if len(sys.argv) < 2:
            raise Exception('No configuration was passed in and no configuration module was named on the command line.')
        cfg = importlib.import_module(sys.argv[1], package=None)

    # Name used in the warnings below; fall back to the object itself when there is no command-line argument
    if len(sys.argv) > 1:
        cfg_name = sys.argv[1]
    else:
        cfg_name = getattr(cfg, '__name__', repr(cfg))

    # Read the keep_hetatm_lines and keep_all_lines optional settings
    keep_hetatm_lines = False
    keep_all_lines = False
    try:
        keep_hetatm_lines = cfg.keep_hetatm_lines
    except AttributeError:
        colortext.warning('Note: keep_hetatm_lines is not specified in {0}. Defaulting to {1}.'.format(cfg_name, keep_hetatm_lines))
    try:
        keep_all_lines = cfg.keep_all_lines
    except AttributeError:
        colortext.warning('Note: keep_all_lines is not specified in {0}. Defaulting to {1}.'.format(cfg_name, keep_all_lines))

    prediction_set_id = cfg.prediction_set_id

    if prediction_ids is None:
        # Kept from the original: a new run always (re)reads the configuration module named on the
        # command line, even when a cfg object was passed in.
        assert( len(sys.argv) > 1 )
        cfg = importlib.import_module(sys.argv[1], package=None)

        protocol_name = cfg.protocol_name

        suppress_warnings = True

        if not ppi_api.prediction_set_exists(prediction_set_id):
            print('Creating new prediction set: %s' % (prediction_set_id,))
            t1 = time.time()
            ppi_api.add_prediction_set(prediction_set_id, halted = True, priority = 7, allow_existing_prediction_set = False, description = cfg.prediction_set_description)

            # Populate the prediction set with jobs from a (tagged subset of a) user dataset
            print('Created PredictionSet: %s' % (prediction_set_id,))
            ppi_api.add_prediction_run(prediction_set_id, cfg.user_dataset_name, keep_all_lines = keep_all_lines, keep_hetatm_lines = keep_hetatm_lines, tagged_subset = cfg.tagged_subset, extra_rosetta_command_flags = '-ignore_zero_occupancy false -ignore_unrecognized_res', show_full_errors = True, suppress_warnings = suppress_warnings)
            t2 = time.time()

        # Look for a job directory from an earlier run for this user and prediction set
        existing_job = False
        end_job_name = '%s_%s' % (getpass.getuser(), prediction_set_id)
        if not os.path.exists(job_output_directory):
            os.makedirs(job_output_directory)

        for d in os.listdir(job_output_directory):
            if os.path.isdir(os.path.join(job_output_directory, d)) and end_job_name in d:
                print('Found existing job: %s' % (d,))
                job_name = d
                existing_job = True
        if not existing_job:
            job_name = '%s-%s' % (time.strftime("%y%m%d"), end_job_name)

            ppi_api.add_development_protocol_command_lines(
                prediction_set_id, protocol_name, 'minimize_with_cst', ''
            )
            # 2x because bugs
            ppi_api.add_development_protocol_command_lines(
                prediction_set_id, protocol_name, 'minimize_with_cst', ''
            )

        prediction_ids = sorted(ppi_api.get_prediction_ids(prediction_set_id))
        output_dir = os.path.join(job_output_directory, job_name )
    else:
        # Prediction_ids passed in
        job_name = '%s-%s_%s-rerun' % (time.strftime("%y%m%d"), getpass.getuser(), prediction_set_id)

        output_dir = os.path.join(job_output_directory, job_name )
        if os.path.isdir(output_dir):
            shutil.rmtree(output_dir)
        existing_job = False

    settings['scriptname'] = prediction_set_id + '_run'
    settings['tasks_per_process'] = 5
    settings['mem_free'] = memory_free
    settings['output_dir'] = output_dir
    settings['rosetta_args_list'] = [
        '-in:file:fullatom',
        '-ignore_zero_occupancy false',
        '-ignore_unrecognized_res',
        '-fa_max_dis 9.0',
        '-ddg::harmonic_ca_tether 0.5',
        '-ddg::constraint_weight 1.0',
        '-ddg::out_pdb_prefix min_cst_0.5',
        '-ddg::sc_min_only false',
    ]
    settings['rosetta_args_list'].extend(cfg.extra_flags)
    print(settings['rosetta_args_list'])

    # Now get run settings from database and save to pickle file
    job_dict = {}
    output_data_dir = os.path.join(settings['output_dir'], 'data')

    if not os.path.isdir(output_data_dir):
        os.makedirs(output_data_dir)

    if t1 is not None and t2 is not None and len(prediction_ids) != 0:
        print('Time taken for {0} predictions: {1}s ({2}s per prediction).'.format(len(prediction_ids), t2-t1, (t2-t1)/len(prediction_ids)))
    print('File cache statistics:')
    pprint.pprint(ppi_api.get_file_content_cache_stats())
    settings['numjobs'] = len(prediction_ids)
    app_name = 'minimize_with_cst'
    settings['appname'] = app_name

    print('')

    t1 = time.time()

    # Progress counter setup
    colortext.message('Creating input data for %d predictions.' % (len(prediction_ids)))
    count, records_per_dot = 0, 50
    print("|" + ("*" * (int(len(prediction_ids)/records_per_dot)-2)) + "|")
    for prediction_id in prediction_ids:
        # Progress counter
        count += 1
        if count % records_per_dot == 0:
            colortext.write(".", "cyan", flush = True)

        # Check if job already ran
        prediction_id_dir = os.path.join(output_dir, str(prediction_id))
        if existing_job:
            if os.path.isdir( prediction_id_dir ):
                pdb_output_files = [x for x in os.listdir( prediction_id_dir ) if '.pdb' in x]
            else:
                pdb_output_files = []
            if len(pdb_output_files) >= 1:
                print('Skipping %s' % (prediction_id,))
                settings['numjobs'] = settings['numjobs'] - 1
                continue
            if os.path.isdir(prediction_id_dir):
                print('Job directory %s already exists, deleting' % prediction_id_dir)
                shutil.rmtree(prediction_id_dir)

        job_data_dir = os.path.join(output_data_dir, str(prediction_id))

        # Allow us to resume from an interrupted setup. The '.ready' marker is written last (below),
        # so its presence means that all input files of this prediction were written.
        truncate_content = None
        all_files_exist = os.path.exists(job_data_dir) and os.path.exists(os.path.join(job_data_dir, '.ready'))
        if all_files_exist:
            truncate_content = 0

        job_details = ppi_api.get_job_details(prediction_id, truncate_content = truncate_content)
        # List of (name, contents) pairs
        file_tuples = [(file_info['Filename'], file_info['Content']) for file_info in job_details['Files']['Input']]
        substitution_parameters = json.loads(job_details['JSONParameters'])

        # Scrub the folder
        if not all_files_exist:
            if os.path.isdir(job_data_dir):
                shutil.rmtree(job_data_dir)
            os.makedirs(job_data_dir)

        files_dict = {} # Maps name to filepath position
        for file_name, file_contents in file_tuples:
            new_file_location = os.path.join(job_data_dir, file_name)
            if not all_files_exist:
                if '.pdb' in file_name:
                    if keep_hetatm_lines or keep_all_lines:
                        write_file(new_file_location, file_contents)
                    else:
                        # Only keep the ATOM records
                        write_file(new_file_location, '\n'.join([line for line in file_contents.split('\n') if line.startswith('ATOM')]))
                else:
                    with open(new_file_location, 'w') as f:
                        f.write(file_contents)
            files_dict[file_name] = os.path.relpath(new_file_location, settings['output_dir'])
        if not all_files_exist:
            write_file(os.path.join(job_data_dir, '.ready'), '')

        argdict = {
            'input_file_list' : [files_dict[substitution_parameters['%%input_pdb%%']]],
        }
        for file_name, file_location in files_dict.items():
            if 'params' in file_name:
                argdict['-extra_res_fa'] = file_location
        job_dict[prediction_id] = argdict

    t2 = time.time()

    print('')
    if count != 0:
        print('Time taken for {0} predictions: {1}s ({2}s per prediction).'.format(count, t2-t1, (t2-t1)/count))
    print('File cache statistics:')
    pprint.pprint(ppi_api.get_file_content_cache_stats())

    print('')
    if len(job_dict) > 0:
        write_run_file(settings, database_run = False, job_dict = job_dict)
        print('Job files written to directory: %s' % (os.path.abspath(output_dir),))
    else:
        print('No tasks to process, not writing job files')
#sys.path.insert(0, '/home/oconchus/dev/') #sys.path.insert(0, "/home/oconchus/dev/klab") else: import klab import klab.colortext as colortext from ddglib.ppi_api import get_interface_with_config_file as get_ppi_interface_with_config_file # Set up database connection try: ppi_api = get_ppi_interface_with_config_file(host_config_name = 'kortemmelab') except: colortext.error('Database connection failed.') raise colortext.message('Connected to database.') # Pick a scoring method score_method_id = ppi_api.get_score_method_id('Rescore-Talaris2014', method_authors = 'kyle', method_type = 'ddg_monomer rescore') # Get the best structures for prediction 23849 wild_type_complexes = ppi_api.get_top_x_scores(23849, score_method_id, 'WildTypeComplex', 3, component = 'total', order_by = 'ASC') wild_type_filenames = [] for wtc in wild_type_complexes: wild_type_filenames.append([f for f in glob.glob('repacked_wt*_round_{0}.*'.format(wtc['StructureID']))][0]) print(wild_type_filenames) mutant_complexes = ppi_api.get_top_x_scores(23849, score_method_id, 'MutantComplex', 3, component = 'total', order_by = 'ASC') mutant_filenames = []
def generate_JSON_dataset(dataset_ID, pdb_data, pub_data):
    '''Build the list of JSON-ready records for one dataset and store it in JSON_datasets[dataset_ID]['data'].

    One record is created per DataSetDDG row of the dataset. Each record holds the published DDG, the
    experimental DDG values it was derived from, and the mutations (with DSSP details of the mutated
    residues). Records whose mutated residue cannot be found in PDBResidue are skipped and counted as
    failures. Records are keyed by PDB ID and mutations, so a later record with the same key replaces
    an earlier one (this is reported and counted as a failure).

    dataset_ID -- the DataSetDDG.DataSetID value of the dataset.
    pdb_data   -- dict, updated in place: PDB file ID -> dict(Resolution, MethodOfDetermination).
    pub_data   -- dict, updated in place: publication ID -> entry from cached_publications.

    Uses the module-level ddGdb, cached_pdbs, cached_publications, dssp_elision, and JSON_datasets.
    '''
    record_data = {}

    #1LRP
    #1LMB

    # Pairs of experiment IDs which are known to describe the same mutations; the lower ID is used.
    # ExperimentID is used below for mutation details but these agree in these cases.
    known_duplicate_experiments = (
        # Cases where 1FLV and 1FTG need to be elided
        [113699, 113830], [113704, 113832], [113705, 113836],
        # 1LZ1, 2BQA
        [112149, 112591],
        # 1REX, 2BQA
        [112141, 112583], [112136, 112578], [112137, 112579], [112142, 112584], [112139, 112581],
        [112140, 112582], [112146, 112588], [112147, 112589], [112148, 112590],
        # 2LZM, 1L63
        [112227, 112323], [112288, 113039], [111587, 112379],
    )

    # 1 JSON object per dataset record
    failure_count = 0
    records = ddGdb.execute_select('SELECT * FROM DataSetDDG WHERE DataSetID=%s', parameters=(dataset_ID,))
    colortext.warning('Starting with %d records.' % (len(records)))
    mutation_count = {1:0, 2:0, 3:0, 4:0, 5:0}
    for r in records:

        mutation_is_reversed = r['MutationIsReversed'] == 1
        d = dict(
            _DataSetDDGID = r['ID'],
            RecordID = r['RecordNumber'],
            AggregateType = r['AggregateType'],
            DDG = r['PublishedValue'],
            PDBFileID = r['PDBFileID'],
            DerivedMutation = mutation_is_reversed,
        )

        # Parse PDB
        if not(cached_pdbs.get(r['PDBFileID'])):
            cached_pdbs[r['PDBFileID']] = PDB(ddGdb.execute_select('SELECT Content FROM PDBFile WHERE ID=%s', parameters=(r['PDBFileID'],))[0]['Content'])

        # Store PDB data. The defaults used to be written with trailing commas, which made them the
        # tuple (None,) rather than None whenever the PDB lookups below failed.
        PDBResolution = None
        PDBMethodOfDetermination = None
        try:
            PDBResolution = cached_pdbs[r['PDBFileID']].get_resolution()
        except Exception:
            pass # best effort: the resolution is optional
        try:
            PDBMethodOfDetermination = cached_pdbs[r['PDBFileID']].get_techniques()
        except Exception:
            pass # best effort: the method of determination is optional
        pdb_data[r['PDBFileID']] = dict(
            Resolution = PDBResolution,
            MethodOfDetermination = PDBMethodOfDetermination,
        )

        assay_DDGs = ddGdb.execute_select('''
            SELECT *
            FROM DataSetDDGSource
            INNER JOIN ExperimentAssayDDG ON DataSetDDGSource.ExperimentAssayID = ExperimentAssayDDG.ExperimentAssayID AND DataSetDDGSource.Type = ExperimentAssayDDG.Type
            INNER JOIN ExperimentAssay ON ExperimentAssayDDG.ExperimentAssayID = ExperimentAssay.ID
            WHERE DataSetDDGID=%s''', parameters=(r['ID'],))

        # All source assays of a record are expected to belong to exactly one experiment
        ExperimentID = set([a['ExperimentID'] for a in assay_DDGs])
        if len(ExperimentID) != 1:
            colortext.message('%d records passed' % len(record_data))
            if sorted(ExperimentID) in known_duplicate_experiments:
                ExperimentID = [sorted(ExperimentID)[0]]
            else:
                # Unexpected case: print what we know and fail on the assertion below
                colortext.warning(
                    '\n'.join(['%(PDBFileID)s %(Chain)s %(WildTypeAA)s %(ResidueID)s %(MutantAA)s' % rii for rii in ddGdb.execute_select('''
                        SELECT * FROM `ExperimentMutation` INNER JOIN Experiment ON Experiment.ID=ExperimentID WHERE `ExperimentID` IN (%s)''' % ','.join(map(str, ExperimentID)))]))
                pprint.pprint(r)
                colortext.error([int(experiment_id) for experiment_id in ExperimentID])
                print(sorted(ExperimentID))
        assert(len(ExperimentID) == 1)
        ExperimentID = ExperimentID.pop()
        d['_ExperimentID'] = ExperimentID

        experimental_DDGs = []
        for a in assay_DDGs:
            experimental_DDGs.append(dict(
                DDG = a['Value'],
                DDGType = a['Type'],
                Publication = a['Publication'],
                LocationOfValueInPublication = a['LocationOfValueInPublication'],
                Temperature = a['Temperature'],
                pH= a['pH'],
            ))
            # Store Publication data
            pub_data[a['Publication']] = cached_publications[a['Publication']]

        d['ExperimentalDDGs'] = experimental_DDGs

        # Retrieve mutations
        mutation_records = ddGdb.execute_select('SELECT * FROM ExperimentMutation WHERE ExperimentID=%s ORDER BY ResidueID', parameters=(ExperimentID,))
        if dataset_ID == "AlaScan-GPK_2014/09/25":
            assert(len(mutation_records) == 1)

        mutations = []
        failed_check = False
        mutation_count[len(mutation_records)] += 1
        for mutation in mutation_records:
            mutation_d = {}
            #if ExperimentID == 109911:
            #    d['PDBFileID'] = '1WQ5' # Hack for one 1BKS case
            mutation_d['Chain'] = mutation['Chain']
            mutation_d['ResidueID'] = mutation['ResidueID']
            # A reversed (derived) mutation swaps the wildtype and mutant residue types
            if mutation_is_reversed:
                mutation_d['MutantAA'] = mutation['WildTypeAA']
                mutation_d['WildTypeAA'] = mutation['MutantAA']
            else:
                mutation_d['WildTypeAA'] = mutation['WildTypeAA']
                mutation_d['MutantAA'] = mutation['MutantAA']

            if dataset_ID == "AlaScan-GPK_2014/09/25":
                if d['PDBFileID'] == '1LMB':
                    mutation_d['Chain'] = '3' # Hack for the PDB replacement 1LRP (3.2A) -> 1LMB (1.8A)
                if d['PDBFileID'] == '1U5P' and int(mutation_d['ResidueID']) < 1600:
                    mutation_d['ResidueID'] = str(int(mutation_d['ResidueID']) + 1762) # Hack for the PDB replacement 1AJ3, NMR -> 1U5P (2A)
            if dataset_ID == "Kellogg_10.1002/prot.22921_2010/12/03":
                if d['PDBFileID'] == '1U5P' and int(mutation_d['ResidueID']) < 1600:
                    mutation_d['ResidueID'] = str(int(mutation_d['ResidueID']) + 1762) # Hack for the PDB replacement 1AJ3, NMR -> 1U5P (2A)

            mutated_residue = ddGdb.execute_select('SELECT * FROM PDBResidue WHERE PDBFileID=%s AND Chain=%s AND ResidueID=%s', parameters=(d['PDBFileID'], mutation_d['Chain'], ResidueID2String(mutation_d['ResidueID'])))
            if len(mutated_residue) == 0:
                colortext.warning('Skipping Experiment #%d (%s) in %s due to missing residue %s.' % (ExperimentID, d['PDBFileID'], dataset_ID, mutation_d['ResidueID']))
                failure_count += 1
                failed_check = True
                break

            assert(len(mutated_residue) == 1)
            mutated_residue = mutated_residue[0]
            mutation_d['DSSPExposure'] = mutated_residue['MonomericExposure']
            mutation_d['DSSPType'] = mutated_residue['MonomericDSSP']
            mutation_d['DSSPSimpleSSType'] = dssp_elision.get(mutation_d['DSSPType'])
            assert(mutation_d['DSSPType'] != None)
            assert(mutation_d['DSSPSimpleSSType'] != None)
            mutations.append(mutation_d)

        if failed_check:
            print('FAILED CHECK')
            continue

        d['Mutations'] = mutations

        # Records are keyed by PDB ID and mutations. The Potapov dataset also needs the record ID in the key.
        mutation_string = '+'.join(['%s:%s:%s' % (mutation_d['Chain'], mutation_d['ResidueID'].strip(), mutation_d['MutantAA']) for mutation_d in mutations])
        if dataset_ID == "Potapov_10.1093/protein/gzp030_2009/09/01":
            key = '%s_%s_%s' % (d['PDBFileID'], mutation_string, d['RecordID'])
        else:
            key = '%s_%s' % (d['PDBFileID'], mutation_string)

        if record_data.get(key):
            colortext.warning('KEY EXISTS: %s' % key)
            print('Existing record: %s' % pprint.pformat(record_data[key]))
            print('New record: %s' % pprint.pformat(d))
            failure_count += 1
        # Note: an existing record with the same key is replaced by the new one
        record_data[key] = d

    colortext.message('Mutation count')
    colortext.warning(pprint.pformat(mutation_count))

    if failure_count > 0:
        colortext.error('Total length of dataset: %d. Failed on %d records.' % (len(record_data), failure_count))
    else:
        colortext.message('Total length of dataset: %d. ' % (len(record_data)))

    # The records are stored ordered by key
    record_list = [record_data[key] for key in sorted(record_data)]

    colortext.message('Adding dataset %s with %d records, %d PDB files, and %d references.' % (dataset_ID, len(record_list), len(pdb_data), len(pub_data)))
    JSON_datasets[dataset_ID]['data'] = record_list
def check_existing_complexes_by_name():
    '''Check whether any of the complexes exist in the database.

    This is an exploratory function: it looks complexes up by protein name, prints the names of both
    partners of each hit, and then compares hard-coded sequences and the SEQRES sequences of 1IBR and
    2BKU. The findings of earlier runs are recorded in the comments.
    '''

    # Ran is short for "RAs-related Nuclear protein" and is also known as "GTP-binding nuclear protein Ran"
    ppi_api = get_ppi_api()
    complex_ids = ppi_api.get_complex_ids_matching_protein_name('gsp')
    for protein_name in ('ran', 'ras'):
        complex_ids.extend(ppi_api.get_complex_ids_matching_protein_name(protein_name))

    # This gives us these complexes, amongst others:
    #
    # 77
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Importin beta-1 subunit, Importin β1, Importin β1
    #
    # 119
    # Ran GTPase, Ran GTPase, Ran GTPase
    # Ran GAP, Ran GAP, Ran GAP
    #
    # 176
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Regulator of chromosome condensation, RCC1, RCC1
    #
    # 202
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Nuclear transport factor 2, NTF2, NTF2
    #
    # 29
    # Ras GTPase.GDP, Ras GTPase.GDP, Ras GTPase.GDP
    # Ras GAP, Ras GAP, Ras GAP
    #
    # 65
    # Ras GTPase.GTP, H-Ras, H-Ras
    # Son of sevenless-1, Sos, Sos
    #
    # 201
    # Ras GTPase, Ras GTPase, Ras GTPase
    # Phosphoinositide 3-kinase, PI3K, PI3K
    #
    # 280
    # Ras.GNP, Ras.GNP, Ras.GNP
    # RalGDS Ras-interacting domain, RalGDS RID, RalGDS RID

    # The list above is discarded; only the binding partners below are reported.
    complex_ids = []
    partner_names = (
        'importin', 'KARYOPHERIN', 'TRANSPORTIN', 'NTF2', 'YRB1P', 'RANBP1',
        'EXP5', 'CSE1', 'RANGAP', 'RANBP2', 'RCC1',
    )
    for protein_name in partner_names:
        complex_ids.extend(ppi_api.get_complex_ids_matching_protein_name(protein_name))

    for complex_id in complex_ids:
        details = ppi_api.get_complex_details(complex_id)
        colortext.warning(complex_id)
        # One line per partner (left, then right): full name, short name, HTML name
        for side in ('L', 'R'):
            print('{0}, {1}, {2}'.format(
                details[side + 'Name'].encode('utf-8').strip(),
                details[side + 'ShortName'].encode('utf-8').strip(),
                details[side + 'HTMLName'].encode('utf-8').strip()))

    # This gives us these complexes:
    #
    # 77
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Importin beta-1 subunit, Importin β1, Importin β1
    #
    # 202
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Nuclear transport factor 2, NTF2, NTF2
    #
    # 176
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Regulator of chromosome condensation, RCC1, RCC1
    #
    # SELECT DISTINCT `PDBFileID` FROM `PPIPDBPartnerChain` WHERE `PPComplexID` IN (77, 202, 176)
    # returns
    # 1F59, 1IBR, 1QG4, 1A12, 1OUN and 1I2M, 1A2K
    #
    # Some of these are unbound. Get the complexes:
    #
    # SELECT DISTINCT `PDBFileID` FROM `PPIPDBPartnerChain`
    # INNER JOIN PPIPDBSet ON PPIPDBPartnerChain.PPComplexID=PPIPDBSet.PPComplexID AND PPIPDBPartnerChain.SetNumber=PPIPDBSet.SetNumber
    # WHERE PPIPDBPartnerChain.PPComplexID IN (77, 202, 176) AND IsComplex=1
    #
    # returns only three hits:
    #   complex #77 -> 1IBR (A|B);
    #   complex #176 -> 1I2M (A|B) where Tina uses A|B (chains may be renamed); and
    #   complex #202 -> 1A2K (C|AB) where Tina uses A|B (chains may be renamed).
    #
    # We also have:
    #   complex #119 -> 1K5D (AB|C) where Tina uses A|B
    #
    # 1IBR -> Ran (human)|Importin β1 (human)
    # Tina has:
    #   2BKU -> RAN (dog)|Importin β1 (yeast)
    #   3EA5 -> RAN (human)|Importin β1 (yeast)
    # 3EA5 and 1IBR do not match on chains B at all and have one mutation in chain A
    # Similarly for 2BKU and 1IBR.
    #
    # However what came out of this is that 3EA5 and 2BKU are related i.e. that RAN is almost the same sequence in both.
    # The only difference is one mutation in chain A: index 40, A->P and that 3EA5 has a longer sequence for chain A
    #

    colortext.message('\n\n1IBR')
    p1 = PDB(retrieve_pdb('1IBR'))
    pprint.pprint(p1.seqres_sequences)
    colortext.message('\n\n2BKU')
    p2 = PDB(retrieve_pdb('2BKU'))
    pprint.pprint(p2.seqres_sequences)

    # Each assignment below replaces the previous one; only the last value of each sequence is compared.
    seq1 = str(p1.seqres_sequences['A'])
    seq2 = str(p2.seqres_sequences['A'])

    #3EA5
    seq1 = 'MAAQGEPQVQFKLVLVGDGGTGKTTFVKRHLTGEFEKKYVATLGVEVHPLVFHTNRGPIKFNVWDTAGQEKFGGLRDGYYIQAQCAIIMFDVTSRVTYKNVPNWHRDLVRVCENIPIVLCGNKVDIKDRKVKAKSIVFHRKKNLQYYDISAKSNYNFEKPFLWLARKLIGDPNLEFVAMPCLAPPEVVMDPALAAQYEHDLEVAQTTALPDEDDDL'
    seq1 = 'MSTAEFAQLLENSILSPDQNIRLTSETQLKKLSNDNFLQFAGLSSQVLIDENTKLEGRILAALTLKNELVSKDSVKTQQFAQRWITQVSPEAKNQIKTNALTALVSIEPRIANAAAQLIAAIADIELPHGAWPELMKIMVDNTGAEQPENVKRASLLALGYMCESADPQSQALVSSSNNILIAIVQGAQSTETSKAVRLAALNALADSLIFIKNNMEREGERNYLMQVVCEATQAEDIEVQAAAFGCLCKIMSKYYTFMKPYMEQALYALTIATMKSPNDKVASMTVEFWSTICEEEIDIAYELAQFPQSPLQSYNFALSSIKDVVPNLLNLLTRQNEDPEDDDWNVSMSAGACLQLFAQNCGNHILEPVLEFVEQNITADNWRNREAAVMAFGSIMDGPDKVQRTYYVHQALPSILNLMNDQSLQVKETTAWCIGRIADSVAESIDPQQHLPGVVQACLIGLQDHPKVATNCSWTIINLVEQLAEATPSPIYNFYPALVDGLIGAANRIDNEFNARASAFSALTTMVEYATDTVAETSASISTFVMDKLGQTMSVDENQLTLEDAQSLQELQSNILTVLAAVIRKSPSSVEPVADMLMGLFFRLLEKKDSAFIEDDVFYAISALAASLGKGFEKYLETFSPYLLKALNQVDSPVSITAVGFIADISNSLEEDFRRYSDAMMNVLAQMISNPNARRELKPAVLSVFGDIASNIGADFIPYLNDIMALCVAAQNTKPENGTLEALDYQIKVLEAVLDAYVGIVAGLHDKPEALFPYVGTIFQFIAQVAEDPQLYSEDATSRAAVGLIGDIAAMFPDGSIKQFYGQDWVIDYIKRTRSGQLFSQATKDTARWAREQQKRQLSL'

    #2BKU
    seq2 = 'MAAQGEPQVQFKLVLVGDGGTGKTTFVKRHLTGEFEKKYVPTLGVEVHPLVFHTNRGPIKFNVWDTAGQEKFGGLRDGYYIQAQCAIIMFDVTSRVTYKNVPNWHRDLVRVCENIPIVLCGNKVDIKDRKVKAKSIVFHRKKNLQYYDISAKSNYNFEKPFLWLARKLIGDPNLEFV'
    seq2 = 'MSTAEFAQLLENSILSPDQNIRLTSETQLKKLSNDNFLQFAGLSSQVLIDENTKLEGRILAALTLKNELVSKDSVKTQQFAQRWITQVSPEAKNQIKTNALTALVSIEPRIANAAAQLIAAIADIELPHGAWPELMKIMVDNTGAEQPENVKRASLLALGYMCESADPQSQALVSSSNNILIAIVQGAQSTETSKAVRLAALNALADSLIFIKNNMEREGERNYLMQVVCEATQAEDIEVQAAAFGCLCKIMSKYYTFMKPYMEQALYALTIATMKSPNDKVASMTVEFWSTICEEEIDIAYELAQFPQSPLQSYNFALSSIKDVVPNLLNLLTRQNEDPEDDDWNVSMSAGACLQLFAQNCGNHILEPVLEFVEQNITADNWRNREAAVMAFGSIMDGPDKVQRTYYVHQALPSILNLMNDQSLQVKETTAWCIGRIADSVAESIDPQQHLPGVVQACLIGLQDHPKVATNCSWTIINLVEQLAEATPSPIYNFYPALVDGLIGAANRIDNEFNARASAFSALTTMVEYATDTVAETSASISTFVMDKLGQTMSVDENQLTLEDAQSLQELQSNILTVLAAVIRKSPSSVEPVADMLMGLFFRLLEKKDSAFIEDDVFYAISALAASLGKGFEKYLETFSPYLLKALNQVDSPVSITAVGFIADISNSLEEDFRRYSDAMMNVLAQMISNPNARRELKPAVLSVFGDIASNIGADFIPYLNDIMALCVAAQNTKPENGTLEALDYQIKVLEAVLDAYVGIVAGLHDKPEALFPYVGTIFQFIAQVAEDPQLYSEDATSRAAVGLIGDIAAMFPDGSIKQFYGQDWVIDYIKRTRSGQLFSQATKDTARWAREQQKRQLSL'

    print(seq1 == seq2)
    if seq1 != seq2:
        # The sequences must have the same length; report every position at which they differ.
        assert(len(seq1) == len(seq2))
        for position, (residue1, residue2) in enumerate(zip(seq1, seq2)):
            if residue1 != residue2:
                print(position, residue1, residue2) # one mutation A->C near the end of the sequence: VAMPALAP -> VAMPCLAP

    # In both structures, chains A and C and chains B and D are expected to have identical sequences.
    for structure in (p1, p2):
        for chain, other_chain in (('A', 'C'), ('B', 'D')):
            assert(str(structure.seqres_sequences[chain]) == str(structure.seqres_sequences[other_chain]))
    print('')
def main(FixedIDs=None, radii=None):
    """Rescore the finished predictions of a prediction set with NoahScore.

    The command line is parsed for the number of cooperating processes (-n),
    the 1-based ID of this process (-p), the prediction set (-s) and an
    optional request to delete the process tracking file (-d). This process is
    registered in /tmp/klab_rescore.txt and handles the predictions whose
    ID satisfies MOD(ID, num_processes) = process_id - 1. For each prediction
    the archive returned by DDG_interface.getData is unpacked, the mutant and
    repacked structures are scored at every radius, and the resulting ddG
    values are written back into the JSON stored in Prediction.Scores under
    the key 'noah_<radius>A' (with '.' replaced by ',').

    :param FixedIDs: optional list of integer Prediction IDs; when given, only
        these predictions are considered. Defaults to an empty list.
    :param radii: list of radii to score. Defaults to [6.0, 7.0, 8.0, 9.0].
    :raises colortext.Exception: on a missing/unknown prediction set, invalid
        process settings, or a conflict with the process tracking file.
    :raises WrongScoreRevisionException: when FixedIDs is empty and finished
        predictions with a different ScoreVersion exist in the set.

    Failures while rescoring a single prediction are printed, recorded and
    skipped; the IDs of the failed predictions are reported at the end.
    """
    # Avoid mutable default arguments; the effective defaults are unchanged.
    if FixedIDs is None:
        FixedIDs = []
    if radii is None:
        radii = [6.0, 7.0, 8.0, 9.0]

    max_processors = get_number_of_processors()

    rescore_process_file = "/tmp/klab_rescore.txt"
    parser = OptionParser()
    parser.add_option("-n", "--numprocesses", default=1, type='int', dest="num_processes", help="The number of processes used for the rescoring. The cases are split according to this number.", metavar="NUM_PROCESSES")
    parser.add_option("-p", "--process", default=1, type='int', dest="process", help="The ID of this process. This should be an integer between 1 and the number of processes used for the rescoring.", metavar="PROCESS_ID")
    parser.add_option("-d", "--delete", action="store_true", dest="delete", help="Delete the process tracking file %s." % rescore_process_file)
    parser.add_option("-s", "--set", type='string', dest="prediction_set", help="The prediction set to rescore.")
    (options, _args) = parser.parse_args()

    if options.delete and os.path.exists(rescore_process_file):
        print("Removing %s." % rescore_process_file)
        os.remove(rescore_process_file)

    num_processes = options.num_processes
    prediction_set = options.prediction_set
    process_id = options.process

    for i in FixedIDs:
        assert(type(i) == type(1))
    # The fixed IDs are interpolated into SQL text below. '%d' only formats
    # numbers, so this stays safe even when the assertion above is stripped
    # by running Python with -O.
    fixed_id_list = ",".join(["%d" % i for i in FixedIDs])

    # Check prediction set
    if not prediction_set:
        raise colortext.Exception("A prediction set must be specified.")
    if FixedIDs:
        results = ddGdb.execute("SELECT DISTINCT PredictionSet FROM Prediction WHERE ID IN (%s)" % fixed_id_list)
        if len(results) != 1:
            raise colortext.Exception("Error: The fixed IDs cover %d different prediction sets." % len(results))
    else:
        results = ddGdb.execute("SELECT ID FROM PredictionSet WHERE ID=%s", parameters=(prediction_set,))
    if not results:
        raise colortext.Exception("The prediction set '%s' does not exist in the database." % prediction_set)

    # Check the process settings
    if num_processes < 1:
        raise colortext.Exception("At least 1 processor must be used.")
    if num_processes > max_processors:
        raise colortext.Exception("Only %d processors/cores were detected. Cannot run with %d processes." % (max_processors, num_processes))
    if num_processes > (max_processors * 0.75):
        colortext.warning("Warning: Using %d processors/cores out of %d which is %0.2f%% of the total available." % (num_processes, max_processors, (100.0*float(num_processes)/float(max_processors))))
    if not(1 <= process_id <= min(max_processors, num_processes)):
        raise colortext.Exception("The process ID %d must be between 1 and the number of processes, %d." % (process_id, num_processes))

    # Register this process in the tracking file. The first line of the file
    # holds the number of processes; every later line names a running process.
    if os.path.exists(rescore_process_file):
        lines = readFileLines(rescore_process_file)
        if not lines:
            # An empty tracking file has no header line to parse.
            raise Exception("Badly formatted %s." % rescore_process_file)
        idx = lines[0].find("numprocesses")
        if idx == -1:
            raise Exception("Badly formatted %s." % rescore_process_file)
        existing_num_processes = int(lines[0][idx+len("numprocesses"):])
        if existing_num_processes != num_processes:
            raise colortext.Exception("You specified the number of processes to be %d but %s already specifies it as %d." % (num_processes, rescore_process_file, existing_num_processes))
        for line in [line for line in lines[1:] if line.strip()]:
            idx = line.find("process")
            if idx == -1:
                raise colortext.Exception("Badly formatted %s. Line is '%s'." % (rescore_process_file, line))
            existing_process = int(line[idx+len('process'):])
            if process_id == existing_process:
                raise colortext.Exception("Process %d is already logged as running. Check if this is so and edit %s." % (process_id, rescore_process_file))
        with open(rescore_process_file, 'a') as F:
            F.write("process %d\n" % process_id)
    else:
        with open(rescore_process_file, 'w') as F:
            F.write("numprocesses %d\n" % num_processes)
            F.write("process %d\n" % process_id)

    output_dir = os.path.join('rescoring', str(process_id))
    if not(os.path.exists(output_dir)):
        os.makedirs(output_dir)
    abs_output_dir = os.path.abspath(os.path.join(os.getcwd(), output_dir))
    print("Running process in %s.\n" % abs_output_dir)

    # Manual switch for the alternative work-splitting scheme below.
    ReallyFixedIDs = False

    results = ddGdb.execute("SELECT ID, ExperimentID, Scores FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion <> %s", parameters=(prediction_set, float(current_score_revision),))
    if not(FixedIDs) and results:
        raise WrongScoreRevisionException("Score versions found which are not %s. Need to update table structure." % current_score_revision)

    # Hacky way to run multiple processes
    if ReallyFixedIDs:
        # NOTE(review): disabled path; remaining_unscored is not defined in
        # this function and must be supplied before enabling the switch above.
        num_to_score = len(remaining_unscored)
        num_for_this_to_score = num_to_score // num_processes
        IDs_to_score = remaining_unscored[(process_id-1) * num_for_this_to_score : (process_id) * num_for_this_to_score]
        results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s)" % (",".join(map(str, IDs_to_score))))
    elif FixedIDs:
        results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s) AND MOD(ID,%s)=%s" % (fixed_id_list, num_processes, process_id-1))
    else:
        results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion=%s AND MOD(ID,%s)=%s", parameters=(prediction_set, float(current_score_revision), num_processes, process_id-1))

    cases_computed = 0
    total_time_in_secs = 0
    number_of_cases_left = len(results) * len(radii)
    failed_cases = []
    colortext.printf("Rescoring %d predictions over %d radii...\n" % (len(results), len(radii)), 'lightgreen')
    for r in results:
        t = Timer()
        t.add('Preamble')

        mutations = ddGdb.execute('SELECT * FROM ExperimentMutation WHERE ExperimentID=%s', parameters=(r['ExperimentID'],))
        mutation_str = ', '.join(['%s %s%s%s' % (m['Chain'], m['WildTypeAA'], m['ResidueID'], m['MutantAA']) for m in mutations])

        # This query used to be issued twice in a row; once is enough.
        details = ddGdb.execute_select('SELECT Prediction.ID, PDBFileID, Chain FROM Prediction INNER JOIN Experiment ON Prediction.ExperimentID=Experiment.ID INNER JOIN ExperimentChain ON Prediction.ExperimentID=ExperimentChain.ExperimentID WHERE Prediction.ID=%s', parameters=(r['ID'],))
        colortext.message("Prediction: %d, %s chain %s. Mutations: %s. Experiment ID #%d. UserDataSetExperimentID #%d." % (details[0]['ID'], details[0]['PDBFileID'], details[0]['Chain'], mutation_str, r['ExperimentID'], r['UserDataSetExperimentID']))

        experiment_pdbID = ddGdb.execute('SELECT PDBFileID FROM Experiment WHERE ID=%s', parameters=(r['ExperimentID'],))[0]['PDBFileID']
        print('Experiment PDB file ID = %s' % experiment_pdbID)
        pdbID = ddGdb.execute('SELECT UserDataSetExperiment.PDBFileID FROM Prediction INNER JOIN UserDataSetExperiment ON UserDataSetExperimentID=UserDataSetExperiment.ID WHERE Prediction.ID=%s', parameters=(r['ID'],))[0]['PDBFileID']
        print('UserDataSetExperiment PDB file ID = %s' % pdbID)

        dbchains = sorted(set([mutation['Chain'] for mutation in mutations]))
        # todo: note: assuming monomeric structures here
        assert(len(dbchains) == 1)
        dbchain = dbchains[0]
        ddG_dict = json.loads(r['Scores'])
        kellogg_ddG = ddG_dict['data']['kellogg']['total']['ddG']

        # Radii which already have a stored score count as computed. A
        # prediction with a score for every radius is skipped entirely.
        all_done = True
        for radius in radii:
            score_name = ('noah_%0.1fA' % radius).replace(".", ",")
            if not(ddG_dict['data'].get(score_name)):
                all_done = False
            else:
                cases_computed += 1
                number_of_cases_left -= 1
        if all_done:
            print('Prediction %d: done.' % r["ID"])
            continue

        # Extract data
        t.add('Grab data')
        archivefile = DDG_interface.getData(r['ID'])
        zipfilename = os.path.join(output_dir, "%d.zip" % r['ID'])
        with open(zipfilename, "wb") as F:
            F.write(archivefile)

        t.add('Extract data')
        zipped_content = zipfile.ZipFile(zipfilename, 'r', zipfile.ZIP_DEFLATED)
        tmpdir = None
        repacked_files = []
        mutant_files = []
        rosetta_resids = []
        try:
            # The archive is only needed while extracting, so it is closed as
            # soon as extraction finishes or fails.
            try:
                tmpdir = makeTemp755Directory(output_dir)
                foundMutfile = False
                presumed_mutation = None
                mut_prefix = "%s/mut_" % r['ID']
                repacked_prefix = "%s/repacked_" % r['ID']
                resfile_prefix = "%s/%s-%s.resfile" % (r['ID'], r['ExperimentID'], experiment_pdbID)
                mutfile_prefix = "%s/%s-%s.mutfile" % (r['ID'], r['ExperimentID'], experiment_pdbID)
                for fname in sorted(zipped_content.namelist()):
                    is_mutant = fname.startswith(mut_prefix)
                    is_repacked = fname.startswith(repacked_prefix)
                    if fname.endswith(".pdb") and (is_mutant or is_repacked):
                        # The structure number is the text between the last
                        # underscore and the '.pdb' extension.
                        structnum = int(fname[fname.rindex('_')+1:-4])
                        if is_mutant:
                            mutation_tag = os.path.split(fname)[1].split('_')[1]
                            if presumed_mutation:
                                assert(presumed_mutation == mutation_tag)
                            else:
                                presumed_mutation = mutation_tag
                            newfname = 'mutant_%02d' % structnum
                        if is_repacked:
                            newfname = 'repacked_%02d' % structnum

                        newfilepath = os.path.join(tmpdir, newfname)
                        writeFile(newfilepath, zipped_content.read(fname))

                        if is_mutant:
                            mutant_files.append(newfilepath)
                        if is_repacked:
                            repacked_files.append(newfilepath)

                    if fname.startswith(resfile_prefix):
                        # Resfile-based jobs are not supported. The parsing
                        # code that followed this raise was unreachable and
                        # has been removed.
                        raise Exception('This case needs to be updated (see the mutfile section below). We mainly use mutfiles now so I did not update this section.')

                    if fname.startswith(mutfile_prefix):
                        foundMutfile = True
                        lines = zipped_content.read(fname).split("\n")
                        assert(lines[0].startswith('total '))
                        num_mutations = int(lines[0][6:])
                        assert(lines[1] == str(num_mutations))
                        # todo: note: assuming monomeric structures here
                        rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                        assert(len(rosetta_chain) == 1)
                        rosetta_chain = rosetta_chain[0]['Chain']

                        for resfile_mutation in lines[2:]:
                            resfile_mutation = resfile_mutation.split(" ")
                            assert(len(resfile_mutation) == 3)
                            rosetta_resids.append(resfile_mutation[1])
                            rosetta_mutaa = resfile_mutation[2]
                            assert(dbchain == rosetta_chain)
                            assert(len(rosetta_mutaa) == 1)
            finally:
                zipped_content.close()

            if not foundMutfile:
                # The fallback which rebuilt the mutfile from the stored
                # input files followed this raise, was unreachable, and has
                # been removed.
                raise Exception('This case needs to be updated (see the mutfile section below). This was added as a hack for cases where I did not store the mutfile so I did not update this section.')

            fullresids = []
            for rosetta_resid in rosetta_resids:
                fullresid = None
                if rosetta_resid.isdigit():
                    fullresid = '%s%s%s ' % (rosetta_chain, (4-len(rosetta_resid)) * ' ', rosetta_resid)
                else:
                    # Residue IDs that are not purely numeric are not expected.
                    assert(False)
                    fullresid = '%s%s%s' % (rosetta_chain, (5-len(rosetta_resid)) * ' ', rosetta_resid)
                fullresids.append(fullresid)

            resultst1 = ddGdb.execute_select("SELECT ExperimentID, UserDataSetExperimentID FROM Prediction WHERE ID=%s", parameters = (r['ID'],))
            assert(len(resultst1) == 1)
            ExperimentIDt1 = resultst1[0]['ExperimentID']
            UserDataSetExperimentIDt1 = resultst1[0]['UserDataSetExperimentID']
            if UserDataSetExperimentIDt1:
                resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM UserDataSetExperiment WHERE ID=%s", parameters = (UserDataSetExperimentIDt1,))
            else:
                resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM Experiment WHERE ID=%s", parameters = (ExperimentIDt1,))
            assert(len(resultst2) == 1)
            prediction_PDB_ID = resultst2[0]['PDBFileID']

            # Make sure the wild-type residue types match the structures.
            # This check is switched off by the leading 'False'; it is kept
            # because it records the residue numbering offsets of the
            # special-cased structures.
            if False and prediction_PDB_ID not in ['1TEN', '1AYE', '1H7M'] + ['1A2P', '1BNI', '1STN']:
                for fullresid in fullresids:
                    wtaa = None
                    for m in mutations:
                        # Hack for ub_RPN13
                        if prediction_PDB_ID == 'ub_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                            wtaa = m['WildTypeAA']
                        # Hack for ub_RPN13_yeast
                        elif prediction_PDB_ID == 'uby_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                            wtaa = m['WildTypeAA']
                        # Hack for ub_OTU
                        elif prediction_PDB_ID == 'ub_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                            wtaa = m['WildTypeAA']
                        # Hack for ub_OTU_yeast
                        elif prediction_PDB_ID == 'uby_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                            wtaa = m['WildTypeAA']
                        # Hack for ub_UQcon
                        elif prediction_PDB_ID == 'ub_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) + 213): # starts at 501
                            wtaa = m['WildTypeAA']
                        # Hack for uby_UQcon
                        elif prediction_PDB_ID == 'uby_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 287):
                            wtaa = m['WildTypeAA']
                        elif m['Chain'] == fullresid[0] and m['ResidueID'] == fullresid[1:].strip():
                            wtaa = m['WildTypeAA']
                    if wtaa is None:
                        colortext.error(prediction_PDB_ID)
                        colortext.error('wtaa == None')
                        colortext.error('fullresid = %s' % str(fullresid))
                        colortext.error(str(mutations))
                        colortext.warning([rosetta_resid.strip() for rosetta_resid in rosetta_resids])
                    assert(wtaa is not None)
                    assert(PDB.from_filepath(repacked_files[0]).get_residue_id_to_type_map()[fullresid] == wtaa)

            # The residue list is the same for every radius, so build it once.
            scored_resids = sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids])

            for radius in radii:
                score_name = ('noah_%0.1fA' % radius).replace(".", ",")

                if ddG_dict['data'].get(score_name):
                    print('Radius %0.1f: done.' % radius)
                    continue
                cases_computed += 1
                number_of_cases_left -= 1

                t.add('Radius %0.3f: repacked' % radius)
                colortext.printf("Prediction ID: %d. Calculating radius %0.1f. Calculation #%d of %d." % (r['ID'], radius, cases_computed, len(results) * len(radii)), 'orange')

                repacked_score = NoahScore()
                repacked_score.calculate(repacked_files, rosetta_chain, scored_resids, radius = radius)
                colortext.message("Repacked")
                print(repacked_score)

                t.add('Radius %0.3f: mutant' % radius)
                mutant_score = NoahScore()
                mutant_score.calculate(mutant_files, rosetta_chain, scored_resids, radius = radius)
                colortext.printf("Mutant", color = 'cyan')
                print(mutant_score)

                t.add('Radius %0.3f: postamble' % radius)
                colortext.printf("ddG", color = 'lightpurple')
                ddg_score = repacked_score.ddg(mutant_score)
                print(ddg_score)

                colortext.printf("Liz's ddG", color = 'yellow')
                print("Total score: %0.3f" % kellogg_ddG)

                # The version is always forced to 0.23 here, so the branches
                # for the older score layouts (0.1, 0.2, 0.22) could never
                # run and have been removed.
                ddG_dict['version'] = '0.23'
                ddG_dict['data'][score_name] = {
                    'total' : {'ddG' : ddg_score.total},
                    'positional' : {'ddG' : ddg_score.positional},
                    'positional_twoscore' : {'ddG' : ddg_score.positional_twoscore},
                }

                # Store the scores after each radius so that finished radii
                # survive a failure on a later one.
                jsonified_ddG = json.dumps(ddG_dict)
                ddGdb.execute('UPDATE Prediction SET Scores=%s WHERE ID=%s', parameters=(jsonified_ddG, r['ID'],))

            t.add('Cleanup')
            shutil.rmtree(tmpdir)
            os.remove(zipfilename)

        except Exception as e:
            # Best effort: report the failure, remember the prediction and
            # carry on with the next one.
            print("Exception! In prediction %d %s" % (r['ID'], str(e)))
            failed_cases.append(r['ID'])
            import traceback
            print(traceback.format_exc())
            if tmpdir:
                shutil.rmtree(tmpdir)

        total_time_in_secs += t.sum()
        average_time_taken = float(total_time_in_secs)/float(cases_computed or 1)
        estimate_remaining_time = number_of_cases_left * average_time_taken

        t.stop()
        colortext.printf("**Profile**", 'orange')
        print(t)
        colortext.message("Time taken for this case: %0.2fs." % t.sum())
        colortext.message("Average time taken per case: %0.2fs." % average_time_taken)
        colortext.message("Estimated time remaining: %dh%dm%ds." % (int(estimate_remaining_time/3600), int((estimate_remaining_time/60) % 60), estimate_remaining_time % 60))
        print("\n")

    # The failures were previously collected but never reported.
    if failed_cases:
        colortext.error("Rescoring failed for %d prediction(s): %s" % (len(failed_cases), ", ".join(map(str, failed_cases))))
def printAllEvents(self, calendar_id, year = None):
    """Print one line per event of the given calendar, in sorted key order.

    The events come from self.getEventsTable(calendar_id, year). Each key is
    indexed so that its first element is used as the start date; each value
    is indexed by "enddate", "location" and "title".

    :param calendar_id: identifier passed to get_calendar and getEventsTable.
    :param year: optional value passed through to getEventsTable.
    """
    colortext.message('Events on Calendar: %s' % (self.get_calendar(calendar_id).summary))
    eventstbl = self.getEventsTable(calendar_id, year)
    for startdateTitle, details in sorted(eventstbl.items()):
        startdate = startdateTitle[0]
        # Keep only the text before the first "@". Slicing up to
        # str.find("@") dropped the last character of any location without
        # an "@", because find returns -1 in that case.
        location = details["location"].partition("@")[0]
        # Non-ASCII characters are dropped from the printed line.
        print(("%s -> %s at %s: %s" % (startdate, details["enddate"], location, details["title"])).encode('ascii', 'ignore'))