def setup():
    global pdb_file_paths             # RCSB PDB_ID -> PDB file
    global rcsb_pdb_objects           # RCSB PDB_ID -> PDB object
    global tina_pdb_objects           # Tina's PDB_ID -> PDB object
    global tina_pdb_id_to_rcsb_pdb_id # Tina's PDB_ID -> RCSB PDB_ID
    global mutations_dataframe

    if mutations_dataframe is None: # use an explicit None check since the truth value of a pandas DataFrame is ambiguous
        setup_mutations_dataframe()

    # old_mutations_csv is missing some cases but has the mapping from pdb -> partner 1 name, partner 2 name
    old_mutations_csv = os.path.join('temp', 'mutations_Gsp1_old.txt')
    assert(os.path.exists('temp'))
    assert(os.path.exists(old_mutations_csv))
    df = pandas.read_csv(old_mutations_csv, sep = '\t')

    tina_pdb_ids = sorted(set([p for p in df['pdb'].values]))
    rcsb_pdb_ids = set()
    for pdb_id in tina_pdb_ids:
        rcsb_pdb_ids.add(pdb_id[:4])
        tina_pdb_id_to_rcsb_pdb_id[pdb_id] = pdb_id[:4]
    rcsb_pdb_ids = sorted(rcsb_pdb_ids)
    assert(rcsb_pdb_ids == sorted(set([p[:4] for p in mutations_dataframe['pdb'].values])))

    rcsb_file_dir = '../../rawdata'
    for pdb_id in tina_pdb_ids:
        tina_pdb_objects[pdb_id] = PDB.from_filepath(os.path.join('temp', 'pdbs', '{0}.pdb'.format(pdb_id)), parse_ligands = True)
    for pdb_id in rcsb_pdb_ids:
        filename = '{0}.pdb'.format(pdb_id.upper())
        pdb_file_paths[pdb_id.upper()] = os.path.join(rcsb_file_dir, filename)
        pdb_contents = download_pdb(pdb_id, rcsb_file_dir, silent = True, filename = filename)
        p = PDB(pdb_contents, parse_ligands = True)
        rcsb_pdb_objects[pdb_id] = p

    print('\nRosetta files ({0}) : {1}'.format(str(len(tina_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in tina_pdb_ids])))
    print('Original files ({0}) : {1}\n'.format(str(len(rcsb_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in rcsb_pdb_ids])))

    # Warn about PDB files and complexes which already exist in the database
    ppi_api = get_ppi_api()
    for pdb_id, pdb_file_path in pdb_file_paths.iteritems():
        existing_records = ppi_api.DDG_db.execute_select('SELECT * FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))
        if existing_records:
            colortext.warning('The PDB file {0} exists in the database.'.format(pdb_id))
        complex_ids = ppi_api.search_complexes_by_pdb_id(pdb_id)
        if complex_ids:
            colortext.warning('The PDB file {0} has associated complexes: {1}'.format(pdb_id, ', '.join(map(str, complex_ids))))
    print('')
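# A minimal usage sketch of the mapping built by setup() above (the helper name is
# invented for illustration): Tina's PDB IDs extend the four-character RCSB IDs with a
# local suffix, so the first four characters recover the parent RCSB entry.
def _example_tina_to_rcsb_mapping():
    setup()
    for tina_id, rcsb_id in sorted(tina_pdb_id_to_rcsb_pdb_id.iteritems()):
        assert(tina_id[:4] == rcsb_id)
        print('%s -> %s (%s)' % (tina_id, rcsb_id, pdb_file_paths[rcsb_id.upper()]))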
def get_kic_run_details(output_directory, pdb_id, loop_sets, test_mode = False):
    '''This function returns the details required to set up the analysis for the Rosetta KIC and NGK methods.'''
    details = []
    c = 0
    for sc_file in glob.glob(os.path.join(output_directory, '{0}*.sc'.format(pdb_id))):

        # Determine the id
        sc_filename = os.path.split(sc_file)[1]
        assert(sc_filename.startswith('{0}_score'.format(pdb_id)))
        run_id = int(sc_filename[10:-3]) # 10 == len(pdb_id) + len('_score') for the four-character PDB IDs used here; -3 strips '.sc'

        # Determine the score
        sc_lines = [l.strip() for l in get_file_lines(sc_file) if l.strip()]
        assert(sc_lines[0] == 'SEQUENCE:')
        assert(sc_lines[1].split()[:2] == ['SCORE:', 'total_score'])
        assert(sc_lines[2].split()[0] == 'SCORE:')
        total_score = float(sc_lines[2].split()[1])

        # Determine the filepath of the predicted structure
        associated_pdb_file = os.path.join(output_directory, '{0}_{0}{1}_0001.pdb'.format(pdb_id, run_id))

        # Extract the PDB coordinates into a pandas dataframe, cached on disk in HDF5 format
        assert(os.path.exists(associated_pdb_file))
        hdf5_file = os.path.splitext(associated_pdb_file)[0] + '.hdf5'
        if os.path.exists(hdf5_file):
            store = pandas.HDFStore(hdf5_file)
            pdb_loop_residue_matrix = store['dataframe']
            store.close()
        else:
            pdb_loop_residue_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(associated_pdb_file).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
            store = pandas.HDFStore(hdf5_file)
            store['dataframe'] = pdb_loop_residue_matrix
            store.close()

        details.append(dict(
            id = run_id,
            score = total_score,
            predicted_structure = associated_pdb_file,
            pdb_loop_residue_matrix = pdb_loop_residue_matrix,
        ))

        if test_mode:
            c += 1
            if c >= 10:
                break

    return details
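# A hedged usage sketch for get_kic_run_details. The output directory, PDB ID, and loop
# file path below are invented for illustration; loop_sets would normally be parsed from
# the benchmark's <pdb_id>.loop.json file as in extract_analysis_data below.
def _example_get_kic_run_details():
    loop_sets = json.loads(read_file('1abc.loop.json')) # hypothetical loop definition file
    details = get_kic_run_details('kic_output', '1abc', loop_sets, test_mode = True)
    # Each entry carries the run id, total score, predicted structure path, and coordinate matrix
    for d in sorted(details, key = lambda x: x['score']):
        print('run %d: score %0.3f (%s)' % (d['id'], d['score'], d['predicted_structure']))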
def static_get_pdb_object(pdb_id, bio_cache = None, cache_dir = None):
    '''This method does not necessarily use a BioCache but it seems to fit here.'''
    pdb_id = pdb_id.upper()

    if bio_cache:
        return bio_cache.get_pdb_object(pdb_id)

    if cache_dir:
        # Check to see whether we have a cached copy of the PDB file
        filepath = os.path.join(cache_dir, '{0}.pdb'.format(pdb_id))
        if os.path.exists(filepath):
            return PDB.from_filepath(filepath)

    # Get any missing files from the RCSB and create cached copies if appropriate
    pdb_contents = retrieve_pdb(pdb_id)
    if cache_dir:
        write_file(os.path.join(cache_dir, '{0}.pdb'.format(pdb_id)), pdb_contents)
    return PDB(pdb_contents)
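# A minimal sketch of the caching behavior above ('/tmp/pdb_cache' is an illustrative
# path): the first call misses the cache, downloads the entry from the RCSB, and writes
# 1A2C.pdb into the cache directory; the second call is served from the cached copy
# without a network request.
def _example_static_get_pdb_object():
    p1 = static_get_pdb_object('1a2c', cache_dir = '/tmp/pdb_cache') # downloads and caches
    p2 = static_get_pdb_object('1a2c', cache_dir = '/tmp/pdb_cache') # reads the cached file
    return p1, p2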
def main(FixedIDs = [], radii = [6.0, 7.0, 8.0, 9.0]):
    max_processors = get_number_of_processors()
    rescore_process_file = "/tmp/klab_rescore.txt"

    parser = OptionParser()
    parser.add_option("-n", "--numprocesses", default=1, type='int', dest="num_processes", help="The number of processes used for the rescoring. The cases are split according to this number.", metavar="NUM_PROCESSES")
    parser.add_option("-p", "--process", default=1, type='int', dest="process", help="The ID of this process. This should be an integer between 1 and the number of processes used for the rescoring.", metavar="PROCESS_ID")
    parser.add_option("-d", "--delete", action="store_true", dest="delete", help="Delete the process tracking file %s." % rescore_process_file)
    parser.add_option("-s", "--set", type='string', dest="prediction_set", help="The prediction set to rescore.")
    (options, args) = parser.parse_args()

    if options.delete and os.path.exists(rescore_process_file):
        print("Removing %s." % rescore_process_file)
        os.remove(rescore_process_file)

    num_processes = options.num_processes
    prediction_set = options.prediction_set
    process_id = options.process

    for i in FixedIDs:
        assert(type(i) == type(1))

    # e.g. SELECT * FROM `Prediction` WHERE `PredictionSet`='RosCon2013_P16_score12prime' AND Status='done' LIMIT 1

    # Check that the prediction set exists
    if not prediction_set:
        raise colortext.Exception("A prediction set must be specified.")
    else:
        if FixedIDs:
            results = ddGdb.execute("SELECT DISTINCT PredictionSet FROM Prediction WHERE ID IN (%s)" % ",".join(map(str, FixedIDs)))
            if len(results) != 1:
                raise colortext.Exception("Error: The fixed IDs cover %d different prediction sets." % len(results))
        else:
            results = ddGdb.execute("SELECT ID FROM PredictionSet WHERE ID=%s", parameters=(prediction_set,))
            if not results:
                raise colortext.Exception("The prediction set '%s' does not exist in the database." % prediction_set)

    # Check the process parameters
    if num_processes < 1:
        raise colortext.Exception("At least 1 processor must be used.")
    if num_processes > max_processors:
        raise colortext.Exception("Only %d processors/cores were detected. Cannot run with %d processes." % (max_processors, num_processes))
    if num_processes > (max_processors * 0.75):
        colortext.warning("Warning: Using %d processors/cores out of %d which is %0.2f%% of the total available." % (num_processes, max_processors, (100.0 * float(num_processes) / float(max_processors))))
    if not(1 <= process_id <= min(max_processors, num_processes)):
        raise colortext.Exception("The process ID %d must be between 1 and the number of processes, %d." % (process_id, num_processes))

    # Register this process in the process tracking file
    if os.path.exists(rescore_process_file):
        lines = readFileLines(rescore_process_file)
        idx = lines[0].find("numprocesses")
        if idx == -1:
            raise Exception("Badly formatted %s." % rescore_process_file)
        existing_num_processes = int(lines[0][idx + len("numprocesses"):])
        if existing_num_processes != num_processes:
            raise colortext.Exception("You specified the number of processes to be %d but %s already specifies it as %d." % (num_processes, rescore_process_file, existing_num_processes))
        for line in [line for line in lines[1:] if line.strip()]:
            idx = line.find("process")
            if idx == -1:
                raise colortext.Exception("Badly formatted %s. Line is '%s'." % (rescore_process_file, line))
            existing_process = int(line[idx + len('process'):])
            if process_id == existing_process:
                raise colortext.Exception("Process %d is already logged as running. Check if this is so and edit %s." % (process_id, rescore_process_file))
        F = open(rescore_process_file, 'a')
        F.write("process %d\n" % process_id)
        F.close()
    else:
        F = open(rescore_process_file, 'w')
        F.write("numprocesses %d\n" % num_processes)
        F.write("process %d\n" % process_id)
        F.close()

    output_dir = os.path.join('rescoring', str(process_id))
    if not(os.path.exists(output_dir)):
        os.makedirs(output_dir)
    abs_output_dir = os.path.abspath(os.path.join(os.getcwd(), output_dir))
    print("Running process in %s.\n" % abs_output_dir)

    ReallyFixedIDs = False

    # Make sure that all scored predictions in the set use the current score revision
    results = ddGdb.execute("SELECT ID, ExperimentID, Scores FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion <> %s", parameters=(prediction_set, float(current_score_revision),))
    if not(FixedIDs) and results:
        raise WrongScoreRevisionException("Score versions found which are not %s. Need to update table structure." % current_score_revision)
    else:
        # Hacky way to run multiple processes: each process picks up the predictions whose IDs equal process_id - 1 modulo num_processes
        if ReallyFixedIDs:
            num_to_score = len(remaining_unscored)
            num_for_this_to_score = num_to_score / num_processes
            IDs_to_score = remaining_unscored[(process_id - 1) * num_for_this_to_score : process_id * num_for_this_to_score]
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s)" % (",".join(map(str, IDs_to_score))))
        elif FixedIDs:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s) AND MOD(ID,%s)=%s" % (",".join(map(str, FixedIDs)), num_processes, process_id - 1))
        else:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion=%s AND MOD(ID,%s)=%s", parameters=(prediction_set, float(current_score_revision), num_processes, process_id - 1))

    count = 0
    cases_computed = 0
    total_time_in_secs = 0
    number_of_cases_left = len(results) * len(radii)
    failed_cases = []
    colortext.printf("Rescoring %d predictions over %d radii...\n" % (len(results), len(radii)), 'lightgreen')

    for r in results:
        t = Timer()
        t.add('Preamble')
        inner_count = 0

        mutations = ddGdb.execute('SELECT * FROM ExperimentMutation WHERE ExperimentID=%s', parameters=(r['ExperimentID'],))
        mutation_str = ', '.join(['%s %s%s%s' % (m['Chain'], m['WildTypeAA'], m['ResidueID'], m['MutantAA']) for m in mutations])
        extracted_data = False

        details = ddGdb.execute_select('SELECT Prediction.ID, PDBFileID, Chain FROM Prediction INNER JOIN Experiment ON Prediction.ExperimentID=Experiment.ID INNER JOIN ExperimentChain ON Prediction.ExperimentID=ExperimentChain.ExperimentID WHERE Prediction.ID=%s', parameters=(r['ID'],))
        colortext.message("Prediction: %d, %s chain %s. Mutations: %s. Experiment ID #%d. UserDataSetExperimentID #%d." % (details[0]['ID'], details[0]['PDBFileID'], details[0]['Chain'], mutation_str, r['ExperimentID'], r['UserDataSetExperimentID']))

        experiment_pdbID = ddGdb.execute('SELECT PDBFileID FROM Experiment WHERE ID=%s', parameters=(r['ExperimentID'],))[0]['PDBFileID']
        print('Experiment PDB file ID = %s' % experiment_pdbID)
        pdbID = ddGdb.execute('SELECT UserDataSetExperiment.PDBFileID FROM Prediction INNER JOIN UserDataSetExperiment ON UserDataSetExperimentID=UserDataSetExperiment.ID WHERE Prediction.ID=%s', parameters=(r['ID'],))[0]['PDBFileID']
        print('UserDataSetExperiment PDB file ID = %s' % pdbID)

        count += 1
        timestart = time.time()

        # todo: note: assuming monomeric structures here
        dbchains = sorted(set([mutation['Chain'] for mutation in mutations]))
        assert(len(dbchains) == 1)
        dbchain = dbchains[0]

        ddG_dict = json.loads(r['Scores'])
        kellogg_ddG = ddG_dict['data']['kellogg']['total']['ddG']

        # Skip this prediction if all of the radii have already been scored
        all_done = True
        for radius in radii:
            score_name = ('noah_%0.1fA' % radius).replace(".", ",")
            if not(ddG_dict['data'].get(score_name)):
                all_done = False
            else:
                cases_computed += 1
                number_of_cases_left -= 1
        if all_done:
            print('Prediction %d: done.' % r["ID"])
            continue

        # Extract data
        t.add('Grab data')
        archivefile = DDG_interface.getData(r['ID'])
        zipfilename = os.path.join(output_dir, "%d.zip" % r['ID'])
        F = open(zipfilename, "wb")
        F.write(archivefile)
        F.close()

        t.add('Extract data')
        zipped_content = zipfile.ZipFile(zipfilename, 'r', zipfile.ZIP_DEFLATED)
        tmpdir = None
        repacked_files = []
        mutant_files = []
        rosetta_resids = []
        try:
            tmpdir = makeTemp755Directory(output_dir)
            highestIndex = -1
            foundResfile = False
            foundMutfile = False
            presumed_mutation = None

            for fname in sorted(zipped_content.namelist()):
                if fname.endswith(".pdb"):
                    if fname.startswith("%s/mut_" % r['ID']) or fname.startswith("%s/repacked_" % r['ID']):
                        structnum = int(fname[fname.rindex('_') + 1:-4])
                        if fname.startswith("%s/mut_" % r['ID']):
                            if presumed_mutation:
                                assert(presumed_mutation == os.path.split(fname)[1].split('_')[1])
                            else:
                                presumed_mutation = os.path.split(fname)[1].split('_')[1]
                            newfname = 'mutant_%02d' % structnum
                        if fname.startswith("%s/repacked_" % r['ID']):
                            newfname = 'repacked_%02d' % structnum
                        highestIndex = max(highestIndex, structnum)
                        newfilepath = os.path.join(tmpdir, newfname)
                        writeFile(newfilepath, zipped_content.read(fname))
                        if fname.startswith("%s/mut_" % r['ID']):
                            mutant_files.append(newfilepath)
                        if fname.startswith("%s/repacked_" % r['ID']):
                            repacked_files.append(newfilepath)
                if fname.startswith("%s/%s-%s.resfile" % (r['ID'], r['ExperimentID'], experiment_pdbID)):
                    raise Exception('This case needs to be updated (see the mutfile section below). We mainly use mutfiles now so I did not update this section.')
                    # Note: the code below is unreachable; this path was deliberately disabled above.
                    foundResfile = True
                    lines = zipped_content.read(fname).split("\n")
                    assert(len(lines) == 3)
                    assert(lines[0] == "NATAA")
                    assert(lines[1] == "start")
                    resfile_mutation = lines[2].split(" ")
                    assert(len(resfile_mutation) == 4)
                    rosetta_resid = resfile_mutation[0]
                    rosetta_chain = resfile_mutation[1]
                    rosetta_mutaa = resfile_mutation[3]
                    assert(mutantaa == rosetta_mutaa)
                    assert(dbchain == rosetta_chain)
                    assert(resfile_mutation[2] == 'PIKAA')
                    assert(len(rosetta_mutaa) == 1)
                if fname.startswith("%s/%s-%s.mutfile" % (r['ID'], r['ExperimentID'], experiment_pdbID)):
                    foundMutfile = True
                    lines = zipped_content.read(fname).split("\n")
                    assert(lines[0].startswith('total '))
                    num_mutations = int(lines[0][6:])
                    assert(lines[1] == str(num_mutations))
                    # todo: note: assuming monomeric structures here
                    rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                    assert(len(rosetta_chain) == 1)
                    rosetta_chain = rosetta_chain[0]['Chain']
                    resfile_mutations = lines[2:]
                    for resfile_mutation in resfile_mutations:
                        resfile_mutation = resfile_mutation.split(" ")
                        assert(len(resfile_mutation) == 3)
                        rosetta_resids.append(resfile_mutation[1])
                        rosetta_mutaa = resfile_mutation[2]
                        assert(dbchain == rosetta_chain)
                        assert(len(rosetta_mutaa) == 1)

            # Make sure the wtaa->mutantaa types match the structures
            assert(not(foundResfile))

            if not foundMutfile:
                raise Exception('This case needs to be updated (see the mutfile section below). This was added as a hack for cases where I did not store the mutfile so I did not update this section.')
                # Note: the code below is unreachable; this path was deliberately disabled above.
                input_files = ddGdb.execute_select('SELECT InputFiles FROM Prediction WHERE ID=%s', parameters=(r['ID'],))
                assert(len(input_files) == 1)
                lines = pickle.loads(input_files[0]['InputFiles'])['MUTFILE'].split("\n")
                assert(len(lines) == 3)
                assert(lines[0] == "total 1")
                assert(lines[1] == "1")
                resfile_mutation = lines[2].split(" ")
                assert(len(resfile_mutation) == 3)
                rosetta_resid = resfile_mutation[1]
                rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                assert(len(rosetta_chain) == 1)
                rosetta_chain = rosetta_chain[0]['Chain']
                rosetta_mutaa = resfile_mutation[2]
                assert(dbchain == rosetta_chain)
                assert(len(rosetta_mutaa) == 1)
                assert("%s%s%s" % (resfile_mutation[0], resfile_mutation[1], resfile_mutation[2]) == presumed_mutation)

            fullresids = []
            for rosetta_resid in rosetta_resids:
                fullresid = None
                if rosetta_resid.isdigit():
                    fullresid = '%s%s%s ' % (rosetta_chain, (4 - len(rosetta_resid)) * ' ', rosetta_resid)
                else:
                    assert(False)
                    fullresid = '%s%s%s' % (rosetta_chain, (5 - len(rosetta_resid)) * ' ', rosetta_resid)
                fullresids.append(fullresid)

            resultst1 = ddGdb.execute_select("SELECT ExperimentID, UserDataSetExperimentID FROM Prediction WHERE ID=%s", parameters = (r['ID'],))
            assert(len(resultst1) == 1)
            ExperimentIDt1 = resultst1[0]['ExperimentID']
            UserDataSetExperimentIDt1 = resultst1[0]['UserDataSetExperimentID']
            if UserDataSetExperimentIDt1:
                resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM UserDataSetExperiment WHERE ID=%s", parameters = (UserDataSetExperimentIDt1,))
            else:
                resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM Experiment WHERE ID=%s", parameters = (ExperimentIDt1,))
            assert(len(resultst2) == 1)
            prediction_PDB_ID = resultst2[0]['PDBFileID']

            # Disabled sanity check: make sure the wild-type residue types in the database match the repacked structures
            if False and prediction_PDB_ID not in ['1TEN', '1AYE', '1H7M'] + ['1A2P', '1BNI', '1STN']:
                for fullresid in fullresids:
                    wtaa = None
                    for m in mutations:
                        # Hack for ub_RPN13
                        if prediction_PDB_ID == 'ub_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                            wtaa = m['WildTypeAA']
                        # Hack for ub_RPN13_yeast
                        elif prediction_PDB_ID == 'uby_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                            wtaa = m['WildTypeAA']
                        # Hack for ub_OTU
                        elif prediction_PDB_ID == 'ub_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                            wtaa = m['WildTypeAA']
                        # Hack for ub_OTU_yeast
                        elif prediction_PDB_ID == 'uby_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                            wtaa = m['WildTypeAA']
                        # Hack for ub_UQcon
                        elif prediction_PDB_ID == 'ub_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) + 213): # starts at 501
                            wtaa = m['WildTypeAA']
                        # Hack for uby_UQcon
                        elif prediction_PDB_ID == 'uby_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 287):
                            wtaa = m['WildTypeAA']
                        elif m['Chain'] == fullresid[0] and m['ResidueID'] == fullresid[1:].strip():
                            wtaa = m['WildTypeAA']
                    if wtaa == None:
                        colortext.error(prediction_PDB_ID)
                        colortext.error('wtaa == None')
                        colortext.error('fullresid = %s' % str(fullresid))
                        colortext.error(str(mutations))
                        colortext.warning([rosetta_resid.strip() for rosetta_resid in rosetta_resids])
                    assert(wtaa != None)
                    assert(PDB.from_filepath(repacked_files[0]).get_residue_id_to_type_map()[fullresid] == wtaa)
                    #assert(PDB(mutant_files[0]).get_residue_id_to_type_map()[fullresid] == mutantaa)

            for radius in radii:
                score_name = ('noah_%0.1fA' % radius).replace(".", ",")
                if ddG_dict['data'].get(score_name):
                    print('Radius %0.1f: done.' % radius)
                    continue
                cases_computed += 1
                number_of_cases_left -= 1

                t.add('Radius %0.3f: repacked' % radius)
                colortext.printf("Prediction ID: %d. Calculating radius %0.1f. Calculation #%d of %d." % (r['ID'], radius, cases_computed, len(results) * len(radii)), 'orange')

                repacked_score = NoahScore()
                repacked_score.calculate(repacked_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                colortext.message("Repacked")
                print(repacked_score)

                t.add('Radius %0.3f: mutant' % radius)
                mutant_score = NoahScore()
                mutant_score.calculate(mutant_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                colortext.printf("Mutant", color = 'cyan')
                print(mutant_score)

                t.add('Radius %0.3f: postamble' % radius)
                colortext.printf("ddG", color = 'lightpurple')
                ddg_score = repacked_score.ddg(mutant_score)
                print(ddg_score)

                colortext.printf("Liz's ddG", color = 'yellow')
                print("Total score: %0.3f" % kellogg_ddG)

                # Hack: force the current score dict version so that the '0.23' branch below is taken.
                # The earlier version-migration branches are retained for reference.
                ddG_dict['version'] = '0.23'
                if ddG_dict['version'] == '0.1':
                    ddG_dict['version'] = '0.21'
                    ddG_dict['data'] = {
                        'kellogg' : {
                            'total' : ddG_dict['data'],
                        },
                        'noah': {
                            'total' : {'ddG' : ddg_score.total},
                            'positional' : {'ddG' : ddg_score.positional},
                            'positional_twoscore' : {'ddG' : ddg_score.positional_twoscore},
                        },
                    }
                elif ddG_dict['version'] == '0.2':
                    ddG_dict['version'] = '0.21'
                    ddG_dict['data']['noah']['total']['ddG'] = ddg_score.total
                    ddG_dict['data']['noah']['positional']['ddG'] = ddg_score.positional
                    ddG_dict['data']['noah']['positional_twoscore']['ddG'] = ddg_score.positional_twoscore
                elif ddG_dict['version'] == '0.22':
                    ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}}
                    ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total
                    ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional
                    ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore
                elif ddG_dict['version'] == '0.23':
                    ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}}
                    ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total
                    ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional
                    ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore

                jsonified_ddG = json.dumps(ddG_dict)
                ddGdb.execute('UPDATE Prediction SET Scores=%s WHERE ID=%s', parameters=(jsonified_ddG, r['ID'],))

            t.add('Cleanup')
            shutil.rmtree(tmpdir)
            os.remove(zipfilename)

        except Exception, e:
            print("Exception! In prediction %d" % r['ID'], str(e))
            failed_cases.append(r['ID'])
            import traceback
            print(traceback.format_exc())
            if tmpdir:
                shutil.rmtree(tmpdir)

        total_time_in_secs += t.sum()
        average_time_taken = float(total_time_in_secs) / float(cases_computed or 1)
        estimate_remaining_time = number_of_cases_left * average_time_taken

        t.stop()
        colortext.printf("**Profile**", 'orange')
        print(t)
        colortext.message("Time taken for this case: %0.2fs." % t.sum())
        colortext.message("Average time taken per case: %0.2fs." % average_time_taken)
        colortext.message("Estimated time remaining: %dh%dm%ds." % (int(estimate_remaining_time / 3600), int((estimate_remaining_time / 60) % 60), estimate_remaining_time % 60))
        print("\n")
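# A self-contained sketch of the MOD-based work partitioning used in the queries above:
# process p (1-based) handles the predictions where ID % num_processes == p - 1, so running
# processes 1..num_processes covers the prediction set exactly once. The function name and
# the example IDs are invented for illustration.
def _example_partition_predictions(prediction_ids, num_processes, process_id):
    return [i for i in prediction_ids if i % num_processes == process_id - 1]

# e.g. _example_partition_predictions(range(10), 4, 1) == [0, 4, 8]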
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the
       prediction results e.g. get_kic_run_details.
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results
       e.g. the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
    '''

    # Sanity check
    assert(top_x <= expectn)

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subangstrom = {}
    top_x_percent_subangstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangstrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:

        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert(os.path.exists(rcsb_reference_pdb))
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert(os.path.exists(rosetta_reference_pdb))
        assert(len(pdb_id) == 4)
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert(len(loop_sets['LoopSet']) == 1)

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB files
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            loop_prediction = loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe.
        # It is more efficient to do this after truncation if truncating by score but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing) then arbitrary subsets can be chosen
        # in the loop above.
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_loop_prediction_sets[pdb_id] = loop_prediction_set[:top_x]
        median_scoring_structures[pdb_id] = loop_prediction_set[int(expectn / 2)]

        # Determine the lowest-/best-scoring and highest-/worst-scoring structures
        best_scoring_structures[pdb_id] = loop_prediction_set[0]
        best_score = best_scoring_structures[pdb_id].score
        worst_scoring_structures[pdb_id] = loop_prediction_set[-1]
        worst_score = worst_scoring_structures[pdb_id].score
        assert(top_x_loop_prediction_sets[pdb_id][0] == best_scoring_structures[pdb_id])

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_loop_prediction_sets[pdb_id])
        colortext.warning('Top1 structure')
        print(best_scoring_structures[pdb_id])
        colortext.warning('Median (by score) structure')
        print(median_scoring_structures[pdb_id])
        colortext.warning('Worst (by score) structure')
        print(worst_scoring_structures[pdb_id])

        # Create values for the TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangstrom_by_top_x[top_x_var] = percent_subangstrom_by_top_x.get(top_x_var, {})
            percent_subangstrom_by_top_x[top_x_var][pdb_id] = percent_subangstrom

        total_percent_subangstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subangstrom[pdb_id] = 100 * top_x_loop_prediction_sets[pdb_id].fraction_with_rmsd_lt(1.0)
        colortext.warning('Percentage of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subangstrom[pdb_id]))
        colortext.warning('Percentage of sub-angstrom cases in the Top{0} structures: {1}'.format(top_x, top_x_percent_subangstrom[pdb_id]))

        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        top_1_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_score = best_scoring_structures[pdb_id].score
        for s in top_x_loop_prediction_sets[pdb_id]:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert(top_x_score <= worst_score)
        assert(top_x_rmsd <= top_1_rmsd)

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subangstrom[pdb_id], top_x_percent_subangstrom[pdb_id], best_score, top_x_score, median_scoring_structures[pdb_id].score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a column of median percent subangstrom values
    for top_x_var, values_by_pdb in sorted(percent_subangstrom_by_top_x.iteritems()):
        assert(sorted(values_by_pdb.keys()) == sorted(pdb_ids))
        median_value = sorted(values_by_pdb.values())[len(pdb_ids) / 2]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{0}\t{1}'.format(top_x_var, median_value))

    write_file('{0}analysis.csv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}analysis.tsv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))
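# A self-contained sketch of the TopX RMSD metric described in the docstring above: among
# the top_x best-scoring (lowest-energy) predictions, report the lowest RMSD. The function
# name and the (score, rmsd) pairs below are invented for illustration.
def _example_top_x_rmsd(scored_predictions, top_x):
    by_score = sorted(scored_predictions) # sort by score, ascending (lower is better)
    return min(rmsd for score, rmsd in by_score[:top_x])

# e.g. _example_top_x_rmsd([(-20.0, 1.4), (-19.5, 0.8), (-19.0, 2.1), (-15.0, 0.3)], 2) == 0.8
# (the globally closest model at 0.3A scores too poorly to appear in the Top2)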
def setup_jobs(outpath, options, input_files):
    ''' This function sets up the jobs by creating the necessary input files as expected.
         - outpath is where the output is to be stored.
         - options is the optparse options object.
         - input_files is a list of paths to input files.
    '''
    job_inputs = None
    reverse_mapping = None
    fasta_file_contents = {}

    # Generate FASTA files for PDB inputs.
    # fasta_file_contents is a mapping from a file path to a pair (FASTA contents, file type). We remember the file type
    # since we offset residue IDs depending on file type i.e. for FASTA files, we treat each sequence separately and do
    # not renumber the fragments in postprocessing. For PDB files, however, we need to respect the order and length of
    # sequences so that we renumber the fragments appropriately in postprocessing - we assume that if a PDB file is passed in
    # then all chains (protein, RNA, or DNA) will be used in a Rosetta run.
    for input_file in input_files:
        assert(not(fasta_file_contents.get(input_file)))
        if any(fnmatch(input_file, x) for x in pdb_file_wildcards):
            pdb = PDB.from_filepath(input_file, strict = True)
            pdb.pdb_id = os.path.basename(input_file).split('.')[0]
            if pdb.pdb_id.startswith('pdb') and len(pdb.pdb_id) >= 7:
                # Hack to rename FASTA identifiers for pdb*.ent files which are present in mirrors of the PDB
                pdb.pdb_id = pdb.pdb_id.replace('pdb', '')
            fasta_file_contents[input_file] = (pdb.create_fasta(prefer_seqres_order = False), 'PDB')
        else:
            fasta_file_contents[input_file] = (read_file(input_file), 'FASTA')

    # Extract sequences from the input FASTA files.
    found_sequences, reverse_mapping, errors = get_sequences(options, fasta_file_contents)
    if found_sequences:
        reformat(found_sequences)
    if errors:
        return None, False, errors

    # Discard sequences that are the wrong chain.
    desired_sequences = {}
    for key, sequence in found_sequences.iteritems():
        pdb_id, chain, file_name = key
        if options.chain is None or chain == options.chain:
            desired_sequences[key] = sequence

    # Create the input FASTA and script files.
    job_inputs, errors = create_inputs(options, outpath, desired_sequences)

    # Create the reverse mapping file
    if reverse_mapping:
        segment_mapping_file = os.path.join(outpath, "segment_map.json")
        colorprinter.message("Creating a reverse mapping file %s." % segment_mapping_file)
        write_file(segment_mapping_file, json.dumps(reverse_mapping))

    # Create the post-processing script file
    post_processing_script = read_file(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'post_processing.py'))
    write_file(os.path.join(outpath, 'post_processing.py'), post_processing_script, 'w')

    # Create the secondary structure filter file
    if options.secondary_structure_file:
        write_file(os.path.join(outpath, 'ss_filter.json'), json.dumps({'secondary_structure_filter' : SecondaryStructureDefinition.from_filepath(options.secondary_structure_file).data}), 'w')

    return job_inputs, reverse_mapping != None, errors
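# An illustrative call, assuming an optparse options object like the one built by this
# script's option parser (the file names below are hypothetical). PDB inputs are converted
# to FASTA via PDB.create_fasta so that chain order is preserved for fragment renumbering
# in postprocessing, while plain FASTA inputs are read verbatim:
#
#   job_inputs, has_segment_mapping, errors = setup_jobs('fragments_output', options, ['1a2c.pdb', 'query.fasta'])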
# align_two_simple_sequences(fasta_sequence, uniparc_sequence, sequence1name = '%s:%s|PDBID|CHAIN|SEQUENCE' % (pdb_id, c), sequence2name = uniparc_id)
# Sanity check - see whether the UniProt AC in the PDB file is in the list of matched UniProt IDs.
# The two debugging lines below are disabled since 'chains' is not defined in this fragment:
#print(chains)
#sys.exit(0)

px = PDBML.retrieve('1A2C', cache_dir='/home/oconchus/temp')
for k, v in sorted(px.atom_to_seqres_sequence_maps.iteritems(), key = lambda x: (x[0], x[1])):
    print(k, v)

p = PDB.from_filepath('../.testdata/1H38.pdb') # has protein, DNA, RNA
p = PDB.from_filepath('../.testdata/1ZC8.pdb')
p = PDB.from_filepath('../.testdata/4IHY.pdb')
#p = PDB('../.testdata/2GRB.pdb')
p = PDB.from_filepath('../.testdata/1J1M.pdb')
p = PDB.from_filepath('../.testdata/1A2C.pdb')
#print(p.structure_lines)

colortext.message("Resolution")
print(p.get_resolution())
colortext.message("Techniques")
print(p.get_techniques())