def get_kic_run_details(output_directory, pdb_id, loop_sets, test_mode = False):
    '''Return the details required to set up the analysis for the Rosetta KIC and NGK methods.

       output_directory is searched for score files matching "<pdb_id>*.sc". Each score file must be named
       "<pdb_id>_score<run_id>.sc" and have the predicted structure "<pdb_id>_<pdb_id><run_id>_0001.pdb" beside it.
       pdb_id is the identifier of the benchmark case.
       loop_sets is the parsed loop definition passed through to PDB.extract_xyz_matrix_from_loop_json.
       test_mode stops the search once ten predictions have been read.

       Returns a list of dicts with the keys id (the integer run id), score (the total_score value read from the score
       file), predicted_structure (the path of the predicted PDB file), and pdb_loop_residue_matrix (the coordinates of
       the loop backbone atoms).

       As a side effect, the coordinate matrix of each prediction is cached in an HDF5 file next to the PDB file
       (same name with an .hdf5 extension) under the key 'dataframe' and is reused on subsequent calls.
    '''

    score_file_prefix = '{0}_score'.format(pdb_id)
    score_file_suffix = '.sc'

    details = []

    # glob returns files in an arbitrary, OS-dependent order. Sorting makes the subset chosen in test mode (and the
    # order of the returned list) reproducible.
    for sc_file in sorted(glob.glob(os.path.join(output_directory, '{0}*.sc'.format(pdb_id)))):

        # Determine the id. The run id is whatever sits between the prefix and the extension; deriving the offset
        # from the prefix avoids assuming that pdb_id is exactly four characters long.
        sc_filename = os.path.split(sc_file)[1]
        assert(sc_filename.startswith(score_file_prefix))
        run_id = int(sc_filename[len(score_file_prefix):-len(score_file_suffix)])

        # Determine the score. The third non-blank line is expected to hold the values for the header on the second.
        sc_lines = [l.strip() for l in get_file_lines(sc_file) if l.strip()]
        assert(sc_lines[0] == 'SEQUENCE:')
        assert(sc_lines[1].split()[:2] == ['SCORE:', 'total_score'])
        assert(sc_lines[2].split()[0] == 'SCORE:')
        total_score = float(sc_lines[2].split()[1])

        # Determine the filepath of the predicted structure
        associated_pdb_file = os.path.join(output_directory, '{0}_{0}{1}_0001.pdb'.format(pdb_id, run_id))

        # Extract the PDB coordinates into a pandas dataframe (HDF5 format)
        assert(os.path.exists(associated_pdb_file))
        hdf5_file = os.path.splitext(associated_pdb_file)[0] + '.hdf5'
        if os.path.exists(hdf5_file):
            # Reuse the cached matrix. The store is closed even if the read fails.
            store = pandas.HDFStore(hdf5_file)
            try:
                pdb_loop_residue_matrix = store['dataframe']
            finally:
                store.close()
        else:
            # Compute the matrix before opening the store so that a parsing failure does not leave an open handle
            pdb_loop_residue_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(associated_pdb_file).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
            store = pandas.HDFStore(hdf5_file)
            try:
                store['dataframe'] = pdb_loop_residue_matrix
            finally:
                store.close()

        details.append(dict(
            id = run_id,
            score = total_score,
            predicted_structure = associated_pdb_file,
            pdb_loop_residue_matrix = pdb_loop_residue_matrix,
        ))

        # Only read a handful of predictions when testing
        if test_mode and len(details) >= 10:
            break

    return details
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       dataset_list_file lists the benchmark cases, one path per non-blank line. The PDB ID of a case is the base name
       of the path without its extension and must be four characters long.
       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the
       prediction results e.g. get_kic_run_details
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results e.g.
       the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
       test_mode restricts the analysis to the first ten benchmark cases and is passed on to data_extraction_method.

       Three files are written: "<prefix>analysis.csv" and "<prefix>analysis.tsv" (the same tab-separated per-case
       summary) and "<prefix>percentage_subangstrom_over_top_X.tsv" (the percentage of sub-angstrom models among the
       TopX structures for every X, per case, followed by the median over the cases).

       The process exits with status 1 if a case does not have exactly expectn predictions after truncation.
    '''

    # Sanity check
    assert(top_x <= expectn)

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subangstrom = {}
    top_x_percent_subangstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangstrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:

        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert(os.path.exists(rcsb_reference_pdb))
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert(os.path.exists(rosetta_reference_pdb))
        assert(len(pdb_id) == 4)
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert(len(loop_sets['LoopSet']) == 1)

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB file
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe
        # It is more efficient to do this after truncation if truncating by score but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing) then arbitrary subsets can be chosen
        # in the loop above
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_loop_prediction_sets[pdb_id] = loop_prediction_set[:top_x]
        median_scoring_structures[pdb_id] = loop_prediction_set[int(expectn / 2)]

        # Determine the lowest-/best-scoring structure (first after sorting by score) and the worst-scoring one (last)
        best_scoring_structures[pdb_id] = loop_prediction_set[0]
        best_score = best_scoring_structures[pdb_id].score
        worst_scoring_structures[pdb_id] = loop_prediction_set[-1]
        worst_score = worst_scoring_structures[pdb_id].score
        assert(top_x_loop_prediction_sets[pdb_id][0] == best_scoring_structures[pdb_id])

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_loop_prediction_sets[pdb_id])
        colortext.warning('Top1 structure')
        print(best_scoring_structures[pdb_id])
        colortext.warning('Median (by score) structure')
        print(median_scoring_structures[pdb_id])
        # This is the last structure in score order i.e. the worst one, not the one with the lowest score
        colortext.warning('Worst-scoring structure')
        print(worst_scoring_structures[pdb_id])

        # Create values for TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangstrom_by_top_x.setdefault(top_x_var, {})[pdb_id] = percent_subangstrom

        # Both values are percentages (100 * fraction) and are reported as such
        total_percent_subangstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subangstrom[pdb_id] = 100 * top_x_loop_prediction_sets[pdb_id].fraction_with_rmsd_lt(1.0)
        colortext.warning('Percentage of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subangstrom[pdb_id]))
        colortext.warning('Percentage of sub-angstrom cases in the Top{0} structures: {1}'.format(top_x, top_x_percent_subangstrom[pdb_id]))

        # The closest model is the first one once the set is ordered by RMSD
        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        # The TopX RMSD is the lowest RMSD amongst the TopX structures. Ties on RMSD are broken by the lower score.
        top_1_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_score = best_scoring_structures[pdb_id].score
        for s in top_x_loop_prediction_sets[pdb_id]:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert(top_x_score <= worst_score)
        assert(top_x_rmsd <= top_1_rmsd)

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subangstrom[pdb_id], top_x_percent_subangstrom[pdb_id], best_score, top_x_score, median_scoring_structures[pdb_id].score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a column of median percent subangstrom values
    # items() and floor division are used (rather than iteritems() and /) so that this runs on both Python 2 and 3
    for top_x_var, values_by_pdb in sorted(percent_subangstrom_by_top_x.items()):
        assert(sorted(values_by_pdb.keys()) == sorted(pdb_ids))
        median_value = sorted(values_by_pdb.values())[len(pdb_ids) // 2]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{0}\t{1}'.format(top_x_var, median_value))

    # The summary is tab-separated. It is written under both extensions so that existing consumers of either file
    # keep working.
    write_file('{0}analysis.csv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}analysis.tsv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))