예제 #1
0
def get_kic_run_details(output_directory, pdb_id, loop_sets, test_mode = False):
    '''Collect the details required to set up the analysis for the Rosetta KIC and NGK methods.

       output_directory is searched for score files matching "<pdb_id>*.sc". Every match must be named
       "<pdb_id>_score<run_id>.sc" where <run_id> is an integer.
       pdb_id is the identifier of the benchmark case and is used to build the expected filenames.
       loop_sets is passed through unchanged to PDB.extract_xyz_matrix_from_loop_json.
       test_mode stops the scan once 10 runs have been collected.

       Returns a list with one dict per run, with the keys:
         id                       - the integer run id parsed from the score filename;
         score                    - the total score read from the score file;
         predicted_structure      - the path "<output_directory>/<pdb_id>_<pdb_id><run_id>_0001.pdb";
         pdb_loop_residue_matrix  - the loop coordinates for the predicted structure.

       The loop coordinates are cached beside the predicted structure in a ".hdf5" file (under the key
       "dataframe") so that later calls do not need to parse the PDB file again.

       Raises AssertionError if a score file is misnamed, does not have the expected layout, or has no
       associated predicted structure.
    '''
    details = []
    score_prefix = '{0}_score'.format(pdb_id)
    score_suffix = '.sc'

    # glob returns matches in arbitrary order; sort them so that the processing order (and the subset chosen in
    # test mode) is reproducible
    for sc_file in sorted(glob.glob(os.path.join(output_directory, '{0}*{1}'.format(pdb_id, score_suffix)))):

        # Determine the id. The slice is derived from the prefix length rather than hard-coded so that the id is
        # parsed correctly regardless of the length of pdb_id.
        sc_filename = os.path.split(sc_file)[1]
        assert sc_filename.startswith(score_prefix), 'Unexpected score filename: {0}'.format(sc_filename)
        run_id = int(sc_filename[len(score_prefix):-len(score_suffix)])

        # Determine the score. Ignoring blank lines, the file is expected to consist of a "SEQUENCE:" line, a
        # "SCORE:" header line whose first column is total_score, and a "SCORE:" line holding the values.
        sc_lines = [l.strip() for l in get_file_lines(sc_file) if l.strip()]
        assert sc_lines[0] == 'SEQUENCE:', 'Unexpected first line in {0}'.format(sc_file)
        assert sc_lines[1].split()[:2] == ['SCORE:', 'total_score'], 'Unexpected score header in {0}'.format(sc_file)
        score_fields = sc_lines[2].split()
        assert score_fields[0] == 'SCORE:', 'Unexpected score line in {0}'.format(sc_file)
        total_score = float(score_fields[1])

        # Determine the filepath of the predicted structure
        associated_pdb_file = os.path.join(output_directory, '{0}_{0}{1}_0001.pdb'.format(pdb_id, run_id))
        assert os.path.exists(associated_pdb_file), 'Missing predicted structure: {0}'.format(associated_pdb_file)

        # Extract the PDB coordinates into a pandas dataframe (HDF5 format), reusing the cached copy if one exists.
        # The stores are opened as context managers so that the file handle is released even if the read or write
        # fails.
        hdf5_file = os.path.splitext(associated_pdb_file)[0] + '.hdf5'
        if os.path.exists(hdf5_file):
            with pandas.HDFStore(hdf5_file) as store:
                pdb_loop_residue_matrix = store['dataframe']
        else:
            pdb_loop_residue_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(associated_pdb_file).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
            with pandas.HDFStore(hdf5_file) as store:
                store['dataframe'] = pdb_loop_residue_matrix

        details.append(dict(
            id = run_id,
            score = total_score,
            predicted_structure = associated_pdb_file,
            pdb_loop_residue_matrix = pdb_loop_residue_matrix,
        ))

        # Only a small sample of the runs is needed in test mode
        if test_mode and len(details) >= 10:
            break

    return details
예제 #2
0
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       dataset_list_file lists the benchmark cases, one path per non-blank line. The PDB ID of each case is the
       basename of the path with its extension removed.
       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the prediction results e.g. get_kic_run_details
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results e.g.
       the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
       test_mode restricts the analysis to the first 10 benchmark cases and is passed on to data_extraction_method.

       Three files are written: "<prefix>analysis.csv" and "<prefix>analysis.tsv" (both contain the same
       tab-separated per-case summary) and "<prefix>percentage_subangstrom_over_top_X.tsv".

       Raises AssertionError if top_x exceeds expectn or if an expected input file is missing or malformed.
       Calls sys.exit(1) if a case does not have expectn predictions after truncation.
    '''

    # Sanity check
    assert top_x <= expectn

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subangstrom = {}
    top_x_percent_subangstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangstrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:

        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert os.path.exists(rcsb_reference_pdb)
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert os.path.exists(rosetta_reference_pdb)
        assert len(pdb_id) == 4
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert len(loop_sets['LoopSet']) == 1

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB file
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe
        # It is more efficient to do this after truncation if truncating by score but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing) then arbitrary subsets can be chosen
        # in the loop above
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_set = loop_prediction_set[:top_x]
        top_x_loop_prediction_sets[pdb_id] = top_x_set
        median_structure = loop_prediction_set[int(expectn / 2)]
        median_scoring_structures[pdb_id] = median_structure

        # Determine the lowest-/best-scoring structure
        best_structure = loop_prediction_set[0]
        best_scoring_structures[pdb_id] = best_structure
        best_score = best_structure.score
        worst_structure = loop_prediction_set[-1]
        worst_scoring_structures[pdb_id] = worst_structure
        worst_score = worst_structure.score
        assert top_x_set[0] == best_structure

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_set)
        colortext.warning('Top1 structure')
        print(best_structure)
        colortext.warning('Median (by score) structure')
        print(median_structure)
        colortext.warning('Lowest-scoring structures')
        print(worst_structure)

        # Create values for TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangstrom_by_top_x.setdefault(top_x_var, {})[pdb_id] = percent_subangstrom

        # NOTE(review): the two messages below say "Number" but the values reported are percentages (100 * fraction).
        # The wording is left unchanged so that the emitted text is the same as before.
        total_percent_subangstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subangstrom[pdb_id] = 100 * top_x_set.fraction_with_rmsd_lt(1.0)
        colortext.warning('Number of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subangstrom[pdb_id]))
        colortext.warning('Number of sub-angstrom cases in the TopX structures: {0}'.format(top_x_percent_subangstrom[pdb_id]))

        # Re-sort by RMSD so that the first structure is the one closest to the reference
        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        top_1_rmsd = best_structure.rmsd

        # Find the lowest RMSD amongst the TopX structures, breaking ties on RMSD in favor of the lower score
        top_x_rmsd = best_structure.rmsd
        top_x_score = best_structure.score
        for s in top_x_set:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert top_x_score <= worst_score
        assert top_x_rmsd <= top_1_rmsd

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subangstrom[pdb_id], top_x_percent_subangstrom[pdb_id], best_score, top_x_score, median_structure.score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a column of median percent subangstrom values.
    # The keys are iterated in sorted order (rather than via the Python 2-only dict.iteritems) and the median index
    # uses floor division so that it is an integer on both Python 2 and Python 3.
    for top_x_var in sorted(percent_subangstrom_by_top_x):
        values_by_pdb = percent_subangstrom_by_top_x[top_x_var]
        assert sorted(values_by_pdb.keys()) == sorted(pdb_ids)
        median_value = sorted(values_by_pdb.values())[len(pdb_ids) // 2]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{0}\t{1}'.format(top_x_var, median_value))

    # The summary is tab-separated and is written under both extensions
    summary_contents = '\n'.join(csv_file)
    write_file('{0}analysis.csv'.format(prefix), summary_contents)
    write_file('{0}analysis.tsv'.format(prefix), summary_contents)
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))