Example #1
    def retrieve(pdb_id, cache_dir = None, bio_cache = None):
        '''Creates a FASTA object by using a cached copy of the file if it exists or by retrieving the file from the RCSB.'''

        pdb_id = pdb_id.upper()

        if bio_cache:
            return FASTA(bio_cache.get_fasta_contents(pdb_id))

        # Check to see whether we have a cached copy
        if cache_dir:
            filename = os.path.join(cache_dir, "%s.fasta" % pdb_id)
            if os.path.exists(filename):
                return FASTA(read_file(filename))
            else:
                filename += ".txt"
                if os.path.exists(filename):
                    return FASTA(read_file(filename))

        # Get a copy from the RCSB
        contents = rcsb.retrieve_fasta(pdb_id)

        # Create a cached copy if appropriate
        if cache_dir:
            write_file(os.path.join(cache_dir, "%s.fasta" % pdb_id), contents)

        # Return the object
        return FASTA(contents)
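A minimal usage sketch for the method above, assuming retrieve is exposed as a static method of the FASTA class and that the class is importable from klab.bio.fasta (the import path is not shown in the example and is an assumption):

    # Hypothetical usage; the klab.bio.fasta import path and the staticmethod wrapping are assumptions.
    from klab.bio.fasta import FASTA

    f = FASTA.retrieve('1A2K', cache_dir='/tmp/pdb_cache')        # downloads 1A2K.fasta from the RCSB and caches it (cache_dir must already exist)
    f_again = FASTA.retrieve('1A2K', cache_dir='/tmp/pdb_cache')  # served from /tmp/pdb_cache/1A2K.fasta without a network call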
Example #2
    def retrieve(pdb_id, cache_dir = None):
        '''Creates a PDBML object by using cached copies of the files if they exist or by retrieving the files from the RCSB.'''

        pdb_contents = None
        xml_contents = None
        pdb_id = pdb_id.upper()

        if cache_dir:
            # Check to see whether we have a cached copy of the PDB file
            filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
            if os.path.exists(filename):
                pdb_contents = read_file(filename)

            # Check to see whether we have a cached copy of the XML file
            filename = os.path.join(cache_dir, "%s.xml" % pdb_id)
            if os.path.exists(filename):
                xml_contents = read_file(filename)

        # Get any missing files from the RCSB and create cached copies if appropriate
        if not pdb_contents:
            pdb_contents = rcsb.retrieve_pdb(pdb_id)
            if cache_dir:
                write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)

        if not xml_contents:
            xml_contents = rcsb.retrieve_xml(pdb_id)
            if cache_dir:
                write_file(os.path.join(cache_dir, "%s.xml" % pdb_id), xml_contents)

        # Return the object
        return PDBML_slow(xml_contents, pdb_contents)
def create_pruned_structures(output_directory):
    loop_definitions_12 = json.loads(
        read_file('../structures/12_res/loop_definitions.json'))
    prepare_structures('../structures/12_res/rcsb/reference/*.pdb',
                       os.path.join(output_directory, '12_res_rcsb'),
                       loop_definitions_12,
                       expected_min_loop_length=12,
                       expected_max_loop_length=12)
    prepare_structures('../structures/12_res/rosetta/preminimized/*.pdb',
                       os.path.join(output_directory, '12_res_rosetta'),
                       loop_definitions_12,
                       expected_min_loop_length=12,
                       expected_max_loop_length=12)
    loop_definitions_14_17 = json.loads(
        read_file('../structures/14_17_res/loop_definitions.json'))
    prepare_structures('../structures/14_17_res/rcsb/reference/*.pdb',
                       os.path.join(output_directory, '14_17_res'),
                       loop_definitions_14_17,
                       expected_min_loop_length=14,
                       expected_max_loop_length=17,
                       remove_hetatm=True)
    prepare_structures('../structures/14_17_res/rosetta/reference/*.pdb',
                       os.path.join(output_directory, '14_17_res'),
                       loop_definitions_14_17,
                       expected_min_loop_length=14,
                       expected_max_loop_length=17,
                       remove_hetatm=True)
Example #4
    def add_backbone_atoms_linearly_from_loop_filepaths(self, loop_json_filepath, fasta_filepath, residue_ids):
        '''A utility wrapper around add_backbone_atoms_linearly. Adds backbone atoms in a straight line from the first to
           the last residue of residue_ids.

           loop_json_filepath is a path to a JSON file using the JSON format for Rosetta loops files. This file identifies
           the insertion points of the sequence.

           fasta_filepath is a path to a FASTA file with one sequence. This sequence will be used as the sequence for
           the inserted residues (between the start and stop residues defined in loop_json_filepath).

           residue_ids is a list of PDB chain residues (columns 22-27 of ATOM lines in the PDB format). It is assumed that
           they are sequential although the logic does not depend on that. This list should have the same length as the
           sequence identified in the FASTA file.
        '''

        # Parse the loop file
        loop_def = json.loads(read_file(loop_json_filepath))
        assert(len(loop_def['LoopSet']) == 1)
        start_res = loop_def['LoopSet'][0]['start']
        end_res = loop_def['LoopSet'][0]['stop']
        start_res = PDB.ChainResidueID2String(start_res['chainID'], (str(start_res['resSeq']) + start_res['iCode']).strip())
        end_res = PDB.ChainResidueID2String(end_res['chainID'], (str(end_res['resSeq']) + end_res['iCode']).strip())
        assert(start_res in residue_ids)
        assert(end_res in residue_ids)

        # Parse the FASTA file and extract the sequence
        f = FASTA(read_file(fasta_filepath), strict = False)
        assert(len(f.get_sequences()) == 1)
        insertion_sequence = f.sequences[0][2]
        if not len(residue_ids) == len(insertion_sequence):
            raise Exception('The sequence in the FASTA file must have the same length as the list of residues.')

        # Create the insertion sequence (a sub-sequence of the FASTA sequence)
        # The post-condition is that the start and end residues are the first and last elements of kept_residues respectively
        kept_residues = []
        insertion_residue_map = {}
        in_section = False
        found_end = False
        for x in range(len(residue_ids)):
            residue_id = residue_ids[x]
            if residue_id == start_res:
                in_section = True
            if in_section:
                kept_residues.append(residue_id)
                insertion_residue_map[residue_id] = insertion_sequence[x]
                if residue_id == end_res:
                    found_end = True
                    break
        if not kept_residues:
            raise Exception('The insertion sequence is empty (check the start and end residue ids).')
        if not found_end:
            raise Exception('The end residue was not encountered when iterating over the insertion sequence (check the start and end residue ids).')

        # Identify the start and end Residue objects
        try:
            start_res = self.residues[start_res[0]][start_res[1:]]
            end_res = self.residues[end_res[0]][end_res[1:]]
        except Exception as e:
            raise Exception('The start or end residue could not be found in the PDB file.')
Example #5
    def retrieve(pdb_id, cache_dir = None, acceptable_sequence_percentage_match = 70.0, require_uniprot_residue_mapping = True, bio_cache = None):
        '''Creates a SIFTS object by using cached copies of the files if they exist or by retrieving the files from the RCSB and the EBI.
           bio_cache should be a klab.bio.cache.py::BioCache object and is used to avoid reading/downloading cached files repeatedly.
        '''

        pdb_contents = None
        xml_contents = None
        pdb_id = pdb_id.upper()

        l_pdb_id = pdb_id.lower()

        if len(pdb_id) != 4 or not pdb_id.isalnum():
            raise Exception("Bad PDB identifier '%s'." % pdb_id)

        if bio_cache:
            pdb_contents = bio_cache.get_pdb_contents(pdb_id)
            xml_contents = bio_cache.get_sifts_xml_contents(pdb_id)

        if cache_dir:
            if not pdb_contents:
                # Check to see whether we have a cached copy of the PDB file
                filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
                if os.path.exists(filename):
                    pdb_contents = read_file(filename)

            if not xml_contents:
                # Check to see whether we have a cached copy of the XML file
                filename = os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id)
                if os.path.exists(filename):
                    xml_contents = read_file(filename)

        # Get any missing files from the RCSB and create cached copies if appropriate
        if not pdb_contents:
            pdb_contents = rcsb.retrieve_pdb(pdb_id)
            if cache_dir:
                write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)

        if not xml_contents:
            try:
                xml_contents = retrieve_xml(pdb_id, silent = False)
                if cache_dir:
                    write_file(os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id), xml_contents)
            except FTPException550:
                raise MissingSIFTSRecord('The file "%s.sifts.xml.gz" could not be found on the EBI FTP server.' % l_pdb_id)

        # Return the object
        handler = SIFTS(xml_contents, pdb_contents, acceptable_sequence_percentage_match = acceptable_sequence_percentage_match, cache_dir = cache_dir, require_uniprot_residue_mapping = require_uniprot_residue_mapping, bio_cache = bio_cache, pdb_id = pdb_id)
        xml.sax.parseString(xml_contents, handler)
        return handler
Example #6
    def __init__(self):
        pdb_chain_to_pfam_mapping = {}
        pfam_to_pdb_chain_mapping = {}
        lines = read_file(pdb_to_pfam_mapping_file).split('\n')

        for c in range(len(lines)):
            if not lines[c].startswith('#'):
                break
        assert(lines[c].split() == ['PDB', 'CHAIN', 'SP_PRIMARY', 'PFAM_ID'])
        for l in lines[c + 1:]:  # skip the header row asserted above
            if l.strip():
                tokens = l.split()

                pdb_id = tokens[0].lower()
                chain_id = tokens[1]
                pfam_acc = tokens[3]
                pdb_key = (pdb_id, chain_id)
                pdb_chain_to_pfam_mapping[pdb_id] = pdb_chain_to_pfam_mapping.get(pdb_id, {})
                pdb_chain_to_pfam_mapping[pdb_id][chain_id] = pdb_chain_to_pfam_mapping[pdb_id].get(chain_id, set())
                pdb_chain_to_pfam_mapping[pdb_id][chain_id].add(pfam_acc)

                pfam_to_pdb_chain_mapping[pfam_acc] = pfam_to_pdb_chain_mapping.get(pfam_acc, set())
                pfam_to_pdb_chain_mapping[pfam_acc].add(pdb_key)

        self.pdb_chain_to_pfam_mapping = pdb_chain_to_pfam_mapping
        self.pfam_to_pdb_chain_mapping = pfam_to_pdb_chain_mapping
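The constructor above builds two lookup tables from the PDB-to-Pfam mapping file: pdb_chain_to_pfam_mapping maps a lower-case PDB ID to a dict of chain IDs, each holding a set of Pfam accessions, while pfam_to_pdb_chain_mapping maps a Pfam accession back to a set of (pdb_id, chain_id) pairs. A self-contained sketch of the same accumulation pattern, using invented sample rows:

    # Standalone illustration of the two-way mapping built above (the sample rows are made up).
    rows = [('1abc', 'A', 'P12345', 'PF00001'), ('1abc', 'B', 'P12346', 'PF00002'), ('2xyz', 'A', 'P12345', 'PF00001')]
    pdb_chain_to_pfam, pfam_to_pdb_chain = {}, {}
    for pdb_id, chain_id, _, pfam_acc in rows:
        pdb_chain_to_pfam.setdefault(pdb_id, {}).setdefault(chain_id, set()).add(pfam_acc)
        pfam_to_pdb_chain.setdefault(pfam_acc, set()).add((pdb_id, chain_id))
    assert pdb_chain_to_pfam['1abc']['A'] == {'PF00001'}
    assert ('2xyz', 'A') in pfam_to_pdb_chain['PF00001']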
Example #7
def download_fasta(pdb_id, dest_dir, silent = True, filename = None):
    assert(os.path.exists(dest_dir))
    lower_case_filename = os.path.join(dest_dir, '{0}.fasta'.format(pdb_id.lower()))
    upper_case_filename = os.path.join(dest_dir, '{0}.fasta'.format(pdb_id.upper()))
    if filename:
        requested_filename = os.path.join(dest_dir, filename)
        if os.path.exists(requested_filename):
            return read_file(requested_filename)
    if os.path.exists(lower_case_filename):
        return read_file(lower_case_filename)
    elif os.path.exists(upper_case_filename):
        return read_file(upper_case_filename)
    else:
        contents = retrieve_fasta(pdb_id, silent = silent)
        write_file(os.path.join(dest_dir, filename or '{0}.fasta'.format(pdb_id)), contents)
        return contents
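A hedged usage sketch for download_fasta: it returns the FASTA contents and only contacts the RCSB when no cached copy (lower-case, upper-case, or explicitly requested filename) is present in dest_dir. The cache directory below is illustrative:

    # Hypothetical call; dest_dir must already exist (the function asserts this).
    import os
    if not os.path.isdir('/tmp/fasta_cache'):
        os.mkdir('/tmp/fasta_cache')
    contents = download_fasta('1A2K', '/tmp/fasta_cache', silent=False)  # downloads and writes 1A2K.fasta
    contents_again = download_fasta('1A2K', '/tmp/fasta_cache')          # served from the cached file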
Example #8
    def compute(self):
        tmp_dir = self.tmp_dir
        if not self.read_only:
            pdb_object = self.pdb.clone()
        else:
            pdb_object = self.pdb  # in general, we should not be modifying the structure in this class
        input_filepath = write_temp_file(tmp_dir, pdb_object.get_content(), ftype = 'w', prefix = 'dssp_')
        output_filepath = write_temp_file(tmp_dir, '', ftype = 'w', prefix = 'dssp_')
        try:
            p = _Popen('.', shlex.split('mkdssp -i {input_filepath} -o {output_filepath}'.format(**locals())))
            if p.errorcode:
                if p.stderr.find('empty protein, or no valid complete residues') != -1:
                    raise MissingAtomException(p.stdout)
                else:
                    raise Exception('An error occurred while calling DSSP:\n%s' % p.stderr)

            self.dssp_output = read_file(output_filepath)
            self.dssp = self.parse_output()
        except MissingAtomException as e:
            os.remove(input_filepath)
            os.remove(output_filepath)
            raise
        except Exception as e:
            os.remove(input_filepath)
            os.remove(output_filepath)
            raise colortext.Exception('%s\n%s' % (str(e), traceback.format_exc()))
        os.remove(input_filepath)
        os.remove(output_filepath)
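The two except branches above repeat the temporary-file cleanup that also runs after the try block on success; the same guarantee can be expressed once with a finally clause. A small standalone sketch of that pattern (invented example, not the library's code):

    # Standalone illustration of temp-file cleanup with try/finally (hypothetical helper, not klab code).
    import os
    import tempfile

    def process_with_temp_file(contents):
        handle, path = tempfile.mkstemp(prefix='dssp_')
        os.close(handle)
        try:
            with open(path, 'w') as f:
                f.write(contents)
            return open(path).read()      # stand-in for the real work (e.g. running mkdssp)
        finally:
            os.remove(path)               # runs on success and on every exception

    assert process_with_temp_file('HEADER    TEST') == 'HEADER    TEST'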
def update_public_datasets():
    dsets = ['alascan-gpk.json', 'curatedprotherm.json', 'guerois.json', 'kellogg.json', 'potapov.json']
    source_path = '../rawdata/'
    dest_path = '/home/oconchus/t14benchmarking/ddg/input/json/'
    for dset in dsets:
        assert(os.path.exists(os.path.join(source_path, dset)))
        assert(os.path.exists(os.path.join(dest_path, dset)))
    for dset in dsets:
        print(dset)
        source_set = json.loads(read_file(os.path.join(source_path, dset)))
        dest_set = json.loads(read_file(os.path.join(dest_path, dset)))
        assert(len(source_set['data']) == len(dest_set['data']))
        for x in range(len(source_set['data'])):
            assert(dest_set['data'][x]['RecordID'] == source_set['data'][x]['RecordID'])
            dest_set['data'][x]['DerivedMutation'] = source_set['data'][x]['DerivedMutation']
        write_file(os.path.join(dest_path, dset) + '.new', json.dumps(dest_set, indent=4, sort_keys=True))
Example #10
 def compute(self, chain_id):
     tmp_dir = self.tmp_dir
     pdb_object = self.pdb.clone()  # we are immediately modifying the PDB file by stripping chains so we need to make a copy
     pdb_object.strip_to_chains(chain_id)
     input_filepath = write_temp_file(tmp_dir, pdb_object.get_content(), ftype = 'w', prefix = 'dssp_')
     output_filepath = write_temp_file(tmp_dir, '', ftype = 'w', prefix = 'dssp_')
     try:
         p = _Popen('.', shlex.split('mkdssp -i {input_filepath} -o {output_filepath}'.format(**locals())))
         if p.errorcode:
             if p.stderr.find('empty protein, or no valid complete residues') != -1:
                 raise MissingAtomException(p.stdout)
             else:
                 raise Exception('An error occurred while calling DSSP:\n%s' % p.stderr)
         self.dssp_output[chain_id] = read_file(output_filepath)
         self.dssp[chain_id] = self.parse_output(chain_id)
     except MissingAtomException as e:
         os.remove(input_filepath)
         os.remove(output_filepath)
         raise
     except Exception as e:
         os.remove(input_filepath)
         os.remove(output_filepath)
         raise colortext.Exception('%s\n%s' % (str(e), traceback.format_exc()))
     os.remove(input_filepath)
     os.remove(output_filepath)
Example #11
    def _runRScript(r_script_filename, cwd = '.', remove_output = True):
        # Reset to new current working directory
        tmp_dir = False
        if cwd == None:
            tmp_dir = True
            cwd = tempfile.mkdtemp( prefix = '%s-%s-%s_' % (time.strftime("%y%m%d"), getpass.getuser(), 'plot-working-dir') )

        rscriptname = write_temp_file(cwd, r_script_filename)
        p = subprocess.Popen(["R", "CMD", "BATCH", rscriptname], cwd = cwd)
        while True:
            time.sleep(0.3)
            errcode = p.poll()
            if errcode != None:
                break
        rout = "%s.Rout" % rscriptname
        os.remove(rscriptname)

        rout_contents = None
        if os.path.exists(rout):
            rout_contents = read_file(rout)
            os.remove(rout)

        if errcode != 0:
            print(rout_contents )
            raise Exception("The R script failed with error code %d." % errcode)

        if tmp_dir and remove_output:
            shutil.rmtree(cwd)

        return rout_contents
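Note that, despite its name, the r_script_filename argument holds the R script text itself: it is written out via write_temp_file and then executed with R CMD BATCH, and the contents of the resulting .Rout log are returned (Example #14 builds the text from a template file before calling this method). A hedged usage sketch, assuming R is on the PATH and that the method is exposed on the RInterface class as in Example #14:

    # Hypothetical call; RInterface._runRScript is assumed to be reachable as in Example #14.
    r_script_text = 'png("plot.png"); plot(1:10); dev.off()'
    rout_log = RInterface._runRScript(r_script_text, cwd='.')  # returns the .Rout log contents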
Example #12
 def load_sequence_blast(self, sequence, cut_off, matrix, sequence_identity_cut_off):
     if self.cache_dir:
         filepath = self._get_blast_sequence_filepath(sequence, cut_off, matrix, sequence_identity_cut_off)
         if os.path.exists(filepath):
             for sequence_hits in json.loads(read_file(filepath)):
                 if sequence_hits['sequence'] == sequence:
                     return sequence_hits
     return None
Example #13
def download_xml(pdb_id, dest_dir, silent = True, filename = None, unzip = False):
    assert(os.path.exists(dest_dir))
    lower_case_gz_filename = os.path.join(dest_dir, '{0}.sifts.xml.gz'.format(pdb_id.lower()))
    upper_case_gz_filename = os.path.join(dest_dir, '{0}.sifts.xml.gz'.format(pdb_id.upper()))
    lower_case_filename = os.path.join(dest_dir, '{0}.sifts.xml'.format(pdb_id.lower()))
    upper_case_filename = os.path.join(dest_dir, '{0}.sifts.xml'.format(pdb_id.upper()))

    if filename:
        requested_filename = os.path.join(dest_dir, filename)
        if os.path.exists(requested_filename):
            return read_file(requested_filename)

    if unzip == True:
        if os.path.exists(lower_case_filename):
            contents = read_file(lower_case_filename)
        elif os.path.exists(upper_case_filename):
            contents = read_file(upper_case_filename)
        elif os.path.exists(lower_case_gz_filename):
            contents = read_gzip_in_memory(read_file(lower_case_gz_filename))
        elif os.path.exists(upper_case_gz_filename):
            contents = read_gzip_in_memory(read_file(upper_case_gz_filename))
        else:
            contents = retrieve_xml(pdb_id, silent = silent, unzip = True)
            write_file(os.path.join(dest_dir, filename or '{0}.sifts.xml'.format(pdb_id)), contents)
        return contents
    else:
        if os.path.exists(lower_case_gz_filename):
            contents = read_file(lower_case_gz_filename) # Note: read_file already unzips .gz files
        elif os.path.exists(upper_case_gz_filename):
            contents = read_file(upper_case_gz_filename) # Note: read_file already unzips .gz files
        else:
            gzip_contents = retrieve_xml(pdb_id, silent = silent, unzip = False)
            write_file(os.path.join(dest_dir, filename or '{0}.sifts.xml.gz'.format(pdb_id)), gzip_contents)
            contents = read_gzip_in_memory(gzip_contents)
        return contents
Example #14
 def correlation_coefficient_gplot(inputfname, output_filename, filetype, experiment_field = "Experimental", title = ''):
     '''File suffix: pearsons_r_gplot
        Description: Pearson's r
        Filename: ggplot_pearsons.R
        Priority: 1
        '''
     script_path = os.path.abspath(os.path.dirname(inspect.getsourcefile(sys.modules[__name__])))
     r_script_filename = read_file(os.path.join(script_path, "ggplot_pearsons.R")) % vars()
     return RInterface._runRScript(r_script_filename)
Example #15
 def sendgmail2(self, subject, recipients, plaintext, htmltext=None, cc=None, debug=False, useMIMEMultipart=True, gmail_account = '*****@*****.**', pw_filepath = None):
     '''For this function to work, the password for the gmail user must be colocated with this file or passed in.'''
     smtpserver = smtplib.SMTP("smtp.gmail.com", 587)
     smtpserver.ehlo()
     smtpserver.starttls()
     smtpserver.ehlo()
     gmail_account = '*****@*****.**'
     if pw_filepath:
         smtpserver.login(gmail_account, read_file(pw_filepath))
     else:
         smtpserver.login(gmail_account, read_file('pw'))
     for recipient in recipients:
         header = 'To:' + recipient + '\n' + 'From: ' + gmail_account + '\n' + 'Subject:' + subject + '\n'
         if htmltext:
             msg = header + '\n ' + htmltext + '\n\n'
         else:
             msg = header + '\n ' + plaintext + '\n\n'
         smtpserver.sendmail(gmail_account, recipient, msg)
     smtpserver.close()
Example #16
 def generate_schema_diagram(self, output_filepath = None, show_fk_only = False):
     if self.num_tables == 0:
         raise EmptyDiagramException('No tables in schema.')
     tempfiles = self._generate_schema_diagram(show_fk_only)
     self.schema_diagram = read_file(tempfiles[1])
     for fname in tempfiles:
         if os.path.exists(fname):
             os.remove(fname)
     if output_filepath:
         write_file(output_filepath, self.schema_diagram)
Example #17
def load():
    global sys_settings
    if not sys_settings:
        settings_file = os.path.splitext(os.path.abspath(__file__))[0] + '.json'
        if not os.path.exists(settings_file):
            create_template(settings_file)
            colortext.warning('\nThe settings file {0} needs to be configured. Exiting.\n'.format(settings_file))
            sys.exit(1)
        d = json.loads(read_file(settings_file))
        sys_settings = NestedBunch(d)
    return sys_settings
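load() lazily reads a JSON file that sits next to the module (module.py → module.json), exits with a warning if a template had to be created first, and wraps the parsed dict in a NestedBunch, which presumably exposes nested keys as attributes. A hedged sketch of the call pattern (the key names are invented):

    # Hypothetical usage; the actual keys in the settings JSON are deployment-specific.
    settings = load()
    # e.g. settings.database.host or settings.cache_dir, assuming NestedBunch attribute access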
Example #18
    def retrieve_data_from_rcsb(cls, ligand_code, pdb_id = None, silent = True, cached_dir = None):
        '''Retrieve a file from the RCSB.'''
        if not silent:
            colortext.printf("Retrieving data from RCSB")
        if cached_dir:
            assert(os.path.exists(cached_dir))

        ligand_info_path, ligand_info, pdb_ligand_info, pdb_ligand_info_path = None, None, None, None
        if cached_dir:
            ligand_info_path = os.path.join(cached_dir, '{0}.cif'.format(ligand_code))
            if os.path.exists(ligand_info_path):
                ligand_info = read_file(ligand_info_path)
        if not ligand_info:
            ligand_info = retrieve_ligand_cif(ligand_code)
            if cached_dir:
                write_file(ligand_info_path, ligand_info)

        # Parse .cif
        l = cls(ligand_code)
        l.parse_cif(ligand_info)
        l.pdb_id = pdb_id or l.pdb_id
        has_pdb_id = l.pdb_id and (len(l.pdb_id) == 4) and (l.pdb_id != '?')  # the last case is unnecessary and will be short-cut but I included it to show possible values

        # Parse PDB XML
        if has_pdb_id:
            if cached_dir:
                pdb_ligand_info_path = os.path.join(cached_dir, '{0}.pdb.ligandinfo'.format(l.pdb_id.lower()))
                if os.path.exists(pdb_ligand_info_path):
                    pdb_ligand_info = read_file(pdb_ligand_info_path)
                else:
                    pdb_ligand_info = retrieve_pdb_ligand_info(l.pdb_id)
                    write_file(pdb_ligand_info_path, pdb_ligand_info)
            else:
                pdb_ligand_info = retrieve_pdb_ligand_info(l.pdb_id)
        if pdb_ligand_info:
            l.parse_pdb_ligand_info(pdb_ligand_info)

        # Retrieve the diagram image
        l.get_diagram()

        return l
Example #19
 def save_sequence_blast(self, sequence, cut_off, matrix, sequence_identity_cut_off, data):
     assert(data['sequence'] == sequence)
     sequence_data = [data] # put the new hit at the start of the file
     if self.cache_dir:
         filepath = self._get_blast_sequence_filepath(sequence, cut_off, matrix, sequence_identity_cut_off)
         if os.path.exists(filepath):
             for sequence_hits in json.loads(read_file(filepath)):
                 if sequence_hits['sequence'] != sequence:
                     sequence_data.append(sequence_hits)
         write_file(filepath, json.dumps(sequence_data))
         return True
     return False
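load_sequence_blast (Example #12) and save_sequence_blast treat each cache file as a JSON list of hit dictionaries, with the most recent query's entry placed first and every entry carrying its query under the 'sequence' key. A sketch of the on-disk shape; only the 'sequence' key is implied by the code, the other field names are illustrative:

    # Illustrative cache-file contents written by save_sequence_blast (hit fields are invented).
    cached_hits = [
        {'sequence': 'MKTAYIAKQR', 'hits': ['P12345', 'Q67890']},   # newest query first
        {'sequence': 'GLSDGEWQLV', 'hits': ['P69905']},
    ]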
def dump_pdbs():
    pdbs = json.loads(read_file('../rawdata/pdbs.json'))

    # Sanity check
    for pdb_id, v in sorted(pdbs.iteritems()):
        records = ddGdb.execute_select('SELECT ID FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))
        assert(len(records) == 1)

    # Dump
    for pdb_id, v in sorted(pdbs.iteritems()):
        content = ddGdb.execute_select('SELECT Content FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))[0]['Content']
        write_file('../rawdata/%s.pdb' % pdb_id, content)
Example #21
    def __init__(self, UniProtAC, XML = None, cache_dir = None, silent = True):
        if cache_dir and not(os.path.exists(cache_dir)):
            raise Exception("The cache directory %s does not exist." % cache_dir)

        self.UniProtAC = UniProtAC
        self.silent = silent

        # Get XML
        if XML == None:
            protein_xml = None
            cached_filepath = None
            if cache_dir:
                cached_filepath = os.path.join(cache_dir, '%s.xml' % UniProtAC)
            if cached_filepath and os.path.exists(cached_filepath):
                protein_xml = read_file(cached_filepath)
            else:
                if not silent:
                    colortext.write("Retrieving %s\n" % UniProtAC, "cyan")
                url = 'http://www.uniprot.org/uniprot/%s.xml' % UniProtAC
                protein_xml = http_get(url)
                if not(protein_xml.strip()):
                    raise EmptyUniProtACXMLException('The file %s is empty.' % UniProtAC)
                if cached_filepath:
                    write_file(cached_filepath, protein_xml)
            self.XML = protein_xml
        else:
            self.XML = XML

        self.recommended_name = None
        self.submitted_names = []
        self.alternative_names = []

        # Get DOM
        try:
            self._dom = parseString(protein_xml)
        except:
            if cached_filepath:
                raise Exception("The UniProtAC XML for '%s' was invalid. The cached file is located at %s. Check this file - if it is not valid XML then delete the file and rerun the script." % (UniProtAC, cached_filepath))
            else:
                raise Exception("The UniProtAC XML for '%s' was invalid." % UniProtAC)
        main_tags = self._dom.getElementsByTagName("uniprot")
        assert(len(main_tags) == 1)
        entry_tags = main_tags[0].getElementsByTagName("entry")
        assert(len(entry_tags) == 1)
        self.entry_tag = entry_tags[0]

        self._parse_evidence_tag()
        self._parse_sequence_tag()
        self._parse_protein_tag()
        self._parse_organism_tag()
        self._parse_subsections()
        self._parse_PDB_mapping()
Example #22
 def get_schema(self, host, username, passwd, database_name):
     try:
         outfile, outfilename = open_temp_file('/tmp', "w")
         p = subprocess.Popen(shlex.split("mysqldump -h %s -u %s -p%s --skip-add-drop-table --no-data %s" % (host, username, passwd, database_name)), stdout=outfile)
         p.wait()
         outfile.close()
         contents = read_file(outfilename)
         os.remove(outfilename)
         return contents
      except Exception as e:
         if os.path.exists(outfilename):
             os.remove(outfilename)
         raise
Example #23
def load():
    global sys_settings
    if not sys_settings:
        settings_file = os.path.splitext(
            os.path.abspath(__file__))[0] + '.json'
        if not os.path.exists(settings_file):
            create_template(settings_file)
            colortext.warning(
                '\nThe settings file {0} needs to be configured. Exiting.\n'.
                format(settings_file))
            sys.exit(1)
        d = json.loads(read_file(settings_file))
        sys_settings = NestedBunch(d)
    return sys_settings
Example #24
    def __init__(self, settings = {}, isInnoDB=True, numTries=32, host=None, db=None, user=None, passwd=None, port=3306, unix_socket="/var/lib/mysql/mysql.sock", passwdfile=None, use_utf=False):

        self.db = db
        self.host = host
        self.original_schema = []

        if not(os.path.exists(unix_socket)):
            unix_socket = '/var/run/mysqld/mysqld.sock' # Ubuntu hack

        if not passwd and passwdfile:
            if os.path.exists(passwdfile):
                passwd = read_file(passwdfile).strip()
            else:
                passwd = getpass.getpass("Enter password to connect to MySQL database:")

        dbinterface = MySQLDatabaseInterface(settings, isInnoDB = isInnoDB, numTries = numTries, host = host, db = db, user = user, passwd = passwd, port = port, unix_socket = unix_socket, use_locking = False)

        # Get the DB schema, normalizing for sqlt-diagram
        db_schema = []
        self.num_tables = 0
        try:
            for t in sorted(dbinterface.TableNames):
                creation_string = dbinterface.execute_select('SHOW CREATE TABLE `%s`' % t)
                assert(len(creation_string) == 1)
                if creation_string[0].get('Create Table') == None: # e.g. for views
                    continue
                self.num_tables += 1
                creation_string = '%s;' % creation_string[0]['Create Table'].strip()
                self.original_schema.append(creation_string)

                # Fix input for sqlt-diagram (it is fussy)
                creation_string = creation_string.replace("default ''", "")
                creation_string = creation_string.replace("DEFAULT ''", "")
                creation_string = creation_string.replace("DEFERRABLE INITIALLY DEFERRED", "") # sqlt-diagram doesn't like this syntax for MySQL
                creation_string = creation_string.replace("AUTOINCREMENT", "") # sqlt-diagram doesn't like this syntax for MySQL
                creation_string = creation_string.replace("auto_increment", "") # sqlt-diagram doesn't like this syntax for MySQL
                creation_string = re.sub("COMMENT.*'.*'", "", creation_string, flags = re.DOTALL) # sqlt-diagram doesn't like this syntax for MySQL
                creation_string = re.sub("CONSTRAINT.*?CHECK.*?,", "", creation_string, flags = re.DOTALL) # sqlt-diagram doesn't like this syntax for MySQL
                creation_string = re.sub("CONSTRAINT.*?CHECK.*?[)][)]", ")", creation_string, flags = re.DOTALL) # sqlt-diagram doesn't like this syntax for MySQL
                creation_string = re.sub(" AUTO_INCREMENT=\d+", "", creation_string, flags = re.DOTALL)
                creation_string = creation_string.replace("''", "")
                creation_string = creation_string.replace('tg_', 'auth_')
                db_schema.append(creation_string)
        except: raise
        db_schema = '\n\n'.join(db_schema)
        self.db_schema = db_schema
        self.mysqldump_schema = self.get_schema(host, user, passwd, db)
def dump_data(prediction_set, outfile):
    ddG_connection = db_api.ddG()
    ddGdb = ddgdbapi.ddGDatabase()

    userdata_set = 'AllValidPGPK'

    cached_pdb_details = json.loads(read_file('cached_pdb_details.json'))
    analysis_breakdown = ddG_connection.get_predictionset_data(prediction_set, userdata_set, cached_pdb_details = cached_pdb_details)

    test_data = dict(
        amino_acids = analysis_breakdown.amino_acids,
        pdb_details = analysis_breakdown.pdb_details,
        predictions = analysis_breakdown.predictions,
        #single_mutation_GP_predictions = analysis_breakdown.single_mutation_GP_predictions,
        #single_mutation_no_GP_predictions = analysis_breakdown.single_mutation_no_GP_predictions,
        #multiple_mutation_predictions = analysis_breakdown.multiple_mutation_predictions,
        analysis_datasets = analysis_breakdown.analysis_datasets,
    )
    write_file(outfile, json.dumps(test_data))
Example #26
    def retrieve(cls, pdb_id, cache_dir = None):
        '''Creates a PDB object by using a cached copy of the file if it exists or by retrieving the file from the RCSB.'''

        # Check to see whether we have a cached copy
        pdb_id = pdb_id.upper()
        if cache_dir:
            filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
            if os.path.exists(filename):
                return cls(read_file(filename))

        # Get a copy from the RCSB
        contents = rcsb.retrieve_pdb(pdb_id)

        # Create a cached copy if appropriate
        if cache_dir:
            write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), contents)

        # Return the object
        return cls(contents)
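A minimal usage sketch for the classmethod above, assuming it sits on the PDB class and that the class is importable from klab.bio.pdb (the import path is an assumption):

    # Hypothetical usage; the import path is assumed and cache_dir must already exist.
    from klab.bio.pdb import PDB

    p = PDB.retrieve('1CBW', cache_dir='/tmp/pdb_cache')  # downloads 1CBW.pdb from the RCSB on first use, reads the cache thereafter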
Example #27
def run_r_script(r_script_filename, cwd = '.'):
    '''This function was adapted from the covariation benchmark.'''
    p = subprocess.Popen(["R", "CMD", "BATCH", r_script_filename], cwd = cwd)
    while True:
        time.sleep(0.3)
        errcode = p.poll()
        if errcode != None:
            break
    rout = "{0}out".format(r_script_filename)
    rout_contents = None
    if os.path.exists(rout):
        rout_contents = read_file(rout)
        os.remove(rout)
    rdata_file = os.path.join(os.path.split(r_script_filename)[0], '.RData')
    if os.path.exists(rdata_file):
        os.remove(rdata_file)
    if errcode != 0:
        print(rout_contents)
        raise Exception("The R script failed with error code %d." % errcode)
    return rout_contents
Example #28
def import_structures():
    setup()
    ppi_api = get_ppi_api()
    complex_definitions = json.loads(read_file('tinas_complexes.json'))
    for tina_pdb_id, complex_structure_definition_pair in sorted(complex_definitions.iteritems()):
        #if tina_pdb_id != '1WA52':
        #    continue
        colortext.warning(tina_pdb_id)
        del complex_structure_definition_pair['Structure']['file_path']
        complex_structure_definition_pair['Structure']['pdb_object'] = tina_pdb_objects[tina_pdb_id]
        pdb_set = ppi_api.add_complex_structure_pair(complex_structure_definition_pair, keywords = ['GSP1'],
                                                     force = True, trust_database_content = False, allow_missing_params_files = False, debug = False)
        if pdb_set['success'] == False:
            print(pdb_set['error'])
            if 'possible_matches' in pdb_set:
                for d in pdb_set['possible_matches']:
                    colortext.warning(d['ID'])
                    print('{0}, {1}, {2}'.format(d['LName'].encode('utf-8').strip(), d['LShortName'].encode('utf-8').strip(), d['LHTMLName'].encode('utf-8').strip()))
                    print('{0}, {1}, {2}'.format(d['RName'].encode('utf-8').strip(), d['RShortName'].encode('utf-8').strip(), d['RHTMLName'].encode('utf-8').strip()))

    create_project_pdb_records()
Example #29
    def _get_XML(self):
        uparc_xml = None
        cached_filepath = None
        if self.cache_dir:
            cached_filepath = os.path.join(self.cache_dir, '%s.xml' % self.UniParcID)
        if cached_filepath and os.path.exists(cached_filepath):
            uparc_xml = read_file(cached_filepath)
        else:
            if not self.silent:
                colortext.write("Retrieving %s\n" % self.UniParcID, "cyan")
            url = 'http://www.uniprot.org/uniparc/%s.xml' % self.UniParcID
            uparc_xml = http_get(url)
            if cached_filepath:
                write_file(cached_filepath, uparc_xml)
        self.XML = uparc_xml

        # Get DOM
        self._dom = parseString(uparc_xml)
        main_tags = self._dom.getElementsByTagName("uniparc")
        assert(len(main_tags) == 1)
        entry_tags = main_tags[0].getElementsByTagName("entry")
        assert(len(entry_tags) == 1)
        self.entry_tag = entry_tags[0]
Example #30
def test_abacus_graph():
    '''This function can be deleted. It was added to test the abacus graph with different numbers of datapoints.'''
    import os
    import json
    if not(os.path.exists('results_cache.txt')):
        results = ddG_connection.get_flattened_prediction_results('FPP biosensor: protocol 16')
        for r in results:
            r['TimeTaken'] = r['TimeTaken'].total_seconds() # timedelta objects are not JSON serializable
        write_file('results_cache.txt', json.dumps(results), 'w')
    results = json.loads(read_file('results_cache.txt'))

    try:
        ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_3.png', cached_results = results, num_datapoints = 3)
    except:
        pass
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_5.png', cached_results = results, num_datapoints = 5)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_8.png', cached_results = results, num_datapoints = 8)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_10.png', cached_results = results, num_datapoints = 10)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_12.png', cached_results = results, num_datapoints = 12)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_20.png', cached_results = results, num_datapoints = 20)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_50.png', cached_results = results, num_datapoints = 50)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_127.png', cached_results = results, num_datapoints = 127)
    ddG_connection.create_abacus_graph_for_a_single_structure('FPP biosensor: protocol 16', 'kellogg', 'total', graph_filename = 'test_255.png', cached_results = results, num_datapoints = 255)
Example #31
    def run(self):

        # Create input files
        self._create_temp_directory()
        self._create_input_files()
        self._create_script()
        write_file(self._filepath('script.pml'), self.script)

        # Run PyMOL
        #colortext.message(self.visualization_pymol +' -c ' + self._filepath('script.pml'))
        po = tprocess.Popen(self.outdir, [self.visualization_pymol, '-c', self._filepath('script.pml')])
        #colortext.message(po.stdout)
        #colortext.warning(po.errorcode)
        #colortext.error(po.stderr)
        self.stdout = po.stdout
        self.stderr = po.stderr
        self.return_code = po.errorcode

        if self.return_code != 0:
            raise Exception('Error: %s' % str(self.stderr))

        pse_path = self._filepath('session.pse')
        if os.path.exists(pse_path):
            self.PSE = read_file(pse_path, binary = True)
Example #32
 def from_file(pymol_name, pdb_filename, residues_of_interest = []):
     return PDBContainer(pymol_name, read_file(pdb_filename), residues_of_interest)
Example #33
from klab.bio.basics import ChainMutation
from klab.fs.fsio import read_file, write_temp_file, open_temp_file, write_file
from klab.bio.pfam import Pfam
from klab.bio.dssp import MonomerDSSP, ComplexDSSP, MissingAtomException
from klab.bio.ligand import Ligand, PDBLigand
from klab.bio.pdbtm import PDBTM
from klab.db.sqlalchemy_interface import get_single_record_from_query, get_or_create_in_transaction

from kddg.api.schema import test_schema_against_database_instance
from kddg.api.schema import PDBFile, PDBChain, PDBMolecule, PDBMoleculeChain, PDBResidue, LigandDescriptor, LigandIdentifier, LigandSynonym, PDBLigand
from kddg.api.schema import Ligand as DBLigand
#from kddg.api.schema import Publication, PublicationAuthor, PublicationIdentifier
from kddg.api.layers import *
from kddg.api.db import ddG, PartialDataException, SanityCheckException
import kddg.api.dbi as dbi

rosetta_scripts_path =  '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease'
rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database'
p = PDB(read_file('/kortemmelab/data/kyleb/ddg_numbering_for_shane/24548-data/1CBW_FGHI.pdb'))
#p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path)
p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path, extra_command_flags = '-ignore_zero_occupancy false -ignore_unrecognized_res')
pprint.pprint(p.get_atom_sequence_to_rosetta_map())
pprint.pprint(p.rosetta_sequences)

from kddg.api.ppi import get_interface as get_ppi_interface
ppi_api = get_ppi_interface(read_file('../misc/ddgdb.pw'),
                                rosetta_scripts_path =  '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease',
                                rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database')
content = ppi_api.DDG_db.execute_select('SELECT Content FROM PDBFile WHERE ID="1CBW"')[0]['Content']
print(content)
write_file('/tmp/ddginterface/1CBW_FGHI_db.pdb', content)
def prepare_structures(file_filter,
                       output_directory,
                       loop_definitions,
                       require_filter=True,
                       create_partial_structures=False,
                       expected_min_loop_length=None,
                       expected_max_loop_length=None,
                       remove_hetatm=False):
    search_radius = 10.0

    if not (os.path.exists(output_directory)):
        os.mkdir(output_directory)

    # Iterate through the dataset cases
    for pdb_file in sorted(glob.glob(file_filter)):

        pdb_prefix = os.path.splitext(os.path.split(pdb_file)[1])[0].lower()

        # Read the benchmark loop definition
        if not loop_definitions.get(pdb_prefix):
            raise Exception(
                'The loop definition for {0} is missing.'.format(pdb_prefix))
        loop_definition = loop_definitions[pdb_prefix]
        loops = [
            PDBSection(loop_definition['chainID'],
                       loop_definition['StartResidueID'],
                       loop_definition['EndResidueID'],
                       Sequence=loop_definition['Sequence'])
        ]

        # Only process files that passed the benchmark criteria
        if require_filter and not loop_definition['PassedFilter']:
            continue

        # Read in the PDB content, removing HETATM lines if requested
        pdb_content = read_file(pdb_file)
        if remove_hetatm:
            new_content = []
            for l in pdb_content.split('\n'):
                if not l.startswith('HETATM'):
                    new_content.append(l)
            pdb_content = '\n'.join(new_content)

        # Remove the loops and surrounding sidechain atoms from the structure
        b = Bonsai(pdb_content)
        bonsai, cutting, PSE_file, PSE_script, FASTA_file = b.prune_loop_for_kic(
            loops,
            search_radius,
            expected_min_loop_length=expected_min_loop_length,
            expected_max_loop_length=expected_max_loop_length,
            generate_pymol_session=True)

        # Create a PyMOL session file for visual inspection
        write_file(
            os.path.join(output_directory, '{0}.pse'.format(pdb_prefix)),
            PSE_file)

        # Create the new PDB file with the loop and surrounding sidechains removed
        write_file(
            os.path.join(output_directory, '{0}.pdb'.format(pdb_prefix)),
            bonsai)
        if create_partial_structures:
            write_file(
                os.path.join(
                    output_directory,
                    '{0}_missing_loop_and_surrounding_sidechains.pdb'.format(
                        pdb_prefix)), bonsai)
            write_file(
                os.path.join(
                    output_directory,
                    '{0}_loop_and_surrounding_sidechains.pdb'.format(
                        pdb_prefix)), cutting)

        # Create the FASTA file containing the loop sequence. This will be used along with the loop_definitions.json file
        # to add the residues back into the Rosetta structure
        write_file(
            os.path.join(output_directory, '{0}.fasta'.format(pdb_prefix)),
            FASTA_file)

        sys.stdout.write('.')
        sys.stdout.flush()

    print('')
Example #35
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the prediction results e.g. get_kic_run_details
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results e.g.
       the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
    '''

    # Sanity check
    assert(top_x <= expectn)

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subanstrom = {}
    top_x_percent_subanstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:

        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert(os.path.exists(rcsb_reference_pdb))
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert(os.path.exists(rosetta_reference_pdb))
        assert(len(pdb_id) == 4)
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert(len(loop_sets['LoopSet']) == 1)

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB file
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            loop_prediction = loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe
        # It is more efficient to do this after truncation if truncating by score but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing) then arbitrary subsets can be chosen
        # in the loop above
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_loop_prediction_sets[pdb_id] = loop_prediction_set[:top_x]
        median_scoring_structures[pdb_id] = loop_prediction_set[int(expectn / 2)]

        # Determine the lowest-/best-scoring structure
        best_scoring_structures[pdb_id] = loop_prediction_set[0]
        best_score = best_scoring_structures[pdb_id].score
        worst_scoring_structures[pdb_id] = loop_prediction_set[-1]
        worst_score = worst_scoring_structures[pdb_id].score
        assert(top_x_loop_prediction_sets[pdb_id][0] == best_scoring_structures[pdb_id])

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_loop_prediction_sets[pdb_id])
        colortext.warning('Top1 structure')
        print(best_scoring_structures[pdb_id])
        colortext.warning('Median (by score) structure')
        print(median_scoring_structures[pdb_id])
        colortext.warning('Lowest-scoring structures')
        print(worst_scoring_structures[pdb_id])

        # Create values for TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangrom_by_top_x[top_x_var] = percent_subangrom_by_top_x.get(top_x_var, {})
            percent_subangrom_by_top_x[top_x_var][pdb_id] = percent_subangstrom

        total_percent_subanstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subanstrom[pdb_id] = 100 * top_x_loop_prediction_sets[pdb_id].fraction_with_rmsd_lt(1.0)
        colortext.warning('Percentage of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subanstrom[pdb_id]))
        colortext.warning('Percentage of sub-angstrom cases in the Top{0} structures: {1}'.format(top_x, top_x_percent_subanstrom[pdb_id]))

        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        top_1_rmsd = best_scoring_structures[pdb_id].rmsd

        top_x_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_score = best_scoring_structures[pdb_id].score
        for s in top_x_loop_prediction_sets[pdb_id]:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert(top_x_score <= worst_score)
        assert(top_x_rmsd <= top_1_rmsd)

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subanstrom[pdb_id], top_x_percent_subanstrom[pdb_id], best_score, top_x_score, median_scoring_structures[pdb_id].score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a column of median percent subangstrom values
    for top_x_var, values_by_pdb in sorted(percent_subangrom_by_top_x.iteritems()):
        assert(sorted(values_by_pdb.keys()) == sorted(pdb_ids))
        median_value = sorted(values_by_pdb.values())[len(pdb_ids) / 2]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{1}\t{2}'.format(pdb_id, top_x_var, median_value))

    write_file('{0}analysis.csv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}analysis.tsv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))
        # Iterate through the dataset cases
        colortext.message(
            'Adding loop residues back to the pruned structures in {0}.'.
            format(pruned_structure_directory))
        file_filter = os.path.join(pruned_structure_directory, '*.pdb')
        for pdb_file in sorted(glob.glob(file_filter)):
            pdb_prefix = os.path.splitext(
                os.path.split(pdb_file)[1])[0].lower()
            file_prefix = os.path.splitext(pdb_file)[0]
            fasta_file = file_prefix + '.fasta'
            loop_file = file_prefix + '.loop.json'
            assert (os.path.exists(fasta_file))
            assert (os.path.exists(loop_file))

            # Convert the FASTA headers back into PDB residue IDs
            fasta_contents = read_file(fasta_file)
            headers = [
                l for l in fasta_contents.split('\n') if l.startswith('>')
            ]
            assert (len(headers) == 1)
            header = headers[0]
            pdb_residue_ids = [
                PDB.ChainResidueID2String(l[0], l[1:])
                for l in header[header.find('Residues ') + 9:].split(';')
            ]

            # Add the missing atoms back into the PDB file
            spackler = Spackler.from_filepath(pdb_file)
            new_pdb_content = spackler.add_backbone_atoms_linearly_from_loop_filepaths(
                loop_file, fasta_file, pdb_residue_ids)
            write_file(