def run_maxcluster_cluster(predictions, method="average", rmsd=True,
                           clustering_threshold=None, binary="maxcluster"):
    """
    Cluster a set of predicted structures using maxcluster.

    For comparison against an experimental structure, use the
    run_maxcluster_compare() function.

    Parameters
    ----------
    predictions : list(str)
        List of PDB files that should be clustered
    method : {"single", "average", "maximum", "pairs_min", "pairs_abs"},
             optional (default: "average")
        Clustering method (single / average / maximum linkage,
        or min / absolute size neighbour pairs clustering)
    rmsd : bool, optional (default: True)
        Use RMSD-based clustering (faster)
    clustering_threshold : float, optional (default: None)
        Initial clustering threshold (maxcluster -T option)
    binary : str, optional (default: "maxcluster")
        Path to maxcluster binary

    Returns
    -------
    pandas.DataFrame
        Clustering result table (see parse_maxcluster_clustering
        for more detailed explanation)
    """
    # create a list of files for input to maxcluster
    list_file = temp()
    with open(list_file, "w") as f:
        for pred_file in predictions:
            f.write(pred_file + "\n")

    # map clustering method names to maxcluster -C option values
    method_map = {
        "single": 1,
        "average": 2,
        "maximum": 3,
        "pairs_min": 4,
        "pairs_abs": 5,
    }

    if method not in method_map:
        raise InvalidParameterError(
            "Method must be one of the following: " +
            ", ".join(method_map.keys())
        )

    cmd = [binary, "-l", list_file, "-C", str(method_map[method])]

    if rmsd:
        cmd += ["-rmsd"]

    if clustering_threshold is not None:
        cmd += ["-T", str(clustering_threshold)]

    return_code, stdout, stderr = run(cmd)

    return parse_maxcluster_clustering(stdout)
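
# Example usage (a sketch; assumes the maxcluster binary is on PATH and
# that model_*.pdb are hypothetical prediction files):
#
#   clustering = run_maxcluster_cluster(
#       ["model_1.pdb", "model_2.pdb", "model_3.pdb"],
#       method="average", rmsd=True
#   )
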
def run_maxcluster_compare(predictions, experiment,
                           normalization_length=None,
                           distance_cutoff=None, binary="maxcluster"):
    """
    Compare a set of predicted structures to an experimental
    structure using maxcluster.

    For clustering functionality, use the run_maxcluster_cluster()
    function.

    For a high-level wrapper around this function that removes
    problematic atoms and compares multiple models, please look at
    evcouplings.fold.protocol.compare_models_maxcluster().

    Parameters
    ----------
    predictions : list(str)
        List of PDB files that should be compared against experiment
    experiment : str
        Path of experimental structure PDB file. Note that the
        numbering and residues in this file must agree with the
        predicted structure, and that the structure may not contain
        duplicate atoms (multiple models, or alternative locations
        for the same atom).
    normalization_length : int, optional (default: None)
        Use this length to normalize the Template Modeling (TM)
        score (-N option of maxcluster). If None, will normalize
        by length of experiment.
    distance_cutoff : float, optional (default: None)
        Distance cutoff for MaxSub search (-d option of maxcluster).
        If None, will use maxcluster auto-calibration.
    binary : str, optional (default: "maxcluster")
        Path to maxcluster binary

    Returns
    -------
    pandas.DataFrame
        Comparison result table (see parse_maxcluster_comparison
        for more detailed explanation)
    """
    # create a list of files for input to maxcluster
    list_file = temp()
    with open(list_file, "w") as f:
        for pred_file in predictions:
            f.write(pred_file + "\n")

    cmd = [binary, "-l", list_file, "-e", experiment]

    # normalization length for TM score calculation
    if normalization_length is not None:
        cmd += ["-N", str(normalization_length)]

    # distance cutoff for MaxSub search
    if distance_cutoff is not None:
        cmd += ["-d", str(distance_cutoff)]

    return_code, stdout, stderr = run(cmd)

    return parse_maxcluster_comparison(stdout)
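
# Example usage (a sketch; assumes maxcluster is on PATH and that the
# hypothetical experiment.pdb shares residue numbering with the models):
#
#   comparison = run_maxcluster_compare(
#       ["model_1.pdb", "model_2.pdb"], "experiment.pdb",
#       normalization_length=120
#   )
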
def _eliminate_altloc(chain):
    """
    Remove alternative atom locations from a chain and
    write the cleaned chain to a temporary PDB file
    """
    # if multiple locations, select the one with the
    # highest occupancy
    chain.coords = chain.coords.loc[
        chain.coords.groupby(
            ["residue_index", "atom_name"]
        ).occupancy.idxmax()
    ]

    # save cut chain to temporary file
    temp_filename = temp()
    with open(temp_filename, "w") as f:
        chain.to_file(f)

    return temp_filename
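
# A minimal sketch of the altloc-selection idiom above (hypothetical data;
# idxmax returns the row label of the highest-occupancy record per
# residue/atom group, so alternative locations collapse to a single row):
#
#   import pandas as pd
#   coords = pd.DataFrame({
#       "residue_index": [1, 1, 2],
#       "atom_name": ["CA", "CA", "CA"],
#       "occupancy": [0.4, 0.6, 1.0],
#   })
#   deduplicated = coords.loc[
#       coords.groupby(["residue_index", "atom_name"]).occupancy.idxmax()
#   ]
#   # keeps the 0.6-occupancy CA of residue 1 and the single CA of residue 2
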
def cns_seq_file(sequence, output_file=None, residues_per_line=16):
    """
    Generate a CNS .seq file for a given protein sequence

    Parameters
    ----------
    sequence : str
        Amino acid sequence in one-letter code
    output_file : str, optional (default: None)
        Save 3-letter code sequence to this file
        (if None, will create temporary file)
    residues_per_line : int, optional (default: 16)
        Print this many residues on each line of .seq file

    Returns
    -------
    output_file : str
        Path to file with sequence
        (useful if temporary file was generated)

    Raises
    ------
    InvalidParameterError
        If sequence contains invalid symbol
    """
    if output_file is None:
        output_file = temp()

    with open(output_file, "w") as f:
        # split sequence into parts per line
        lines = [
            sequence[i:i + residues_per_line]
            for i in range(0, len(sequence), residues_per_line)
        ]

        # go through lines and transform into 3-letter code
        for line in lines:
            try:
                l3 = " ".join([AA1_to_AA3[aa] for aa in line])
            except KeyError as e:
                raise InvalidParameterError(
                    "Invalid amino acid could not be mapped"
                ) from e

            f.write(l3 + "\n")

    return output_file
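
# Example usage (a sketch; assumes AA1_to_AA3 maps one-letter to
# three-letter codes, e.g. "A" -> "ALA"):
#
#   seq_file = cns_seq_file("MKTAYIAKQR", output_file="target.seq")
#   # target.seq then contains a single line:
#   # MET LYS THR ALA TYR ILE ALA LYS GLN ARG
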
def _read_hmmer_table(filename, column_names):
    """
    Parse a HMMER file in (dom)tbl format into a pandas DataFrame.

    (Why this is necessary: cannot easily split on whitespace
    with pandas because of last column that contains whitespace
    both in header and rows)

    Parameters
    ----------
    filename : str
        Path of (dom)tbl file
    column_names : list of str
        Columns in the respective format
        (different for tbl and domtbl)

    Returns
    -------
    pd.DataFrame
        DataFrame with parsed (dom)tbl
    """
    res = []
    num_splits = len(column_names) - 1

    with open(filename) as f:
        for line in f:
            if line.startswith("#"):
                continue

            fields = line.rstrip().split(maxsplit=num_splits)
            res.append(fields)

    # at the moment, all fields in dataframe are strings, even
    # if numeric. To convert to numbers, cheap trick is to store
    # to csv file and let pandas guess the types, rather than
    # going through convert_objects (deprecated) or to_numeric
    # (more effort)
    tempfile = temp()
    pd.DataFrame(res, columns=column_names).to_csv(tempfile, index=False)

    return pd.read_csv(tempfile)
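
# A minimal sketch of the maxsplit trick above (hypothetical tbl line and
# column set; the final description field may itself contain spaces, so
# the number of splits is capped at len(column_names) - 1):
#
#   column_names = ["target", "t_acc", "query", "q_acc",
#                   "e_value", "score", "bias", "description"]
#   line = "YBL061C - query1 - 1.2e-50 170.3 0.1 ATP-dependent protease subunit"
#   fields = line.rstrip().split(maxsplit=len(column_names) - 1)
#   # fields[-1] == "ATP-dependent protease subunit"
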
def _create_mapping_table(self, sifts_table_file):
    """
    Create modified SIFTS mapping table (based on file
    at SIFTS_URL). For some of the entries, the UniProt
    sequence ranges do not map to a SEQRES sequence range
    of the same length. These PDB IDs will be entirely
    replaced by a segment-based mapping extracted from
    the SIFTS REST API.

    Parameters
    ----------
    sifts_table_file : str
        Path where computed table will be stored
    """
    def extract_rows(M, pdb_id):
        res = []

        M = M[pdb_id.lower()]["UniProt"]

        for uniprot_ac, Ms in M.items():
            for x in Ms["mappings"]:
                res.append({
                    "pdb_id": pdb_id,
                    "pdb_chain": x["chain_id"],
                    "uniprot_ac": uniprot_ac,
                    "resseq_start": x["start"]["residue_number"],
                    "resseq_end": x["end"]["residue_number"],
                    "coord_start": (
                        str(x["start"]["author_residue_number"]) +
                        x["start"]["author_insertion_code"].replace(" ", "")
                    ),
                    "coord_end": (
                        str(x["end"]["author_residue_number"]) +
                        x["end"]["author_insertion_code"].replace(" ", "")
                    ),
                    "uniprot_start": x["unp_start"],
                    "uniprot_end": x["unp_end"],
                })

        return res

    # download SIFTS table (gzip-compressed csv) to temp file
    temp_download_file = temp()
    get_urllib(SIFTS_URL, temp_download_file)

    # load table and rename columns for internal use, in case
    # SIFTS ever decides to rename theirs
    table = pd.read_csv(
        temp_download_file, comment="#", compression="gzip"
    ).rename(
        columns={
            "PDB": "pdb_id",
            "CHAIN": "pdb_chain",
            "SP_PRIMARY": "uniprot_ac",
            "RES_BEG": "resseq_start",
            "RES_END": "resseq_end",
            "PDB_BEG": "coord_start",
            "PDB_END": "coord_end",
            "SP_BEG": "uniprot_start",
            "SP_END": "uniprot_end",
        }
    )

    # TODO: remove the following if new segment-based table
    # proves as robust solution
    """
    # this block disabled for now due to use of new table
    # based on observed UniProt segments
    # - can probably be removed eventually

    # identify problematic PDB IDs
    problematic_ids = table.query(
        "(resseq_end - resseq_start) != (uniprot_end - uniprot_start)"
    ).pdb_id.unique()

    # collect new mappings from segment based REST API
    res = []
    for i, pdb_id in enumerate(problematic_ids):
        r = requests.get(
            SIFTS_REST_API.format(pdb_id.lower())
        )
        mapping = json.loads(r.text)
        res += extract_rows(mapping, pdb_id)

    # remove bad PDB IDs from table and add new mapping
    new_table = table.loc[~table.pdb_id.isin(problematic_ids)]

    # also disabled due to use of new table based on observed
    # UniProt segments - can probably be removed eventually
    new_table = new_table.append(
        pd.DataFrame(res).loc[:, table.columns]
    )
    """

    # save for later reuse
    table.to_csv(sifts_table_file, index=False)
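
# A minimal sketch of the length-mismatch check from the disabled block
# above (hypothetical rows; the second entry maps 10 SEQRES residues onto
# 9 UniProt residues and would be flagged as problematic):
#
#   import pandas as pd
#   table = pd.DataFrame({
#       "pdb_id": ["1abc", "2xyz"],
#       "resseq_start": [1, 1], "resseq_end": [100, 10],
#       "uniprot_start": [5, 21], "uniprot_end": [104, 29],
#   })
#   problematic_ids = table.query(
#       "(resseq_end - resseq_start) != (uniprot_end - uniprot_start)"
#   ).pdb_id.unique()
#   # problematic_ids == array(['2xyz'], dtype=object)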