def valid_shaded_region(shaded_regions, n_res):
    """
    Function that ensures that the passed shaded regions are readable
    and make sense.

    Parameters
    ------------
    shaded_regions : list of lists or None
        A list of lists, where sub-elements are of length 2 and contain
        start and end values for regions to be shaded, e.g.
        shaded_regions=[[1,10],[40,50]] describes a region between 1 and 10
        and another between 40 and 50. If None, nothing is checked.

    n_res : int
        Number of residues in the sequence. Start and end positions must
        each lie in the inclusive interval [1, n_res + 1].

    Returns
    ---------
    None
        No return type but will raise an exception if the shaded regions
        info is not parseable or is out of range.

    Raises
    --------
    MetapredictError
        If any start/end position is out of range, or if the structure
        cannot be indexed/compared as expected. The original error text is
        embedded in the message.
    """

    if shaded_regions is None:
        return

    # Any failure (bad structure, wrong types, out-of-range positions) is
    # reported as a single MetapredictError that embeds the underlying message.
    try:
        for region in shaded_regions:
            start = region[0]
            if start < 1 or start > n_res + 1:
                raise MetapredictError(
                    f'Invalid start position in shaded_regions: {start}')

            # read the end only after the start has been validated so that
            # the first problem found is the one that gets reported
            end = region[1]
            if end < 1 or end > n_res + 1:
                raise MetapredictError(
                    f'Invalid end position in shaded_regions: {end}')

    except Exception as e:
        raise MetapredictError(
            'Error in parsing shaded_regions - full error below\n\n%s' %
            (str(e))) from e
def fetch_sequence(uniprot_id, return_full_id=False):
    """
    Function that returns the amino acid sequence by polling UniProt.

    Note that right now the test for success is a bit haphazard: the
    response is treated as a failure if no sequence lines are present or if
    the string "Sorry" appears in the sequence part of the response. We
    probably want something a bit more robust in the future...

    Parameters
    --------------
    uniprot_id : str
        Uniprot accession number

    return_full_id : bool
        Whether to return the full uniprot ID. If set to True, returns a
        list where the first element is the FASTA header line (including
        the leading '>'), the second element is the sequence, and the third
        element is the uniprot_id that was passed in.

    Returns
    -----------
    str or list
        The amino acid sequence as a string, or
        [header, sequence, uniprot_id] if return_full_id is True.

    Raises
    -----------
    MetapredictError
        If no sequence could be obtained for the accession.
    """

    http = urllib3.PoolManager()
    r = http.request('GET',
                     'https://www.uniprot.org/uniprot/%s.fasta' % (uniprot_id))

    # Decode the payload instead of parsing the repr() of the bytes object.
    # The repr-based approach leaked quote characters into the result when
    # the protein name contained an apostrophe (thank you to Github user
    # keithchev for pointing out that bug!) and crashed on an empty body.
    lines = r.data.decode('utf-8', errors='replace').splitlines()

    # first line is the FASTA header, everything after it is sequence
    header = lines[0] if lines else ''
    sequence = ''.join(lines[1:])

    if not sequence or 'Sorry' in sequence:
        raise MetapredictError(
            'Error: unable to fetch UniProt sequence with accession %s' %
            (uniprot_id))

    if not return_full_id:
        return sequence

    return [header, sequence, uniprot_id]
def write_csv(input_dict, output_file):
    """
    Function that writes the scores in an input dictionary out to a
    standardized CSV file format.

    Each dictionary entry becomes one line: the key (with any commas
    replaced by spaces) followed by every score formatted to three
    decimal places, separated by ', '.

    Parameters
    -----------
    input_dict : dict
        Dictionary where keys are headers/identifiers (strings) and values
        are iterables of per-residue disorder scores (numbers).

    output_file : str
        Location and filename for the output file. Assumes .csv is provided.

    Returns
    --------
    None
        No return value, but writes a .csv file to disk

    Raises
    --------
    MetapredictError
        If the output file cannot be opened for writing.
    """

    # try and open the file and throw exception if anything goes wrong
    try:
        fh = open(output_file, 'w')
    except Exception as e:
        raise MetapredictError('Unable to write to file destination %s' %
                               (output_file)) from e

    # the context manager guarantees the handle is flushed and closed, even
    # if formatting one of the scores fails part-way through
    with fh:
        for idx, scores in input_dict.items():

            # important otherwise commas in FASTA headers render the CSV
            # file unreadable!
            no_comma = idx.replace(',', ' ')

            fields = ''.join(', %1.3f' % (score) for score in scores)
            fh.write('%s%s\n' % (no_comma, fields))
def validate_options(option, valid_list):
    """
    Check that $option is one of the entries in $valid_list and raise
    an exception if it is not.

    Parameters
    ----------
    option : str
        Option that has been passed

    valid_list : list of str
        Collection of values which $option is expected to be one of.
        Membership is tested with the ``in`` operator.

    Returns
    ---------
    None
        Nothing is returned; a MetapredictError is raised if $option is
        not found in $valid_list.

    Raises
    ---------
    MetapredictError
        If $option is not in $valid_list.
    """

    # nothing to do for a recognised option
    if option in valid_list:
        return

    allowed = str(valid_list)
    raise MetapredictError(
        'Expected one of %s but only option passed was %s' % (allowed, option))
def valid_range(inval, minval, maxval):
    """
    Check that $inval lies inside the closed interval [$minval, $maxval].

    Parameters
    ----------
    inval : float
        Value to be checked

    minval : float
        Smallest permitted value (inclusive)

    maxval : float
        Largest permitted value (inclusive)

    Returns
    ---------
    None
        Nothing is returned; a MetapredictError is raised if $inval is
        smaller than $minval or larger than $maxval.

    Raises
    ---------
    MetapredictError
        If $inval is outside of the permitted range.
    """

    # guard clause: values inside the range need no further handling
    if not (inval < minval or inval > maxval):
        return

    details = (inval, minval, maxval)
    raise MetapredictError(
        'Value %1.3f is outside of range [%1.3f, %1.3f]' % details)
def write_caid_format(input_dict, output_file):
    '''
    Function that takes in a dictionary and outputs a file in the format
    as specified by IDPcentral's Critical Assessment of Intrinsic protein
    Disorder (CAID).

    Format is as follows - output is a plain text file where each
    prediction has an entry header line (>entry_id, similar to the
    beginning of a .fasta file). Every line following the entry header
    contains tab separated columns ordered as follows -
    1) residue number, 2) residue name, 3) confidence score,
    4) binary classification where 1 = disordered and 0 = not disordered.

    Example (from idpcentral.org/caid):
    >P04637
    1    M    0.892    1
    2    E    0.813    1

    Parameters
    ----------
    input_dict : dict
        Input dictionary of disorder scores. The key is written verbatim as
        the entry header line (no '>' is added here). For each value,
        value[0][0] is used as the sequence and value[1] as the per-residue
        scores.
        NOTE(review): earlier documentation described value[0] as the
        sequence string itself; with that layout value[0][0] would be only
        the first residue. Confirm the intended layout against the caller.

    output_file : str
        Location and filename of the file to write.

    Returns
    -------
    None
        Does not return anything to the user. Writes a file to
        output_file.

    Raises
    ------
    MetapredictError
        If the output file cannot be opened for writing.
    '''

    # attempt to open the output file, raise MetapredictError if unable to
    try:
        current_output = open(output_file, 'w')
    except Exception as e:
        raise MetapredictError(f'Unable to write to {output_file}') from e

    # the context manager guarantees the handle is flushed and closed even
    # if one of the entries turns out to be malformed
    with current_output:
        for cur_id, entry in input_dict.items():
            cur_sequence = entry[0][0]
            cur_scores = entry[1]

            # write entry id
            current_output.write(f'{cur_id}\n')

            # for each residue write the position, residue, score, and
            # classification. Scores are looked up by index (rather than
            # zipped) so a too-short score list raises instead of silently
            # truncating the output.
            for position, cur_residue in enumerate(cur_sequence, start=1):
                cur_score = cur_scores[position - 1]
                cur_binary = get_binary_prediction(cur_score, cutoff_value=0.5)

                # write as tsv the caid formatted info
                current_output.write(
                    f'{position}\t{cur_residue}\t{cur_score}\t{cur_binary}\n')
def seq_from_name(name):
    '''
    Function to get the sequence of a protein from the name.

    The UniProt search page is queried for reviewed entries first; if that
    search reports no results, the same query is repeated without the
    reviewed-only filter. The accession of the first hit on the page is
    then passed to fetch_sequence.

    Parameters
    ----------
    name : string
        A string that carries the details of the protein to search for.
        Can contain the name of the protein as well as the name of the
        organism.
        ex. ARF19
            Arabidopsis ARF19
            p53 Human
            p53 Homo sapiens p53

    Returns
    -------
    list
        The value of fetch_sequence(top_hit, return_full_id=True) for the
        top hit; in this module that is a list holding the FASTA header,
        the amino acid sequence, and the UniProt accession.

    Raises
    ------
    MetapredictError
        If neither search returns a result, or if no hit can be located
        in the returned page.
    '''
    # local import so the block does not depend on file-level imports
    from urllib.parse import quote

    # percent-encode the whole query; spaces become %20 exactly as before,
    # and any other reserved characters can no longer corrupt the URL
    query = quote(name, safe='')
    base_url = f'https://www.uniprot.org/uniprot/?query={query}'

    # try reviewed entries first, then fall back to all entries
    candidate_urls = (
        f'{base_url}&fil=reviewed%3Ayes&sort=score',
        f'{base_url}&sort=score',
    )

    no_results = b'Sorry, no results found for your search term.'
    http = urllib3.PoolManager()

    for use_url in candidate_urls:
        r = http.request('GET', use_url)
        if no_results not in r.data:
            break
    else:
        # the loop finished without a break: both searches came back empty
        raise MetapredictError(
            'Sorry! We were not able to find the protein corresponding to that name.'
        )

    # each hit on the page is tagged with 'checkbox_<uniprot id>"...', so the
    # text right after the first marker starts with the top accession
    parsed_data = r.data.split(b'checkbox_')
    if len(parsed_data) < 2:
        raise MetapredictError(
            'Sorry! We were not able to find the protein corresponding to that name.'
        )

    # keep only the accession, i.e. everything before the closing quote
    top_hit = parsed_data[1].decode('utf-8', errors='replace').split('"')[0]

    return fetch_sequence(top_hit, return_full_id=True)
def meta_predict(sequence,
                 normalized=True,
                 network=brnn_network,
                 device=device,
                 encoding_scheme=encoding_scheme):
    """
    The actual executing function for predicting the disorder of a sequence
    using metapredict. Returns a list containing predicted disorder values
    for the input sequence.

    Parameters:
    ------------
    sequence : str
        The amino acid sequence to be predicted

    normalized : bool
        Flag which defines if normalization should occur or not. By default,
        negative values are set to be equal to 0 and values greater than 1
        are set to be equal to 1. User can set normalized=False to get raw
        prediction values. Default = True

    network : Pytorch network
        Defines the Pytorch network to be used. Alternative networks can be
        provided in principle, but in practice metapredict has been trained
        on a specific network. Default = network loaded by metapredict.

    device : str
        String describing where the network is physically stored on the
        computer. Accepted for interface compatibility; it is not used
        inside this function.

    encoding_scheme : str
        String that defines the encoding scheme used when metapredict was
        trained. Only 'onehot' is supported here.

    Returns:
    ----------
    list
        Returns a list with one disorder score per element of the network
        output, each rounded to four decimal places. When normalized,
        scores below 0 are replaced by 0 and scores above 1 by 1.

    Raises:
    ----------
    MetapredictError
        If encoding_scheme is anything other than 'onehot'.
    """

    if encoding_scheme != 'onehot':
        raise MetapredictError(
            f'Cannot understand encoding scheme [{encoding_scheme}]')

    # convert the amino acid sequence to its one-hot encoded representation
    seq_vector = encode_sequence.one_hot(sequence)

    # add a leading dimension of size 1 before handing it to the network
    # (presumably the batch dimension - TODO confirm)
    seq_vector = seq_vector.view(1, len(seq_vector), -1)

    # get output values from the seq_vector based on the network and take
    # the first (only) entry along the leading dimension
    outputs = network(seq_vector.float()).detach().numpy()[0]

    # convert each predicted value to a plain float rounded to four digits
    output_values = [round(float(value), 4) for value in outputs]

    # if normalized=False, just return the raw output_values
    if not normalized:
        return output_values

    # clamp to [0, 1]: negative values become 0, values above 1 become 1,
    # everything else is passed through unchanged
    return [0 if value < 0 else 1 if value > 1 else value
            for value in output_values]
def graph(sequence,
          title='Predicted protein disorder',
          disorder_threshold=None,
          pLDDT_scores=False,
          disorder_scores=True,
          shaded_regions=None,
          shaded_region_color='red',
          disorder_line_color='blue',
          threshold_line_color='black',
          confidence_line_color='darkorange',
          confidence_threshold_color='black',
          DPI=150,
          output_file=None,
          legacy_metapredict=False):
    """
    Function for graphing predicted disorder. By default, this function
    will show a graph. However, you can specify output_file as the file
    path followed by the name of the saved file with the proper extension
    (.png by default). This is the backend for the meta.py graphing
    functions.

    Parameters
    -----------
    sequence : str
        Input amino acid sequence (as string) to be predicted.

    title : str
        Sets the title of the generated figure. If left at the default and
        pLDDT scores are requested, the displayed title is adjusted to
        mention AF2pLDDT. Default = "Predicted protein disorder"

    disorder_threshold : float
        Sets a threshold which draws a horizontal line as a visual guide
        along the length of the figure. Must be a value between 0 and 1.
        Default = None, which resolves to 0.3 if legacy_metapredict is True
        and 0.5 otherwise.

    pLDDT_scores : Bool
        Sets whether to include the predicted confidence scores from
        AlphaFold2. Default = False

    disorder_scores : Bool
        Whether to include disorder scores. Can set to False if you just
        want the AF2 confidence scores. Default = True

    shaded_regions : list of lists
        A list of lists, where sub-elements are of length 2 and contain
        start and end values for regions to be shaded. Assumes that sanity
        checking on positions has already been done. Default = None.

    shaded_region_color : str or list of strs
        String or list of strings that defines the color of the shaded
        region. Can be any valid matplotlib color name or a hex color
        string (i.e. "#ff0000" is red). If a single string or a list of
        length 1 is passed then that color is used for every region. If a
        list of length = len(shaded_regions) is passed, then EACH shaded
        region is colored according to the correspondingly-indexed list
        element. If a mismatch between the number of elements in
        shaded_regions and shaded_region_color is found an exception is
        raised. The count is only checked when shaded_regions is given.

    disorder_line_color : str
        String that defines the color of the traced disorder score. Can be
        any standard matplotlib color name or a hex-value (see above).
        Default = 'blue'.

    threshold_line_color : str
        String that defines the color of the disorder threshold line.
        Ignored (replaced by disorder_line_color) when both disorder and
        pLDDT scores are plotted. Default = 'black'.

    confidence_line_color : str
        String that defines the color of the traced pLDDT score.
        Default = 'darkorange'.

    confidence_threshold_color : str
        String that defines the color of the confidence threshold line
        drawn when only pLDDT scores are plotted. Default = 'black'.

    DPI : int
        Dots-per-inch. Defines the resolution of the generated figure.

    output_file : str
        If provided, the output_file variable defines the location and type
        of the file to be saved. This should be a file location and
        filename with a valid matplotlib extension (such as .png, .jpg,
        .pdf) and, if provided, this value is passed directly to the
        ``matplotlib.pyplot.savefig()`` function as the ``fname``
        parameter. Default = None.

    legacy_metapredict : bool
        Whether or not to use the original version of metapredict for
        predicting disorder values.

    Returns
    -----------
    None
        No return type, but will either generate an on-screen plot OR will
        save a file to disk, depending on if output_file is provided (or
        not).

    Raises
    -----------
    MetapredictError
        If both pLDDT_scores and disorder_scores are False, or if
        shaded_region_color has an invalid type or an invalid number of
        elements.
    """

    show_plddt = bool(pLDDT_scores)
    show_disorder = bool(disorder_scores)
    show_both = show_plddt and show_disorder

    # make sure confidence scores and disorder scores not both false
    if not show_plddt and not show_disorder:
        raise MetapredictError(
            'Cannot set both pLDDT_scores and disorder_scores to False. If disorder_scores=False, set pLDDT_scores=True.'
        )

    # make sure the shaded_region_color variable makes sense. This is done
    # before any figure is created so invalid input does not leave an open
    # figure behind.
    if isinstance(shaded_region_color, str):
        shaded_region_color = [shaded_region_color]
    elif not isinstance(shaded_region_color, list):
        raise MetapredictError(
            'Invalid type passed as shaded_region_color. Expect a list of colors or a string'
        )
    elif shaded_regions is not None:
        # the number of colors only matters if there is something to shade
        n_colors = len(shaded_region_color)
        if n_colors != 1 and n_colors != len(shaded_regions):
            raise MetapredictError(
                'Invalid number of colors passed. If a list is used for shaded_region_color, then the number of elements must be either 1 OR equal the number of shaded regions'
            )

    # if confidence scores also added, match the threshold line colors to
    # the colors of the traces they belong to
    if show_both:
        threshold_line_color = disorder_line_color
        confidence_threshold_color = confidence_line_color

    # set this such that PDF-generated figures become editable
    matplotlib.rcParams['pdf.fonttype'] = 42
    matplotlib.rcParams['ps.fonttype'] = 42

    # set n_res to length of seq
    n_res = len(sequence)

    # set yValues equal to the predicted disorder from the sequence and
    # resolve the default threshold for the chosen predictor
    if show_disorder:
        if legacy_metapredict:
            yValues = legacy_predict(sequence)
            default_threshold = 0.3
        else:
            yValues = metameta_predict(sequence)
            default_threshold = 0.5

        if disorder_threshold is None:
            disorder_threshold = default_threshold

    # the figure is identified by the title as passed in (before any
    # default-title adjustment below). The combined plot is wider and
    # leaves room on the right for the second y-axis and the legend.
    if show_both:
        fig = plt.figure(num=title, figsize=[11, 3], dpi=DPI, edgecolor='black')
        axes = fig.add_axes([0.1, 0.15, 0.55, 0.75])
    else:
        fig = plt.figure(num=title, figsize=[8, 3], dpi=DPI, edgecolor='black')
        axes = fig.add_axes([0.15, 0.15, 0.75, 0.75])

    # set x label
    axes.set_xlabel("Residue")

    # if the user did not set a title, adapt the default to what is plotted
    if title == 'Predicted protein disorder':
        if show_both:
            title = 'Predicted protein disorder / AF2pLDDT'
        elif show_plddt:
            title = 'Predicted protein AF2pLDDT scores'

    # set the title
    axes.set_title(title)

    # modify y_label if needed
    if show_plddt and not show_disorder:
        axes.set_ylabel("AF2 ppLDDT scores")
    else:
        axes.set_ylabel("Consensus Disorder")

    # make x values for each residue (1-based positions)
    xValues = np.arange(1, n_res + 1)

    # graph the disorder values of each residue along the x-axis
    if show_disorder:
        ds1, = axes.plot(xValues,
                         yValues,
                         color=disorder_line_color,
                         linewidth='1.6',
                         label='Disorder Scores')

    # set x limit as the number of residues
    axes.set_xlim(1, n_res + 1)

    # horizontal guide lines are drawn slightly beyond the x limits
    guide_x = [0, n_res + 2]

    if show_disorder:
        # y limit is 0-1 since the predictor data is normalized from 0 to 1
        axes.set_ylim(-0.003, 1.003)

        # plot the disorder cutoff threshold; a custom dash pattern is used
        # when the pLDDT threshold line will be drawn as well
        threshold_style = (0, (5, 5)) if show_plddt else "dashed"
        ds2, = axes.plot(guide_x, [disorder_threshold, disorder_threshold],
                         color=threshold_line_color,
                         linewidth="1.25",
                         linestyle=threshold_style,
                         label='Disorder Threshold')

        # add dashed guide lines at 0.2 intervals
        guide_levels = [0.2, 0.4, 0.6, 0.8]
    else:
        # if it will just be confidence scores, set to 0 to 100
        axes.set_ylim(0, 100)

        # plot threshold
        axes.plot(guide_x, [50, 50],
                  color=confidence_threshold_color,
                  linewidth="1.25",
                  linestyle="dashed",
                  label='Confidence Threshold')

        # add dashed guide lines at intervals of 20
        guide_levels = [20, 40, 60, 80]

    for level in guide_levels:
        axes.plot(guide_x, [level, level],
                  color="black",
                  linestyle="dashed",
                  linewidth="0.5")

    # if we want shaded regions
    if shaded_regions is not None:
        one_color_per_region = len(shaded_region_color) == len(shaded_regions)

        for region_idx, cur_boundary in enumerate(shaded_regions):
            start = cur_boundary[0]
            end = cur_boundary[1]

            # use the matching color if one was given per region, otherwise
            # fall back to the first color
            if one_color_per_region:
                cur_color = shaded_region_color[region_idx]
            else:
                cur_color = shaded_region_color[0]

            axes.axvspan(start, end, alpha=0.2, color=cur_color, linewidth=0)

    if show_plddt:
        # imported lazily so alphaPredict is only required when pLDDT
        # scores are actually requested
        from alphaPredict import alpha

        # get confidence scores
        confidence_values = alpha.predict(sequence)

        if show_disorder:
            # confidence goes on a second y-axis that runs from 0 to 100
            twin1 = axes.twinx()
            af1, = twin1.plot(xValues,
                              confidence_values,
                              color=confidence_line_color,
                              label="Predicted AF2pLDDT")
            twin1.set_ylim(0, 100)
            twin1.set_ylabel('Predicted AF2pLDDT Scores')

            # the pLDDT threshold is drawn on the disorder axis at 0.5; the
            # dash offset of 5 interleaves it with the disorder threshold
            # line's dashes
            af2, = axes.plot(guide_x, [0.5, 0.5],
                             color=confidence_line_color,
                             linewidth="1.25",
                             linestyle=(5, (5, 5)),
                             label='AF2pLDDT Threshold')

            axes.legend(handles=[ds1, ds2, af1, af2],
                        bbox_to_anchor=(1.14, 1),
                        loc='best',
                        prop={'size': 12})
        else:
            # plot the confidence scores
            # NOTE(review): this trace carries the label 'Disorder Scores'
            # although it shows pLDDT values; no legend is drawn in this
            # mode so it is not visible - confirm before renaming.
            axes.plot(xValues,
                      confidence_values,
                      color=confidence_line_color,
                      linewidth='1.6',
                      label='Disorder Scores')

    if output_file is None:
        plt.show()
    else:
        plt.savefig(fname=output_file, dpi=DPI)

    plt.close()