예제 #1
0
def valid_shaded_region(shaded_regions, n_res):
    """
    Check that every requested shaded region lies inside the sequence.

    Parameters
    ------------
    shaded_regions : list of lists or None
        A list of lists, where sub-elements are of length 2 and contain start
        and end values for regions to be shaded, e.g.
        shaded_regions=[[1,10],[40,50]] shades between 1 and 10 and then
        between 40 and 50. If None, there is nothing to validate and the
        function returns immediately.

    n_res : int
        Number of residues in the sequence. Start and end positions are
        accepted when they lie in the closed interval [1, n_res + 1].

    Returns
    ---------
    None
        No return value, but raises an exception if the shaded region info
        is out of range or cannot be parsed.

    Raises
    --------
    MetapredictError
        If a start or end position is out of range, or if shaded_regions
        cannot be iterated/indexed as described above. Any underlying error
        is wrapped, with its text appended to the message.

    """

    if shaded_regions is None:
        return

    # largest position that is still accepted
    upper_bound = n_res + 1

    # Everything (including malformed input such as non-indexable elements)
    # is funnelled into a single MetapredictError by the handler below.
    try:
        for region in shaded_regions:

            if region[0] < 1 or region[0] > upper_bound:
                raise MetapredictError(
                    f'Invalid start position in shaded_regions: {region[0]}')

            # report the offending END value (previously the start value
            # was printed here by mistake)
            if region[1] < 1 or region[1] > upper_bound:
                raise MetapredictError(
                    f'Invalid end position in shaded_regions: {region[1]}')

    except Exception as e:
        raise MetapredictError(
            'Error in parsing shaded_regions - full error below\n\n%s' %
            (str(e))) from e
예제 #2
0
def fetch_sequence(uniprot_id, return_full_id=False):
    """
    Fetch the amino acid sequence for a UniProt accession over HTTP.

    The FASTA record is requested from uniprot.org, the response body is
    decoded as text, the first line is taken as the FASTA header and all
    remaining lines are concatenated to form the sequence.

    Note that the test for success is still heuristic: the request is
    treated as failed if the server answers with an HTTP error status, if no
    sequence lines are present, or if the string "Sorry" appears in the
    sequence part of the response.

    Parameters
    --------------
    uniprot_id : str
        Uniprot accession number

    return_full_id : bool
        Whether to return the full uniprot ID. If set to True,
        returns a list where the first element is the full FASTA header
        line, the second element is the sequence, and the third element is
        the accession that was passed in.

    Returns
    -----------
    str or list
        The amino acid string, or [header, sequence, uniprot_id] if
        return_full_id is set.

    Raises
    -----------
    MetapredictError
        If the sequence could not be retrieved.

    """

    http = urllib3.PoolManager()
    r = http.request('GET',
                     'https://www.uniprot.org/uniprot/%s.fasta' % (uniprot_id))

    fetch_error = ('Error: unable to fetch UniProt sequence with accession %s'
                   % (uniprot_id))

    if r.status >= 400:
        raise MetapredictError(fetch_error)

    # Decode the payload instead of parsing the repr() of the bytes object;
    # this keeps quotes/apostrophes in protein names intact and avoids
    # stray quote characters at the end of the sequence.
    lines = r.data.decode('utf-8', errors='replace').splitlines()

    header = lines[0] if lines else ''
    sequence = ''.join(lines[1:])

    # an empty body (or a header with no sequence) is a failed fetch
    if not sequence or 'Sorry' in sequence:
        raise MetapredictError(fetch_error)

    if return_full_id:
        return [header, sequence, uniprot_id]

    return sequence
예제 #3
0
def write_csv(input_dict, output_file):
    """
    Write the scores in an input dictionary out to a standardized CSV file.

    Each dictionary entry becomes one line: the key (with any commas replaced
    by spaces) followed by every score formatted to three decimal places,
    with fields separated by ', '.

    Parameters
    -----------
    input_dict : dict
        Dictionary where keys are headers/identifiers (str) and each value is
        an iterable of per-residue disorder scores

    output_file : str
        Location and filename for the output file. Assumes .csv is provided.

    Returns
    --------
    None
        No return value, but writes a .csv file to disk

    Raises
    --------
    MetapredictError
        If the output file cannot be opened for writing.

    """

    # try and open the file and throw exception if anything goes wrong
    try:
        fh = open(output_file, 'w')
    except Exception as e:
        raise MetapredictError('Unable to write to file destination %s' %
                               (output_file)) from e

    # the context manager guarantees the handle is flushed and closed, even
    # if formatting one of the scores fails part-way through
    with fh:
        for idx, scores in input_dict.items():

            # important otherwise commas in FASTA headers render the CSV
            # file unreadable!
            no_comma = idx.replace(',', ' ')
            fh.write('%s' % (no_comma))

            # for each score write
            for score in scores:
                fh.write(', %1.3f' % (score))
            fh.write('\n')
예제 #4
0
def validate_options(option, valid_list):
    """
    Raise an exception unless $option is a member of $valid_list.

    Parameters
    ----------
    option : str
        Option that has been passed

    valid_list : list of str
        Collection of strings which $option is expected to be one of

    Returns
    ---------
    None
        No return value, but raises MetapredictError if option is not found
        in valid_list

    Raises
    ---------
    MetapredictError
        If option is not contained in valid_list

    """
    # nothing to do for an accepted option
    if option in valid_list:
        return

    allowed = str(valid_list)
    message = 'Expected one of %s but only option passed was %s' % (allowed,
                                                                    option)
    raise MetapredictError(message)
예제 #5
0
def valid_range(inval, minval, maxval):
    """
    Raise an exception if $inval lies outside the closed range
    [$minval, $maxval].

    Parameters
    ----------
    inval : float
        Value to be checked

    minval : float
        Smallest accepted value (inclusive)

    maxval : float
        Largest accepted value (inclusive)

    Returns
    ---------
    None
        No return value, but raises MetapredictError if inval is below
        minval or above maxval

    """
    too_small_or_large = inval < minval or inval > maxval

    # value is acceptable, nothing more to do
    if not too_small_or_large:
        return

    bounds_report = (inval, minval, maxval)
    raise MetapredictError(
        'Value %1.3f is outside of range [%1.3f, %1.3f]' % bounds_report)
예제 #6
0
def write_caid_format(input_dict, output_file):
    '''
    Function that takes in a dictionary and outputs a file in the format as
    specified by IDPcentral Critical Assessment of Intrinsic protein Disorder
    (CAID). Format is as follows -
        output is a plain text output where the
        prediction has an entry header >entry_id header, similar to the beginning
        of a .fasta file
        Every line following the entry_id contains tab separated columns with columns
        ordered as follows - 1) residue number, 2) residue name, 3) confidence score,
        4) binary classification where 1 = disordered and 0 = not disordered.

    Example (from idpcentral.org/caid):
        >P04637
        1    M    0.892    1
        2    E    0.813    1

    Parameters
    ----------
    input_dict : dict
        input dictionary of disorder scores. The key should be the
        entry_id as a string; it is written verbatim as the entry header
        line (no '>' is added by this function). The associated value should
        be a list where the first element is the corresponding sequence as a
        string and the second item is the corresponding predictions as float
        values. For backwards compatibility the first element may also be a
        container whose first item is the sequence string.

    output_file : str
        Location and filename of the file to write.

    Returns
    -------
    None
        Does not return anything to the user. Writes a file to output_file.

    Raises
    ------
    MetapredictError
        If output_file cannot be opened for writing.

    '''

    # attempt to write to output file, raise MetapredictError if unable to
    try:
        current_output = open(output_file, 'w')
    except Exception as e:
        raise MetapredictError(f'Unable to write to {output_file}') from e

    # the context manager makes sure the handle is flushed and closed
    with current_output:
        for cur_id, entry in input_dict.items():
            cur_sequence = entry[0]
            # Accept both [sequence, scores] (as documented) and the wrapped
            # [[sequence], scores] layout. Indexing a plain string with [0]
            # would silently reduce the sequence to its first residue.
            if not isinstance(cur_sequence, str):
                cur_sequence = cur_sequence[0]
            cur_scores = entry[1]

            # write entry id
            current_output.write(f'{cur_id}\n')

            # for each residue write the position, residue, score, and
            # classification (scores are indexed rather than zipped so that
            # a too-short score list still raises instead of truncating)
            for res_index, cur_residue in enumerate(cur_sequence):
                cur_score = cur_scores[res_index]
                cur_binary = get_binary_prediction(cur_score, cutoff_value=0.5)
                # write as tsv the caid formatted info
                current_output.write(
                    f'{res_index+1}\t{cur_residue}\t{cur_score}\t{cur_binary}\n'
                )
예제 #7
0
def seq_from_name(name):
    '''
    Function to get the sequence of a protein from the name.

    The name is used as a UniProt search query. Reviewed entries are searched
    first; if that yields no results the search is repeated without the
    reviewed filter. The accession of the first hit on the results page is
    then passed to fetch_sequence.

    Parameters
    ----------
    name: string
        A string that carries the details of the protein to search for. Can
        contain the name of the protein as well as the name of the organism.
            ex. ARF19
                Arabidopsis ARF19

                p53
                Human p53

    Returns
    -------
    list
        The value of fetch_sequence(top_hit, return_full_id=True) for the
        top hit on the UniProt website.

    Raises
    ------
    MetapredictError
        If neither search returns a result, or if no hit can be located in
        the returned page.
    '''

    no_results = b'Sorry, no results found for your search term.'

    # words of the name are joined with URL-encoded spaces
    query = '%20'.join(name.split(' '))
    base_url = f'https://www.uniprot.org/uniprot/?query={query}'

    http = urllib3.PoolManager()

    # first try only the reviewed proteins
    r = http.request('GET', f'{base_url}&fil=reviewed%3Ayes&sort=score')

    if no_results in r.data:
        # fall back to a search that does not filter for reviewed proteins
        r = http.request('GET', f'{base_url}&sort=score')

        if no_results in r.data:
            raise MetapredictError(
                'Sorry! We were not able to find the protein corresponding to that name.'
            )

    # the uniprot IDs on the results page follow the text 'checkbox_'
    parsed_data = r.data.split(b'checkbox_')
    if len(parsed_data) < 2:
        raise MetapredictError(
            'Unable to parse the UniProt search results for that name.')

    # take the top uniprot ID from the page: everything up to the first
    # double quote after the first 'checkbox_' marker
    first_hit = parsed_data[1].decode('utf-8', errors='replace')
    top_hit = first_hit.split('"')[0]

    return fetch_sequence(top_hit, return_full_id=True)
예제 #8
0
def meta_predict(sequence,
                 normalized=True,
                 network=brnn_network,
                 device=device,
                 encoding_scheme=encoding_scheme):
    """
    The actual executing function for predicting the disorder of a sequence using metapredict.
    Returns a list containing predicted disorder values for the input sequence.

    Parameters:
    ------------
    sequence : str
        The amino acid sequence to be predicted

    normalized : bool
        Flag which defines if normalization should occur or not. By default,
        negative values are set to be equal to 0 and values greater than 1
        are set to be equal to 1. User can set normalized=False to get raw
        prediction values.
        Default = True

    network : Pytorch network
        Defines the Pytorch network to be used. Alternative networks can be
        provided in principle, but in practice metapredict has been trained
        on a specific network. Default = network loaded by metapredict.

    device : str
        String describing where the network is physically stored on the computer.
        Not referenced inside this function; kept for interface
        compatibility. Default = module-level device.

    encoding_scheme : str
        String that defines the encoding scheme used when metapredict was
        trained. Only 'onehot' is accepted by this function.
        Default = module-level encoding_scheme.

    Returns:
    ----------
    list
        Returns a list with a per-residue disorder score, each rounded to
        four decimal places. When normalized, values below 0 are replaced
        by 0 and values above 1 are replaced by 1.

    Raises:
    ----------
    MetapredictError
        If encoding_scheme is anything other than 'onehot'.

    """

    # set seq_vector equal to the encoded amino acid sequence
    if encoding_scheme == 'onehot':
        seq_vector = encode_sequence.one_hot(sequence)
    else:
        # the f prefix used to sit inside the quotes, so the scheme name was
        # never interpolated into the message
        raise MetapredictError(
            f'Cannot understand encoding scheme [{encoding_scheme}]')

    # add a leading dimension of size 1 before passing to the network
    seq_vector = seq_vector.view(1, len(seq_vector), -1)

    # get output values from the seq_vector based on the network
    outputs = network(seq_vector.float()).detach().numpy()[0]

    # convert each predicted value to a float rounded to four digits
    output_values = [round(float(value), 4) for value in outputs]

    if not normalized:
        return output_values

    # clamp into [0, 1]; in-range values are passed through untouched
    return [
        0 if value < 0 else 1 if value > 1 else value
        for value in output_values
    ]
예제 #9
0
def graph(sequence,
          title='Predicted protein disorder',
          disorder_threshold=None,
          pLDDT_scores=False,
          disorder_scores=True,
          shaded_regions=None,
          shaded_region_color='red',
          disorder_line_color='blue',
          threshold_line_color='black',
          confidence_line_color='darkorange',
          confidence_threshold_color='black',
          DPI=150,
          output_file=None,
          legacy_metapredict=False):
    """
    Function for graphing predicted disorder. By default, this function will show a graph.
    However, you can specify output_file as the
    file path followed by the name of the saved file with the proper extension (.png by default).
    This is the backend for the meta.py graphing functions.

    Parameters
    -----------
    sequence : str
        Input amino acid sequence (as string) to be predicted.

    title : str
        Sets the title of the generated figure. If left at the default and
        pLDDT scores are requested, the title is adjusted to mention AF2pLDDT.
        Default = "Predicted protein disorder"

    disorder_threshold : float
        Sets a threshold which draws a horizontal line as a visual
        guide along the length of the figure. If None, 0.3 is used when
        legacy_metapredict is set and 0.5 otherwise. Default = None

    pLDDT_scores : Bool
        Sets whether to include the predicted confidence scores from
        AlphaFold2. Only a value equal to True switches this on.
        Default = False

    disorder_scores : Bool
        Whether to include disorder scores. Can set to False if you
        just want the AF2 confidence scores. Only a value equal to True
        switches this on. Default = True

    shaded_regions : list of lists
        A list of lists, where sub-elements are of length 2 and contain
        start and end values for regions to be shaded. Assumes that sanity
        checking on positions has already been done. Default = None.

    shaded_region_color : str or list of strs
        String or list of strings that defines the color of the shaded region.
        The region is always drawn with an alpha of 0.2 but the color can be
        any valid matplotlib color name or a hex color string (i.e. "#ff0000"
        is red). If a single string or a list of length 1 is passed then the
        color defined by that string (or the single element) is used. If a list
        of length = len(shaded_regions) is passed, then EACH shaded region is
        colored according to the correspondingly-indexed list element. If a
        mismatch between number of elements in shaded_regions and
        shaded_region_color is found an exception is raised. The count is
        only checked when shaded_regions is provided.

    disorder_line_color : str
        String that defines the color of the traced disorder score.  Can
        be any standard matplotlib color name or a hex-value (see above).
        Default = 'blue'.

    threshold_line_color : str
        String that defines the color of the disorder threshold line. Can
        be any standard matplotlib color name or a hex-value (see above).
        When both disorder and pLDDT scores are drawn, disorder_line_color
        is used instead. Default = 'black'.

    confidence_line_color : str
        String that defines the color of the traced pLDDT scores.
        Default = 'darkorange'.

    confidence_threshold_color : str
        String that defines the color of the threshold line drawn when only
        pLDDT scores are plotted. Default = 'black'.

    DPI : int
        Dots-per-inch. Defines the resolution of the generated .png figure.
        Note that if an alternative filetype is passed the matplotlib
        backend will automatically generate a file of the relevant type (e.g.
        .pdf, .jpg, or .eps).

    output_file : str
        If provided, the output_file variable defines the location and type
        of the file to be saved. This should be a file location and filename
        with a valid matplotlib extension (such as .png, .jpg, .pdf) and, if
        provided, this value is passed directly to the
        ``matplotlib.pyplot.savefig()`` function as the ``fname`` parameter.
        Default = None.

    legacy_metapredict : bool
        Whether or not to use the original version of metapredict for
        predicting disorder values.

    Returns
    -----------
    None
        No return type, but will either generate an on-screen plot OR will
        save a file to disk, depending on if output_file is provided (or not).

    Raises
    -----------
    MetapredictError
        If neither disorder nor pLDDT scores are requested, or if
        shaded_region_color has an invalid type or number of elements.

    """

    # Resolve the two switches once. Equality with True (rather than
    # truthiness) is kept on purpose so that only True/1 enable a trace.
    show_confidence = pLDDT_scores == True
    show_disorder = disorder_scores == True
    show_both = show_confidence and show_disorder

    # make sure confidence scores and disorder scores not both off
    if not (show_confidence or show_disorder):
        raise MetapredictError(
            'Cannot set both pLDDT_scores and disorder_scores to False. If disorder_scores=False, set pLDDT_scores=True.'
        )

    # if confidence scores also added, match the threshold_line_color to the
    # disorder_line_color
    if show_both:
        threshold_line_color = disorder_line_color
        confidence_threshold_color = confidence_line_color

    # set this such that PDF-generated figures become editable
    matplotlib.rcParams['pdf.fonttype'] = 42
    matplotlib.rcParams['ps.fonttype'] = 42

    # set n_res to length of seq
    n_res = len(sequence)

    # set yValues equal to the predicted disorder from the sequence and pick
    # the default threshold that belongs to the predictor being used
    if show_disorder:
        if legacy_metapredict == True:
            yValues = legacy_predict(sequence)
            if disorder_threshold is None:
                disorder_threshold = 0.3
        else:
            yValues = metameta_predict(sequence)
            if disorder_threshold is None:
                disorder_threshold = 0.5

    # the figure is keyed on the title as passed in (before any adjustment
    # of the default title below); a wider canvas leaves room for the legend
    if show_both:
        fig = plt.figure(num=title,
                         figsize=[11, 3],
                         dpi=DPI,
                         edgecolor='black')
        axes = fig.add_axes([0.1, 0.15, 0.55, 0.75])
    else:
        fig = plt.figure(num=title, figsize=[8, 3], dpi=DPI, edgecolor='black')
        axes = fig.add_axes([0.15, 0.15, 0.75, 0.75])

    # set x label
    axes.set_xlabel("Residue")

    # if the user did not set a title, adapt the default to what is plotted
    if title == 'Predicted protein disorder':
        if show_both:
            title = 'Predicted protein disorder / AF2pLDDT'
        elif show_confidence:
            title = 'Predicted protein AF2pLDDT scores'

    # set the title
    axes.set_title(title)

    # modify y_label if needed
    if show_confidence and not show_disorder:
        axes.set_ylabel("AF2 ppLDDT scores")
    else:
        axes.set_ylabel("Consensus Disorder")

    # make x values for each residue (1-based)
    xValues = np.arange(1, n_res + 1)

    # graph the disorder values of each residue at each point along the x-axis
    if show_disorder:
        ds1, = axes.plot(xValues,
                         yValues,
                         color=disorder_line_color,
                         linewidth='1.6',
                         label='Disorder Scores')

    # set x limit as the number of residues
    axes.set_xlim(1, n_res + 1)

    # horizontal guide lines are drawn slightly beyond the x limits
    guide_x = [0, n_res + 2]

    if show_disorder:

        # y limit is 0-1 (with a small margin) for disorder scores
        axes.set_ylim(-0.003, 1.003)

        # plot the disorder cutoff threshold; a custom dash pattern is used
        # when the pLDDT threshold line shares the plot
        threshold_style = (0, (5, 5)) if show_confidence else "dashed"
        ds2, = axes.plot(guide_x,
                         [disorder_threshold, disorder_threshold],
                         color=threshold_line_color,
                         linewidth="1.25",
                         linestyle=threshold_style,
                         label='Disorder Threshold')

        # add dashed guide lines at 0.2 intervals
        for i in [0.2, 0.4, 0.6, 0.8]:
            axes.plot(guide_x, [i, i],
                      color="black",
                      linestyle="dashed",
                      linewidth="0.5")

    else:

        # if it will just be confidence scores, set to 0 to 100
        axes.set_ylim(0, 100)

        # plot threshold
        ds2, = axes.plot(guide_x, [50, 50],
                         color=confidence_threshold_color,
                         linewidth="1.25",
                         linestyle="dashed",
                         label='Confidence Threshold')

        # add dashed guide lines at intervals of 20
        for i in [20, 40, 60, 80]:
            axes.plot(guide_x, [i, i],
                      color="black",
                      linestyle="dashed",
                      linewidth="0.5")

    # make sure the shaded_region_color variable makes sense
    if isinstance(shaded_region_color, str):
        shaded_region_color = [shaded_region_color]
    elif not isinstance(shaded_region_color, list):
        raise MetapredictError(
            'Invalid type passed as shaded_region_color. Expect a list of colors or a string'
        )
    elif (shaded_regions is not None
          and len(shaded_region_color) not in (1, len(shaded_regions))):
        # only compared against shaded_regions when regions were actually
        # passed; calling len() on None would raise a TypeError
        raise MetapredictError(
            'Invalid number of colors passed. If a list is used for shaded_region_color, then the number of elements must be either 1 OR equal the number of shaded regions'
        )

    # if we want shaded regions
    if shaded_regions is not None:
        one_color_per_region = len(shaded_region_color) == len(shaded_regions)

        for boundary, cur_boundary in enumerate(shaded_regions):
            if one_color_per_region:
                cur_color = shaded_region_color[boundary]
            else:
                cur_color = shaded_region_color[0]
            axes.axvspan(cur_boundary[0],
                         cur_boundary[1],
                         alpha=0.2,
                         color=cur_color,
                         linewidth=0)

    # if graphing both confidence and disorder
    if show_both:

        # import alpha predict
        from alphaPredict import alpha

        # get confidence scores
        confidence_values = alpha.predict(sequence)

        # confidence goes on a second y axis running from 0 to 100
        twin1 = axes.twinx()
        af1, = twin1.plot(xValues,
                          confidence_values,
                          color=confidence_line_color,
                          label="Predicted AF2pLDDT")
        twin1.set_ylim(0, 100)
        twin1.set_ylabel('Predicted AF2pLDDT Scores')
        # drawn on the primary (0-1) axis at 0.5; the dash offset of 5
        # differs from the disorder threshold line's offset of 0
        af2, = axes.plot(guide_x, [0.5, 0.5],
                         color=confidence_line_color,
                         linewidth="1.25",
                         linestyle=(5, (5, 5)),
                         label='AF2pLDDT Threshold')
        axes.legend(handles=[ds1, ds2, af1, af2],
                    bbox_to_anchor=(1.14, 1),
                    loc='best',
                    prop={'size': 12})

    elif show_confidence:

        # import alpha predict
        from alphaPredict import alpha

        # get confidence scores
        confidence_values = alpha.predict(sequence)

        # plot the confidence scores
        axes.plot(xValues,
                  confidence_values,
                  color=confidence_line_color,
                  linewidth='1.6',
                  label='Predicted AF2pLDDT')

    if output_file is None:
        plt.show()
    else:
        plt.savefig(fname=output_file, dpi=DPI)
        plt.close()