def __init__(self, filename, delimiter='\t', skip_bad=True):
        """
        Read a delimited attributes file and store the parsed content in self.data.

        Each line of the file is expected to have the following format:

        Unique_ID, key1:value1, key2:value2, ..., keyn:valuen

        After construction, self.data maps every Unique_ID to a list of attribute
        dictionaries (one dictionary per retained line). Lines that contain only a
        Unique_ID and no key:value fields are ignored.

        Parameters
        -------------
        filename : string
            Path of the file to read.

        delimiter : string
            Field separator (default is a tab). Cannot be ':' because that character
            separates keys from values.

        skip_bad : bool (default = True)
            If True, a line whose Unique_ID cannot be extracted triggers a warning and
            is skipped. If False, an InterfaceException is raised instead.

        """

        # ':' is reserved for key/value separation
        if delimiter == ':':
            raise InterfaceException(
                'When parsing domain file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)'
            )

        with open(filename, 'r') as fh:
            content = fh.readlines()

        parsed = {}

        for lineno, raw_line in enumerate(content, start=1):

            fields = raw_line.strip().split(delimiter)

            try:
                uid = fields[0].strip()
            except Exception as e:

                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, lineno, str(e), raw_line)

                if not skip_bad:
                    raise InterfaceException(msg)

                shephard_exceptions.print_warning(
                    msg + "\nSkipping this line...")
                continue

            # nothing beyond the Unique_ID means there is nothing to record
            if len(fields) <= 1:
                continue

            attribute_dict = interface_tools.parse_key_value_pairs(
                fields[1:], filename, lineno, raw_line)

            parsed.setdefault(uid, []).append(attribute_dict)

        self.data = parsed
示例#2
0
def parse_key_value_pairs(split_line, filename, linecount, line):
    """
    Helper function for parsing input files that have attributes (for Domains and Sites).

    The function takes in a list of key-value pairs (where key and value are separated by
    a ':' symbol) and returns a parsed dictionary. Keys and values are stripped of
    surrounding whitespace and values are always strings. Only the FIRST ':' in an entry
    is treated as the separator, so a value may itself contain ':' characters. If the
    same key appears more than once the last occurrence wins.

    Parameters
    -------------
    split_line : list of strings
        Each string in the list should have the format <KEY> : <VALUE>

    filename : string
        Name of the file the calling function is parsing. Only used when raising an exception.

    linecount : int
        Current line number the file processing is on. Only used when raising an exception.

    line : string
        Full line that the file processing is on. Again, only used when raising an exception

    Returns
    ---------
    dict
        Dictionary mapping each key to its (string) value. Empty if split_line is empty.

    Raises
    ---------
    InterfaceException
        If an entry is not a string or does not contain a ':' separator.

    """

    attributes = {}

    for entry in split_line:

        try:
            # partition splits on the first ':' only, which keeps values such as
            # 'http://...' intact
            key, separator, value = entry.partition(':')
        except (AttributeError, TypeError):
            # entry was not a string; handled by the check below
            separator = ''

        # an empty separator means no ':' was found, i.e. there is no value
        if not separator:
            raise InterfaceException('Failed parsing key-value pairs in file [%s] on line [%i]... line printed below:\n%s'%(filename, linecount, line))

        attributes[key.strip()] = value.strip()

    return attributes
            
示例#3
0
def check_proteome(p, function_name):
    """
    Validate that the passed object is a Proteome.

    The check compares the string representation of the object's class against
    the expected Proteome class string. If they match the function returns None,
    otherwise an exception is raised whose message includes the name of the
    calling function.

    Parameters
    --------------
    p : object
        Object to be tested.

    function_name : string
        Name of the calling function, included in the error message to make
        debugging easier.

    Returns
    ---------
    None
        Nothing is returned when the object is a Proteome.

    Raises
    ---------
    InterfaceException
        If the passed object is not a Proteome object.

    """
    class_string = str(p.__class__)

    if class_string != "<class 'shephard.proteome.Proteome'>":
        raise InterfaceException('First argument passed to function [%s] was not a proteome' %(function_name))

    return None
示例#4
0
def apply_track_residue_density(proteome,
                                residue_set,
                                name='residue_density',
                                block_size=30,
                                safe=True):
    """

    This is an apply_track_* function that generates a values-track that describes the local density of amino
    acids along the sequence. The set of amino acids to be used is defined by the residue_set,
    while name and block_size are optional parameters.

    Density is calculated by counting the occurrences of the residues of interest within a block_size
    subsequence and dividing by the block_size. For single-character residues this yields a value
    between 0 and 1.

    The density computed for each window is padded with copies of the first value (block_size // 2 of them)
    and copies of the last value, so that the final track has exactly one value per residue.

    NB: If a protein sequence is shorter than the block_size then that protein gets skipped.

    Parameters
    ------------------

    proteome : proteome object
        This is the proteome object, and for each protein the residue density will be computed. Note
        that if this is not a proteome object this function will throw an exception.

    residue_set : list of single-character strings representing amino acids
        This is a list of amino acids for which the local density will be computed. This could be a single
        residue, or multiple residues. Residues are converted to upper case and THEN duplicates are removed,
        so ['s', 'S'] counts serine once. Beyond this no sanity check is done here.

    name : string
        The name defines the track name and allows the user to customize what they want to call this set of
        tracks. The name and the safe flag are passed straight through to protein.add_track().

    block_size : int (default = 30)
        This is the size of the subsequence over which density is calculated. Must be 1 or larger.

    safe : bool (default = True)
        Passed to protein.add_track(); intended to control whether an existing track of the same name
        triggers an error (True) or is overwritten (False).


    Return
    ----------

    No return value, but the proteome will have the set of tracks added to all proteins of sufficient
    length

    Raises
    ----------
    InterfaceException
        If block_size is smaller than 1, or if residue_set cannot be converted into a set of
        upper-case residues.

    """

    # validate that a proteome was actually passed
    interface_tools.check_proteome(
        proteome, 'apply_track_residue_density (si_localcider)')

    # a block must contain at least one residue, otherwise the density is undefined
    if block_size < 1:
        raise InterfaceException(
            'Error when applying "apply_track_residue_density" from the localcider interface. block_size must be 1 or larger but [%s] was passed'
            % (str(block_size)))

    # upper-case BEFORE removing duplicates, so that 's' and 'S' are not counted twice
    try:
        residues = {residue.upper() for residue in residue_set}
    except Exception:
        raise InterfaceException(
            'Error when applying "apply_track_residue_density" from the localcider interface. Could not convert the residue_set to a set. residue_set should be a list/tuple/set of amino acid residues which will be used to compute local density'
        )

    # now for each protein
    for protein in proteome:

        # get the protein sequence and number of residues. If there are fewer residues in
        # the protein than the block size then we skip this protein.
        seq = protein.sequence
        nres = len(seq)
        if nres < block_size:
            continue

        # one density value for every block_size window along the sequence
        density_vector = []
        for pos in range(0, (nres - block_size) + 1):
            subseq = seq[pos:pos + block_size]
            total = sum(subseq.count(residue) for residue in residues)
            density_vector.append(total / block_size)

        # leading/lagging values fill in the missing positions such that each density value
        # sits (approximately) half-way across its block and the track length equals nres
        n_leading = block_size // 2
        n_lagging = nres - (len(density_vector) + n_leading)

        final = ([density_vector[0]] * n_leading + density_vector +
                 [density_vector[-1]] * n_lagging)

        protein.add_track(name, values=final, safe=safe)
示例#5
0
def write_track(proteome,
                filename,
                track_name,
                value_fmt="%.3f",
                delimiter='\t'):
    """
    Function that writes out a specific track to file in a standardized format. Only a
    single type of track, defined by track_name, is written; proteins that do not have
    a track with that name are skipped.

    Each line written has the format

        unique_ID, track_name, val_1, val_2, ..., val_n

    with fields separated by the delimiter and no trailing delimiter. If the track has
    values these are written using value_fmt, otherwise the track's symbols are written.

    Parameters
    -----------
    proteome :  Proteome object
        Proteome object from which the tracks will be extracted. Any iterable of
        proteins works as far as this function is concerned.

    filename : str
        Filename that will be used to write the new tracks file

    track_name : str
        Name of the track to be written out.

    value_fmt : str
        Format string that will be used for values. Default = "%.3f". The format is only
        checked to confirm that the value 1.5 survives a format/parse round trip, so a
        format that loses precision for the actual values will not trigger a warning.

    delimiter : str
        Character (or characters) used to separate between fields. Default is a tab.

    Returns
    --------
    None
        No return type, but generates a new file with the requested track for every
        protein that has it.

    Raises
    --------
    InterfaceException
        If value_fmt cannot format a float, or the formatted string cannot be read
        back in as the same float.

    """

    # test the passed value_fmt string works. This is not foolproof but at least validates
    # that the string can format a float and be parsed back. Formatting can raise TypeError
    # or ValueError (bad format), and float() raises ValueError (extra text in the format)
    try:
        format_ok = float(value_fmt % (1.5)) == 1.5
    except (TypeError, ValueError):
        format_ok = False

    if not format_ok:
        raise InterfaceException('Invalid value_fmt passed [%s]' %
                                 (str(value_fmt)))

    with open(filename, 'w') as fh:

        for protein in proteome:

            # try and extract out the track in question
            t = protein.track(track_name, safe=False)
            if t is None:
                continue

            if t.values is not None:
                data_fields = [value_fmt % (v) for v in t.values]
            else:
                data_fields = [str(v) for v in t.symbols]

            # join (rather than append delimiter after each field) so the line has no
            # trailing delimiter and can be split cleanly on any delimiter
            fields = [str(protein.unique_ID), str(t.name)] + data_fields
            fh.write('%s\n' % (delimiter.join(fields)))
示例#6
0
    def __init__(self, filename, delimiter='\t', mode='values', skip_bad=True):
        """

        Class for reading in correctly formatted tracks files for parsing into a
        Proteome object.

        Tracks files must adhere to the following specification

            unique_ID, track_name, val_1, val_2, ...., val_n

        where n = length of protein.

        This class allows a tracks file to be read in as either a values track file
        or a symbols track file. After construction, self.data maps every unique_ID
        to a list of dictionaries, each with the keys 'track_name' and 'track_data'.

        Parameters
        -------------
        filename : string
            Path of the tracks file to read.

        delimiter : string
            Field separator (default is a tab).

        mode : string (default = 'values')
            Either 'values' (every track element is converted to a float) or
            'symbols' (every track element is kept as a whitespace-stripped string).

        skip_bad : bool (default = True)
            If True, a line that cannot be parsed triggers a warning and is skipped.
            If False, an InterfaceException is raised instead.

        Raises
        -------------
        InterfaceException
            If mode is neither 'values' nor 'symbols', or if a line cannot be parsed
            and skip_bad is False.

        """

        # validate the mode once, before reading anything. An invalid mode is a caller
        # error, not a bad line, so it must not be swallowed by skip_bad
        if mode not in ('values', 'symbols'):
            raise InterfaceException(
                "Error: mode passed = %s, yet this does not match 'symbols' or 'values'"
                % (mode, ))

        with open(filename, 'r') as fh:
            content = fh.readlines()

        ID2track = {}

        # cycle over every line in the file
        for linecount, line in enumerate(content, start=1):

            # chop off leading/lagging whitespace and divide up using the delimiter
            sline = line.strip().split(delimiter)

            try:

                # extract track name and unique_id
                unique_ID = sline[0].strip()
                track_name = sline[1].strip()

                # parse track values or symbols
                if mode == 'values':
                    track_data = [float(i.strip()) for i in sline[2:]]
                else:
                    track_data = [i.strip() for i in sline[2:]]

            except (IndexError, ValueError) as e:
                # IndexError: fewer than two fields. ValueError: non-numeric value

                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, linecount, str(e), line)

                if skip_bad:
                    shephard_exceptions.print_warning(
                        msg + "\nSkipping this line...")
                    continue
                else:
                    raise InterfaceException(msg)

            ID2track.setdefault(unique_ID, []).append({
                'track_name': track_name,
                'track_data': track_data
            })

        self.data = ID2track
示例#7
0
    def __init__(self, filename, delimiter='\t', skip_bad=True):
        """
        Read a delimited proteins file and store the parsed content in self.data.

        Expect files of the following format:

        Unique_ID, name, sequence, key1:value1, key2:value2, ..., keyn:valuen

        NOTE that each unique_ID can ONLY appear once!

        After construction, self.data maps every Unique_ID to a dictionary with the
        keys 'name', 'sequence' and 'attributes' (the latter is an empty dictionary
        if no key:value fields were present).

        Parameters
        -------------
        filename : string
            Path of the file to read.

        delimiter : string
            Field separator (default is a tab). Cannot be ':' because that character
            separates keys from values.

        skip_bad : bool (default = True)
            If True, a line with fewer than three fields triggers a warning and is
            skipped. If False, an InterfaceException is raised instead.

        Raises
        -------------
        InterfaceException
            If the delimiter is ':', if a line cannot be parsed and skip_bad is False,
            or if the same Unique_ID appears more than once (this is never skipped).

        """

        if delimiter == ':':
            raise InterfaceException(
                'When parsing protein file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)'
            )

        with open(filename, 'r') as fh:
            content = fh.readlines()

        ID2protein = {}

        for linecount, line in enumerate(content, start=1):

            sline = line.strip().split(delimiter)

            try:
                unique_ID = sline[0].strip()
                name = sline[1].strip()
                sequence = sline[2].strip()
            except IndexError as e:
                # fewer than the three required fields on this line

                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, linecount, str(e), line)

                if skip_bad:
                    shephard_exceptions.print_warning(
                        msg + "\nSkipping this line...")
                    continue
                else:
                    raise InterfaceException(msg)

            # if some key/value pairs were included then parse these out, otherwise
            # the protein simply has no attributes
            if len(sline) > 3:
                attributes = interface_tools.parse_key_value_pairs(
                    sline[3:], filename, linecount, line)
            else:
                attributes = {}

            if unique_ID in ID2protein:
                raise InterfaceException(
                    "Duplicate protein found in the file %s (offending UID=%s). This cannot be skipped"
                    % (filename, unique_ID))

            ID2protein[unique_ID] = {
                'name': name,
                'sequence': sequence,
                'attributes': attributes
            }

        self.data = ID2protein
示例#8
0
    def __init__(self, filename, delimiter='\t', skip_bad=True):
        """
        Read a delimited domains file and store the parsed content in self.data.

        Expect files of the following format:

        Unique_ID, start, stop, domain_type, key1:value1, key2:value2, ..., keyn:valuen

        The first four fields are required, while all of the key:value pairs are
        optional. Keys and values must be separated by a ':', but any delimiter
        (other than ':') is allowed between fields.

        After construction, self.data maps every Unique_ID to a list of domain
        dictionaries. Each domain dictionary has the following key-value pairs:

        start                : int (domain start position)
        end                  : int (domain end position)
        domain_type          : string (domain type)
        attributes           : dictionary of key-value pairs parsed from the
                               optional fields (empty if none were given)

        Parameters
        -------------
        filename : string
            Path of the file to read.

        delimiter : string
            Field separator (default is a tab).

        skip_bad : bool (default = True)
            If True, a line whose required fields cannot be parsed triggers a
            warning and is skipped. If False, an InterfaceException is raised.

        """

        # ':' is reserved for key/value separation
        if delimiter == ':':
            raise InterfaceException(
                'When parsing domain file cannot use ":" as a delimiter because this is used to delimit key/value pairs (if provided)'
            )

        with open(filename, 'r') as fh:
            content = fh.readlines()

        domains_by_id = {}

        for lineno, raw_line in enumerate(content, start=1):

            fields = raw_line.strip().split(delimiter)

            # the four required fields
            try:
                uid = fields[0].strip()
                domain_start = int(fields[1].strip())
                domain_end = int(fields[2].strip())
                dtype = fields[3].strip()
            except Exception as e:

                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, lineno, str(e), raw_line)

                if not skip_bad:
                    raise InterfaceException(msg)

                shephard_exceptions.print_warning(
                    msg + "\nSkipping this line...")
                continue

            # anything after the fourth field is treated as key:value pairs
            if len(fields) > 4:
                attribute_dict = interface_tools.parse_key_value_pairs(
                    fields[4:], filename, lineno, raw_line)
            else:
                attribute_dict = {}

            record = {
                'start': domain_start,
                'end': domain_end,
                'domain_type': dtype,
                'attributes': attribute_dict
            }
            domains_by_id.setdefault(uid, []).append(record)

        self.data = domains_by_id
示例#9
0
def add_sites_from_dictionary(proteome,
                              sites_dictionary,
                              safe=True,
                              verbose=False):
    """
    Function that takes a correctly formatted Sites dictionary and will add those
    sites to the proteins in the Proteome.

    Sites dictionaries are key-value pairs, where the key is a unique_ID associated
    with a given protein, and the value is a list of dictionaries. Each sub-dictionary has
    the following elements

    'position' = site position
    'site_type' = site type
    'symbol' = site symbol
    'value' = site value
    'attributes' = site attribute dictionary (optional; defaults to an empty dictionary)

    In this way, each site that maps to a given unique_ID will be added to the associated
    protein.

    Parameters
    -------------

    proteome : Proteome
        Proteome object to which we're adding sites. Note that ONLY sites for which a protein
        is found will be used. Protein-Site cross-referencing is done using the protein's unique_ID
        which should be the key used in the sites_dictionary

    sites_dictionary : dict
        Dictionary that maps a unique_ID to a list of site dictionaries as described above.
        Values are passed to protein.add_site() as-is (no type casting is done in this
        function). Extra key-value pairs in each sub-dictionary are ignored.

    safe : boolean
        Controls what happens when protein.add_site() raises a ProteinException. If True,
        shephard_exceptions.print_and_raise_error() is called with a message and the
        exception. If False the site is skipped, and a warning is printed only if
        verbose=True. It is highly recommended that if you choose to use safe=False you
        also set verbose=True.
        Default = True

    verbose : boolean
        Flag that defines how 'loud' output is. Will warn about skipped sites when
        safe=False.

    Returns
    ---------
    None
        No return value, but adds all of the passed sites to the protein

    Raises
    ---------
    InterfaceException
        If a site entry is missing one of the required keys ('position', 'site_type',
        'symbol', 'value').

    """

    for protein in proteome:

        if protein.unique_ID not in sites_dictionary:
            continue

        for site in sites_dictionary[protein.unique_ID]:

            try:
                position = site['position']
                site_type = site['site_type']
                symbol = site['symbol']
                value = site['value']

                # attributes are optional
                try:
                    ad = site['attributes']
                except KeyError:
                    ad = {}
            except Exception:
                raise InterfaceException(
                    'When sites dictionary for key [%s] was unable to extract five distinct parametes. Entry is:\n%s\n'
                    % (protein.unique_ID, site))

            # assuming we can read all the params try and add the site
            try:
                protein.add_site(position,
                                 site_type,
                                 symbol,
                                 value,
                                 attributes=ad)

            except ProteinException as e:
                # position is not cast in this function so it may be a string; format
                # it with %s so that building the message cannot itself fail
                msg = '- skipping site %s at %s on %s' % (
                    site_type, position, protein)
                if safe:
                    shephard_exceptions.print_and_raise_error(msg, e)
                elif verbose:
                    shephard_exceptions.print_warning(msg)
示例#10
0
    def __init__(self, filename, delimiter='\t', skip_bad=True):
        """
        Read a delimited sites file and store the parsed content in self.data.

        Expect files of the following format:

        Unique_ID, position, site type, symbol, value, key1:value1, key2:value2, ..., keyn:valuen

        The first five fields are required, while all of the key:value pairs are
        optional. Keys and values must be separated by a ':', but any delimiter
        (other than ':') is allowed between fields.

        After construction, self.data maps every Unique_ID to a list of site
        dictionaries with the keys 'position' (int), 'site_type' (string),
        'symbol' (string), 'value' (float) and 'attributes' (dictionary, empty
        if no key:value fields were given).

        Parameters
        -------------
        filename : string
            Path of the file to read.

        delimiter : string
            Field separator (default is a tab).

        skip_bad : bool (default = True)
            If True, a line whose required fields cannot be parsed triggers a
            warning and is skipped. If False, an InterfaceException is raised.

        """

        # ':' is reserved for key/value separation
        if delimiter == ':':
            raise InterfaceException(
                'When parsing site file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)'
            )

        with open(filename, 'r') as fh:
            content = fh.readlines()

        sites_by_id = {}

        for lineno, raw_line in enumerate(content, start=1):

            fields = raw_line.strip().split(delimiter)

            # the five required fields
            try:
                uid = fields[0].strip()
                site_position = int(fields[1].strip())
                stype = fields[2].strip()
                site_symbol = fields[3].strip()
                site_value = float(fields[4].strip())
            except Exception as e:
                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, lineno, str(e), raw_line)

                if not skip_bad:
                    raise InterfaceException(msg)

                shephard_exceptions.print_warning(
                    msg + "\nSkipping this line...")
                continue

            # anything after the fifth field is treated as key:value pairs
            if len(fields) > 5:
                attribute_dict = interface_tools.parse_key_value_pairs(
                    fields[5:], filename, lineno, raw_line)
            else:
                attribute_dict = {}

            record = {
                'position': site_position,
                'site_type': stype,
                'symbol': site_symbol,
                'value': site_value,
                'attributes': attribute_dict
            }
            sites_by_id.setdefault(uid, []).append(record)

        self.data = sites_by_id