def __init__(self, filename, delimiter='\t', skip_bad=True):
        """
        Read a delimited attributes file and store the parsed result on
        ``self.data``.

        Each line is expected to have the following format:

        Unique_ID, key1:value1, key2:value2, ..., keyn:valuen

        Lines that carry nothing after the unique ID are ignored. For every
        other line the remaining fields are handed to
        ``interface_tools.parse_key_value_pairs`` and the value it returns
        is appended to the list stored under that unique ID.

        Parameters
        ----------
        filename : str
            Path of the file to read.

        delimiter : str
            Field separator used to split each line (default is a tab).
            ':' is rejected because it separates keys from values.

        skip_bad : bool
            Consulted only if extracting the unique ID raises. If True a
            warning is printed and the line is skipped, otherwise an
            InterfaceException is raised. Default = True.

        Raises
        ------
        InterfaceException
            If ``delimiter`` is ':', or if a line cannot be parsed and
            ``skip_bad`` is False.

        """

        if delimiter == ':':
            raise InterfaceException(
                'When parsing domain file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)'
            )

        with open(filename, 'r') as fh:
            all_lines = fh.readlines()

        uid_to_attribute_dicts = {}

        # line numbers are 1-based so they can be quoted in messages
        for line_number, raw_line in enumerate(all_lines, start=1):

            fields = raw_line.strip().split(delimiter)

            try:
                unique_ID = fields[0].strip()
            except Exception as e:

                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, line_number, str(e), raw_line)

                if not skip_bad:
                    raise InterfaceException(msg)

                shephard_exceptions.print_warning(
                    msg + "\nSkipping this line...")
                continue

            # an ID with no key/value fields contributes nothing
            if len(fields) <= 1:
                continue

            parsed = interface_tools.parse_key_value_pairs(
                fields[1:], filename, line_number, raw_line)

            # one list entry per retained line, grouped by unique ID
            uid_to_attribute_dicts.setdefault(unique_ID, []).append(parsed)

        self.data = uid_to_attribute_dicts
def add_protein_attributes_from_dictionary(proteome,
                                           protein_attribute_dictionary,
                                           safe=True,
                                           verbose=True):
    """
    Add the attributes described by a protein attribute dictionary to the
    matching proteins in a Proteome.

    A protein attribute dictionary maps a unique protein ID to a list of
    dictionaries. Every key:value pair found in each of those
    sub-dictionaries is added as an attribute of the protein with that
    unique ID. IDs that do not match a protein in the Proteome are never
    looked up.

    Parameters
    ----------
    proteome : Proteome Object
        Proteome object whose proteins will receive the attributes

    protein_attribute_dictionary : dict
        Dictionary keyed by unique protein ID, where each value is a list
        of dictionaries holding one (or more) attribute key:value pairs.

    safe : boolean
        Passed through to ``protein.add_attribute``. If a ProteinException
        is raised while adding an attribute and safe is True, the error is
        handed to ``shephard_exceptions.print_and_raise_error``. If safe is
        False the attribute in question is skipped.

        Default = True.

    verbose : boolean
        If True (and safe is False) a warning is printed for every
        attribute that is skipped.

    Returns
    -----------
    None
        No return value, but attributes are added to proteins in the
        Proteome object passed as the first argument

    """

    # check first argument is a Proteome
    interface_tools.check_proteome(
        proteome, 'add_protein_attributes (si_protein_attributes)')

    for protein in proteome:

        # only proteins with an entry in the dictionary are touched
        if protein.unique_ID not in protein_attribute_dictionary:
            continue

        # each element is its own attribute dictionary
        for attribute_dict in protein_attribute_dictionary[protein.unique_ID]:

            for key, value in attribute_dict.items():

                try:
                    protein.add_attribute(key, value, safe=safe)
                except ProteinException as e:
                    msg = '- skipping attribute entry on protein %s (key: %s) ' % (
                        protein.unique_ID, key)

                    if safe:
                        shephard_exceptions.print_and_raise_error(msg, e)
                    elif verbose:
                        shephard_exceptions.print_warning(msg)
Пример #3
0
def add_tracks_from_dictionary(proteome,
                               tracks_dictionary,
                               mode,
                               safe=True,
                               verbose=True):
    """
    Add the tracks described by a tracks dictionary to the matching
    proteins in a Proteome.

    A tracks dictionary maps a unique protein ID to a list of dictionaries,
    each of which provides two entries:

        'track_name' : name of the track
        'track_data' : the per-residue data for the track

    Parameters
    ----------

    proteome : Proteome Object
        Proteome object whose proteins will receive the tracks

    tracks_dictionary : dict
        Dictionary keyed by unique protein ID, where each value is a list
        of dictionaries with the 'track_name' and 'track_data' keys
        described above.

    mode : string {'symbols','values'}
        Selects how the track data is handed to ``protein.add_track``: as
        the ``values`` keyword when mode is 'values', and as the
        ``symbols`` keyword otherwise. The value is first checked with
        ``general_utilities.valid_keyword``.

    safe : bool (default = True)
        Passed through to ``protein.add_track``. If a ProteinException or
        TrackException is raised while adding a track and safe is True,
        the error is handed to
        ``shephard_exceptions.print_and_raise_error``. If safe is False
        the track in question is skipped.

    verbose : boolean
        If True (and safe is False) a warning is printed for every track
        that is skipped.

    Returns
    -----------
    None
        No return value, but tracks are added to the Proteome object
        passed as the first argument

    """

    # check first argument is a proteome
    interface_tools.check_proteome(proteome,
                                   'add_tracks_from_dictionary (si_tracks)')

    # check mode is valid
    general_utilities.valid_keyword('mode', mode, ['symbols', 'values'])

    # keyword under which the data is handed to add_track
    if mode == 'values':
        data_keyword = 'values'
    else:
        data_keyword = 'symbols'

    for protein in proteome:

        # only proteins with an entry in the dictionary are touched
        if protein.unique_ID not in tracks_dictionary:
            continue

        for entry in tracks_dictionary[protein.unique_ID]:

            track_name = entry['track_name']
            track_data = entry['track_data']

            add_track_kwargs = {data_keyword: track_data, 'safe': safe}

            try:
                protein.add_track(track_name, **add_track_kwargs)

            # an anticipated failure while attaching the track
            except (ProteinException, TrackException) as e:

                msg = '- skipping track at %s on %s' % (track_name,
                                                        protein)

                if safe:
                    shephard_exceptions.print_and_raise_error(msg, e)
                elif verbose:
                    shephard_exceptions.print_warning(msg)
Пример #4
0
    def __init__(self, filename, delimiter='\t', skip_bad=True):
        """
        Read a delimited protein file and store the parsed result on
        ``self.data``.

        Expect files of the following format:

        Unique_ID, name, sequence, key1:value1, key2:value2, ..., keyn:valuen

        NOTE that each unique_ID can ONLY appear once!

        ``self.data`` maps each unique ID to a dictionary with the keys
        'name', 'sequence' and 'attributes'. 'attributes' is an empty
        dictionary when the line has no key:value fields, otherwise it is
        whatever ``interface_tools.parse_key_value_pairs`` returns for them.

        Parameters
        ----------
        filename : str
            Path of the file to read.

        delimiter : str
            Field separator used to split each line (default is a tab).
            ':' is rejected because it separates keys from values.

        skip_bad : bool
            If True, a line with fewer than three fields triggers a warning
            and is skipped. If False such a line raises an
            InterfaceException. Default = True.

        Raises
        ------
        InterfaceException
            If ``delimiter`` is ':', if a line is malformed and
            ``skip_bad`` is False, or if a unique ID appears more than once
            (duplicates are never skipped, regardless of ``skip_bad``).

        """

        if delimiter == ':':
            raise InterfaceException(
                'When parsing protein file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)'
            )

        with open(filename, 'r') as fh:
            content = fh.readlines()

        ID2protein = {}

        # line numbers are 1-based so they can be quoted in messages
        for linecount, line in enumerate(content, start=1):

            sline = line.strip().split(delimiter)

            # the three leading fields are mandatory; a short line is the
            # only way these lookups can fail
            try:
                unique_ID = sline[0].strip()
                name = sline[1].strip()
                sequence = sline[2].strip()
            except IndexError as e:

                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, linecount, str(e), line)

                if skip_bad:
                    shephard_exceptions.print_warning(
                        msg + "\nSkipping this line...")
                    continue

                raise InterfaceException(msg)

            # if some key/value pairs were included then parse these out
            attributes = {}
            if len(sline) > 3:
                attributes = interface_tools.parse_key_value_pairs(
                    sline[3:], filename, linecount, line)

            # duplicates are fatal; report the ID that was just parsed
            if unique_ID in ID2protein:
                raise InterfaceException(
                    "Duplicate protein found in the file %s (offending UID=%s). This cannot be skipped"
                    % (filename, unique_ID))

            ID2protein[unique_ID] = {
                'name': name,
                'sequence': sequence,
                'attributes': attributes
            }

        self.data = ID2protein
Пример #5
0
    def __init__(self, filename, delimiter='\t', mode='values', skip_bad=True):
        """
        Read a correctly formatted tracks file and store the parsed result
        on ``self.data``.

        Tracks files must adhere to the following specification

            unique_ID, track_name, val_1, val_2, ...., val_n

        where n = length of protein.

        ``self.data`` maps each unique ID to a list of dictionaries, one
        per line, each with the keys 'track_name' and 'track_data'.

        Parameters
        ----------
        filename : str
            Path of the file to read.

        delimiter : str
            Field separator used to split each line (default is a tab).

        mode : string {'symbols','values'}
            In 'values' mode every data field is converted to a float. In
            'symbols' mode every data field is kept as a stripped string.
            Default = 'values'.

        skip_bad : bool
            If True, a line that cannot be parsed (too few fields, or a
            field that is not a float in 'values' mode) triggers a warning
            and is skipped. If False such a line raises an
            InterfaceException. Default = True.

        Raises
        ------
        InterfaceException
            If ``mode`` is neither 'symbols' nor 'values', or if a line
            cannot be parsed and ``skip_bad`` is False.

        """

        # An invalid mode is a caller error rather than a bad line, so it is
        # reported once, before reading anything, instead of being treated
        # as a parse failure on every line.
        if mode not in ('values', 'symbols'):
            raise InterfaceException(
                "Error: mode passed = %s, yet this does not match 'symbols' or 'values'"
                % (mode, ))

        with open(filename, 'r') as fh:
            content = fh.readlines()

        ID2track = {}

        # line numbers are 1-based so they can be quoted in messages
        for linecount, line in enumerate(content, start=1):

            # chop off surrounding whitespace and divide up using the delimiter
            sline = line.strip().split(delimiter)

            try:

                # extract unique_ID and track name
                unique_ID = sline[0].strip()
                track_name = sline[1].strip()

                # everything after the first two fields is track data
                if mode == 'values':
                    track_data = [float(i.strip()) for i in sline[2:]]
                else:
                    track_data = [i.strip() for i in sline[2:]]

            except (IndexError, ValueError) as e:

                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, linecount, str(e), line)

                if skip_bad:
                    shephard_exceptions.print_warning(
                        msg + "\nSkipping this line...")
                    continue

                raise InterfaceException(msg)

            # several tracks may share one unique ID
            ID2track.setdefault(unique_ID, []).append({
                'track_name': track_name,
                'track_data': track_data
            })

        self.data = ID2track
Пример #6
0
def add_proteins_from_dictionary(proteome,
                                 protein_dictionary,
                                 safe=True,
                                 verbose=True):
    """
    Function that takes a correctly formatted protein dictionary and will
    add those proteins to the Proteome.

    Protein dictionaries are key-value pairs, where the key is a unique ID
    and the value is itself a dictionary which has the following keys

    'name' = Protein name
    'sequence' = Amino acid sequence for the protein (no sanity checking
                 is done in this function)
    'attributes' = dictionary of arbitrary key:value pairings (optional)


    Parameters
    ----------
    proteome : Proteome Object
        Proteome object to which proteins will be added

    protein_dictionary : dict
        Dictionary that defines proteins. The keys are unique protein IDs
        and each value is a dictionary with the keys described above.

    safe : boolean
        Controls both overwriting and error handling. When safe is exactly
        False, proteins are added with ``force_overwrite=True``; for any
        other value ``force_overwrite=False`` is used. If adding a protein
        raises a ProteinException or ProteomeException and safe is true,
        the error is handed to
        ``shephard_exceptions.print_and_raise_error``; otherwise the
        protein in question is skipped.

        Default = True.

    verbose : boolean
        If True (and safe is false) a warning is printed for every protein
        that is skipped.

    Returns
    -----------
    None
        No return value, but proteins are added to the Proteome object
        passed as the first argument

    """

    # check first argument is a Proteome
    interface_tools.check_proteome(
        proteome, 'add_protein_from_dictionary (si_proteins)')

    # overwriting is only forced when safe was explicitly set to False
    force_overwrite = safe is False

    # for each entry in the overall dictionary
    for UID, entry in protein_dictionary.items():

        # attributes are optional; a missing key is the only anticipated
        # failure, so nothing broader is caught here
        try:
            ats = entry['attributes']
        except KeyError:
            ats = None

        s = entry['sequence']
        n = entry['name']

        try:
            proteome.add_protein(s,
                                 n,
                                 UID,
                                 attributes=ats,
                                 force_overwrite=force_overwrite)
        except (ProteinException, ProteomeException) as e:
            msg = '- skipping protein %s (name = %s, len=%i)' % (UID, n,
                                                                 len(s))
            if safe:
                shephard_exceptions.print_and_raise_error(msg, e)
            elif verbose:
                shephard_exceptions.print_warning(msg)
Пример #7
0
    def __init__(self, filename, delimiter='\t', skip_bad=True):
        """
        Read a delimited domains file and store the parsed result on
        ``self.data``.

        Each line is expected to have the following format:

        Unique_ID, start, stop, domain_type, key1:value1, key2:value2, ..., keyn:valuen

        The first four fields are required, while the key:value pairs are
        optional. A key and its value must be separated by a ':', but any
        delimiter (other than ':') may be used between fields.

        ``self.data`` maps each unique ID to a list of domain dictionaries,
        one per line, with the following key-value pairs

        start                : int (second field of the line)
        end                  : int (third field of the line)
        domain_type          : string (fourth field of the line)
        attributes           : empty dictionary if the line has no
                               key:value fields, otherwise the value
                               returned by
                               ``interface_tools.parse_key_value_pairs``

        Parameters
        ----------
        filename : str
            Path of the file to read.

        delimiter : str
            Field separator used to split each line (default is a tab).

        skip_bad : bool
            If True, a line whose required fields cannot be parsed
            triggers a warning and is skipped. If False such a line raises
            an InterfaceException. Default = True.

        Raises
        ------
        InterfaceException
            If ``delimiter`` is ':', or if a line cannot be parsed and
            ``skip_bad`` is False.

        """

        if delimiter == ':':
            raise InterfaceException(
                'When parsing domain file cannot use ":" as a delimiter because this is used to delimit key/value pairs (if provided)'
            )

        with open(filename, 'r') as fh:
            all_lines = fh.readlines()

        uid_to_domains = {}

        # line numbers are 1-based so they can be quoted in messages
        for line_number, raw_line in enumerate(all_lines, start=1):

            fields = raw_line.strip().split(delimiter)

            # required fields: a short line or a non-integer position
            # ends up in the handler below
            try:
                unique_ID = fields[0].strip()
                start = int(fields[1].strip())
                end = int(fields[2].strip())
                domain_type = fields[3].strip()
            except Exception as e:

                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, line_number, str(e), raw_line)

                if not skip_bad:
                    raise InterfaceException(msg)

                shephard_exceptions.print_warning(
                    msg + "\nSkipping this line...")
                continue

            # optional key/value fields follow the four required ones
            if len(fields) > 4:
                attributes = interface_tools.parse_key_value_pairs(
                    fields[4:], filename, line_number, raw_line)
            else:
                attributes = {}

            domain_entry = {
                'start': start,
                'end': end,
                'domain_type': domain_type,
                'attributes': attributes
            }

            # several domains may share one unique ID
            uid_to_domains.setdefault(unique_ID, []).append(domain_entry)

        self.data = uid_to_domains
Пример #8
0
def add_domains_from_dictionary(proteome,
                                domain_dictionary,
                                autoname=False,
                                safe=True,
                                verbose=True):
    """
    Function that takes a correctly formatted Domains dictionary and will
    add those domains to the proteins in the Proteome.

    Domains dictionaries are key-value pairs, where the key is a unique_ID
    associated with a given protein, and the value is a list of
    dictionaries. Each subdictionary has four key-value pairs:

    'start' = start position (int showing start of the domain, starting at 1)
    'end' = end position (int showing end of the domain, inclusive)
    'domain_type' = domain type (string that names the domain)
    'attributes' = dictionary of arbitrary key:value pairings (optional)

    The start and end positions should be locations within the sequence
    defined by the unique_ID. Domain type is a string that names the type
    of domain. The attributes dictionary is optional; when it is missing
    an empty dictionary is used.

    In this way, each domain that maps to a given unique_ID will be added.

    Parameters
    ----------
    proteome : Proteome object
        Proteome object to which domains will be added

    domain_dictionary : dict
        Dictionary that maps unique_IDs to a list of one or more domain
        dictionaries

    autoname : bool
        Passed through to ``protein.add_domain``. If autoname is set to
        True, each domain should ALWAYS get a unique name - i.e. this
        allows for multiple domains to be perfectly overlapping in
        position and type. In general we want to avoid this as it makes it
        easy to include duplicates, which by default are prevented when
        autoname = False.
        Default = False.

    safe : bool
        Passed through to ``protein.add_domain``. If a ProteinException or
        DomainException is raised while adding a domain and safe is True,
        the error is handed to
        ``shephard_exceptions.print_and_raise_error``. If safe is False
        the domain in question is skipped. It is highly recommended that
        if you choose to use safe=False you also set verbose=True.
        Default = True.

    verbose : bool
        If True (and safe is False) a warning is printed for every domain
        that is skipped.

    Returns
    -----------
    None
        No return value, but domains are added to the Proteome object
        passed as the first argument

    """
    # Note - safe is forwarded unchanged to add_domain; this function then
    # decides, together with verbose, what to do with any exception that
    # comes back.

    # check first argument is a proteome
    interface_tools.check_proteome(proteome, 'add_domains (si_domains)')

    for protein in proteome:

        # only proteins with an entry in the dictionary are touched
        if protein.unique_ID not in domain_dictionary:
            continue

        for domain in domain_dictionary[protein.unique_ID]:

            start = domain['start']
            end = domain['end']
            domain_type = domain['domain_type']

            # attributes are optional; a missing key is the only
            # anticipated failure, so nothing broader is caught here
            try:
                ad = domain['attributes']
            except KeyError:
                ad = {}

            # try and add the domain...
            try:
                protein.add_domain(start,
                                   end,
                                   domain_type,
                                   attributes=ad,
                                   safe=safe,
                                   autoname=autoname)
            except (ProteinException, DomainException) as e:

                # %s rather than %i so that building the message cannot
                # itself fail (and hide e) when a position is not a number
                msg = '- skipping domain at %s-%s on %s' % (start, end,
                                                            protein)
                if safe:
                    shephard_exceptions.print_and_raise_error(msg, e)
                elif verbose:
                    shephard_exceptions.print_warning(msg)
Пример #9
0
def add_sites_from_dictionary(proteome,
                              sites_dictionary,
                              safe=True,
                              verbose=False):
    """
    Function that takes a correctly formatted Sites dictionary and will
    add those sites to the proteins in the Proteome.

    Sites dictionaries are key-value pairs, where the key is a unique_ID
    associated with a given protein, and the value is a list of
    dictionaries. Each subdictionary has the following elements

    'position' = site position
    'site_type' = site type
    'symbol' = site symbol
    'value' = site value
    'attributes' = site attribute dictionary (optional)

    In this way, each site that maps to a given unique_ID will be added to
    the associated protein.

    Parameters
    -------------

    proteome : Proteome
        Proteome object to which we're adding sites. Note that ONLY sites
        for which a protein is found will be used. Protein-Site
        cross-referencing is done using the protein's unique_ID, which
        should be the key used in the sites_dictionary

    sites_dictionary : dict
        A sites dictionary maps a unique_ID to a list of dictionaries,
        where each subdictionary provides information on one site as
        key-value pairs, specifically:

        'position' = site position
        'site_type' = site type
        'symbol' = site symbol
        'value' = site value
        'attributes' = site attribute dictionary

        Position and value are passed on as provided; no casting is done
        in this function. If 'attributes' is missing an empty dictionary
        is used.

        Extra key-value pairs in each sub-dictionary are ignored

    safe : boolean
        If a ProteinException is raised while adding a site and safe is
        True, the error is handed to
        ``shephard_exceptions.print_and_raise_error``. If safe is False
        the site in question is skipped. It is highly recommended that if
        you choose to use safe=False you also set verbose=True
        Default = True

    verbose : boolean
        If True (and safe is False) a warning is printed for every site
        that is skipped. Default = False

    Returns
    ---------
    None
        No return value, but adds all of the passed sites to the protein

    Raises
    ---------
    InterfaceException
        If one of the required entries cannot be read from a site
        dictionary.

    """

    for protein in proteome:

        # only proteins with an entry in the dictionary are touched
        if protein.unique_ID not in sites_dictionary:
            continue

        for site in sites_dictionary[protein.unique_ID]:

            try:
                position = site['position']
                site_type = site['site_type']
                symbol = site['symbol']
                value = site['value']

                # attributes are optional; only a missing key is tolerated
                try:
                    ad = site['attributes']
                except KeyError:
                    ad = {}
            except Exception:
                raise InterfaceException(
                    'When sites dictionary for key [%s] was unable to extract five distinct parametes. Entry is:\n%s\n'
                    % (protein.unique_ID, site))

            # assuming we can read all the params try and add the site
            try:
                protein.add_site(position,
                                 site_type,
                                 symbol,
                                 value,
                                 attributes=ad)

            except ProteinException as e:
                # position is not cast in this function, so it may still be
                # a string; %s keeps message construction from raising a
                # TypeError that would hide e
                msg = '- skipping site %s at %s on %s' % (
                    site_type, position, protein)
                if safe:
                    shephard_exceptions.print_and_raise_error(msg, e)
                elif verbose:
                    shephard_exceptions.print_warning(msg)
Пример #10
0
    def __init__(self, filename, delimiter='\t', skip_bad=True):
        """
        Read a delimited sites file and store the parsed result on
        ``self.data``.

        Each line is expected to have the following format:

        Unique_ID, position, site type, symbol, value, key1:value1, key2:value2, ..., keyn:valuen

        The first five fields are required, while the key:value pairs are
        optional. A key and its value must be separated by a ':', but any
        delimiter (other than ':') may be used between fields.

        ``self.data`` maps each unique ID to a list of site dictionaries,
        one per line, with the keys 'position' (int), 'site_type',
        'symbol', 'value' (float) and 'attributes'. 'attributes' is an
        empty dictionary if the line has no key:value fields, otherwise it
        is the value returned by
        ``interface_tools.parse_key_value_pairs``.

        Parameters
        ----------
        filename : str
            Path of the file to read.

        delimiter : str
            Field separator used to split each line (default is a tab).

        skip_bad : bool
            If True, a line whose required fields cannot be parsed
            triggers a warning and is skipped. If False such a line raises
            an InterfaceException. Default = True.

        Raises
        ------
        InterfaceException
            If ``delimiter`` is ':', or if a line cannot be parsed and
            ``skip_bad`` is False.

        """

        if delimiter == ':':
            raise InterfaceException(
                'When parsing site file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)'
            )

        with open(filename, 'r') as fh:
            all_lines = fh.readlines()

        uid_to_sites = {}

        # line numbers are 1-based so they can be quoted in messages
        for line_number, raw_line in enumerate(all_lines, start=1):

            fields = raw_line.strip().split(delimiter)

            # required fields: a short line or a non-numeric position or
            # value ends up in the handler below
            try:
                unique_ID = fields[0].strip()
                position = int(fields[1].strip())
                site_type = fields[2].strip()
                symbol = fields[3].strip()
                value = float(fields[4].strip())
            except Exception as e:
                msg = 'Failed parsing file [%s] on line [%i].\n\nException raised: %s\n\nline printed below:\n%s' % (
                    filename, line_number, str(e), raw_line)

                if not skip_bad:
                    raise InterfaceException(msg)

                shephard_exceptions.print_warning(
                    msg + "\nSkipping this line...")
                continue

            # optional key/value fields follow the five required ones
            if len(fields) > 5:
                attributes = interface_tools.parse_key_value_pairs(
                    fields[5:], filename, line_number, raw_line)
            else:
                attributes = {}

            site_entry = {
                'position': position,
                'site_type': site_type,
                'symbol': symbol,
                'value': value,
                'attributes': attributes
            }

            # several sites may share one unique ID
            uid_to_sites.setdefault(unique_ID, []).append(site_entry)

        self.data = uid_to_sites