示例#1
0
    def __init__(self, organism='H**o sapiens', verbose=True, cache=False):
        """.. rubric:: Constructor

        :param str orgamism: the organism to look at. H**o sapiens
            is the default. Other possible organisms can be found
            in :attr:`organisms`.
        :param str verbose: a verbose level in ERROR/DEBUG/INFO/WARNING
            compatible with those used in BioServices.

        """
        super(Complexes, self).__init__(level=verbose)

        self.devtools = DevTools()
        self.webserv = IntactComplex(verbose=verbose, cache=cache)
        df = self.webserv.search('*', frmt='pandas')
        self.df = df

        #: list of valid organisms found in the database
        self.valid_organisms = list(set(df['organismName']))
        self.valid_organisms = [x.split(';')[0] for x in self.valid_organisms]


        #: list of valid organisms found in the database
        self.organisms = list(set(df['organismName']))
        self._organism = None
        if organism in self.organisms:
            self.organism = organism
        else:
            print("Organism not set yet. ")

        # This will populated on request as a cache/buffer
        self._details = None
        self._complexes = None
示例#2
0
    def __init__(self, organism="H**o sapiens", cache=False):
        """.. rubric:: Constructor

        :param str orgamism: the organism to look at. H**o sapiens
            is the default. Other possible organisms can be found
            in :attr:`organisms`.

        """
        self.logging = Logging()

        self.devtools = DevTools()
        self.webserv = IntactComplex(verbose=verbose, cache=cache)
        df = self.webserv.search("*", frmt="pandas")
        self.df = df

        #: list of valid organisms found in the database
        self.valid_organisms = list(set(df["organismName"]))
        self.valid_organisms = [x.split(";")[0] for x in self.valid_organisms]

        #: list of valid organisms found in the database
        self.organisms = list(set(df["organismName"]))
        self._organism = None
        if organism in self.organisms:
            self.organism = organism
        else:
            print("Organism not set yet. ")

        # This will populated on request as a cache/buffer
        self._details = None
        self._complexes = None
示例#3
0
class Complexes(Logging):
    """Manipulate complexes of Proteins

    This class uses Intact Complex database to extract information about
    complexes of proteins. 

    When creating an instance, the default organism is "H**o sapiens". 
    The organism can be set to another one during the instanciation or later::

        >>> from biokit.network.complexes import Complexes
        >>> c = Complexes(organism='H**o sapiens')
        >>> c.organism = 'Rattus norvegicus'

    Valid organisms can be found in :attr:`organisms`. When changing the
    organism, a request to the Intact database is sent, which may take some
    time to update. Once done, information related to  this organism is stored
    in the :attr:`df` attribute, which is a Pandas dataframe. It 
    contains 4 columns. Here is for example one row::

        complexAC                                             EBI-2660609
        complexName                            COP9 signalosome variant 1
        description     Essential regulator of the ubiquitin (Ubl) con...
        organismName                                   H**o sapiens; 9606

    This is basic information but once a complex accession (e.g., EBI-2660609)
    is known, you can retrieve detailled information. This is done
    automatically for all the accession when needed. The first time, it will 
    take a while (20 seconds for 250 accession) but will be cache for this 
    instance. 

    The :attr:`complexes` contains all details about the entries found in
    :attr:`df`. It is a dictionary where keys are the complex accession. For 
    instance::

        >>> c.complexes['EBI-2660609']

    In general, one is interested in the participants of the complex, that is
    the proteins that form the complex. Another attribute is set for you::

        >>> c.participants['EBI-2660609']

    Finally, you may even want to obtain just the identifier of the participants
    for each complex. This is stored in the :attr:`identifiers`::

        >>> c.identifiers['EBI-2660609']

    Note however, that the identifiers are not neceseraly uniprot identifiers.
    Could be ChEBI or sometimes even set to None. The :meth:`strict_filter`
    removes the complexes with less than 2 (strictly) uniprot identifiers.
    
    Some basic statistics can be printed with :meth:`stats` that indeticates
    the number of complexes, number of identifiers in those complexes ,and
    number of unique identifiers. A histogram of number of appearance of each 
    identifier is also shown.

    The :meth:`hist_participants` shows the number of participants per complex.

    Finally, the meth:`search_complexes` can be used in the context of 
    logic modelling to infer the AND gates from a list of uniprot identifiers
    provided by the user. See :meth:`search_complexes` for details.

    Access to the Intact Complex database is performed using the 
    package BioServices provided in Pypi.
    """
    def __init__(self, organism='H**o sapiens', verbose=True, cache=False):
        """.. rubric:: Constructor

        :param str orgamism: the organism to look at. H**o sapiens
            is the default. Other possible organisms can be found
            in :attr:`organisms`.
        :param str verbose: a verbose level in ERROR/DEBUG/INFO/WARNING
            compatible with those used in BioServices.

        """
        super(Complexes, self).__init__(level=verbose)

        self.devtools = DevTools()
        self.webserv = IntactComplex(verbose=verbose, cache=cache)
        df = self.webserv.search('*', frmt='pandas')
        self.df = df

        #: list of valid organisms found in the database
        self.valid_organisms = list(set(df['organismName']))
        self.valid_organisms = [x.split(';')[0] for x in self.valid_organisms]


        #: list of valid organisms found in the database
        self.organisms = list(set(df['organismName']))
        self._organism = None
        if organism in self.organisms:
            self.organism = organism
        else:
            print("Organism not set yet. ")

        # This will populated on request as a cache/buffer
        self._details = None
        self._complexes = None

    def _get_organism(self):
        return self._organism
    def _set_organism(self, organism):
        self.devtools.check_param_in_list(organism, [str(x.split(";")[0]) for x in self.valid_organisms])
        self._organism = organism

        self.df = self.webserv.search('*', frmt='pandas',
                filters='species_f:("%s")' % self.organism)
        self._complexes = None
    organism = property(_get_organism, _set_organism, 
        doc="Getter/Setter of the organism")

    def hist_participants(self):
        """Histogram of the number of participants per complexes

        :return: a dictionary with complex identifiers as keys and
            number of participants as values

        ::

            from biokit.network.complexes import Complexes
            c = Complexes()
            c.hist_participants()

        """
        N = []
        count = {}
        for i, identifier in enumerate(self.complexes.keys()):
            n = len(self.complexes[identifier]['participants'])
            N.append(n)
            count[identifier] = n

        _ = pylab.hist(N, bins=range(0, max(N)))
        pylab.title('Number of participants per complex')
        pylab.grid()
        return count

    def stats(self):
        """Prints some stats about the number of complexes and histogram of the
        number of appearances of each species"""
        species = []
        for k in self.participants.keys():
            species.extend([x['identifier'] for x in self.participants[k]])
            N = []
        for spec in set(species):
            N.append(species.count(spec))
        _ = pylab.hist(N, bins=range(0, max(N)))
        pylab.title("Number of appaerances of each species")
        pylab.grid()
        print("""There are %s complexes involving %s participants with %s unique species. """ %
                 (len(self.complexes), len(species), len(set(species))))

    def _get_participants(self):
        participants = {}
        for k,v in self.complexes.items():
            participants[k] = v['participants']
        return participants
    participants = property(_get_participants, 
        doc="""Getter of the complex participants (full details)""")

    def _get_identifiers(self):
        identifiers = {}
        for k,v in self.participants.items():
            identifiers[k] = [x['identifier'] for x in v]
        return identifiers
    identifiers = property(_get_identifiers,
            doc="""Getter of the identifiers of the complex participants""")

    def _get_complexes(self):
        if self._complexes is None:
            self._load_complexes()
        return self._complexes.copy()
    complexes = property(_get_complexes,
        doc="""Getter of the complexes (full details""")

    def _load_complexes(self, show_progress=True):
        from easydev import Progress
        import time
        pb = Progress(len(self.df.complexAC))
        complexes = {}
        self.logging.info("Loading all details from the IntactComplex database")
        for i, identifier in enumerate(self.df.complexAC):
            res = self.webserv.details(identifier)
            complexes[identifier] = res
            if show_progress:
                pb.animate(i+1)
        self._complexes = complexes

    def remove_homodimers(self):
        """Remove identifiers that are None or starts with CHEBI
        and keep complexes that have at least 2 participants
        
        
        :return: list of complex identifiers that have been removed.
        """


        # None are actually h**o dimers
        toremove = []
        for k,this in self.identifiers.items():
            remains = [x for x in this if x is not None]
            if len(remains)<=1:
                toremove.append(k)
        self.logging.info("removing %s homodimers complexes" % len(toremove))
        for this in toremove:
            del self._complexes[this]
        return toremove

    def search_complexes(self, user_species, verbose=False):
        """Given a list of uniprot identifiers, return complexes and
            possible complexes.

        :param list user_species: list of uniprot identifiers to be
            found in the complexes
        :return: two dictionaries. First one contains the complexes
            for which all participants have been found in the user_species
            list. The second one contains complexes for which some participants
            (not all) have been found in the user_species list.

        """
        level = self.debugLevel[:]
        if verbose:
            self.debugLevel = 'INFO'
        else:
            self.debugLevel = 'ERROR'

        and_gates = {}
        candidates = {}

        identifiers = self.identifiers.values()

        for k, identifiers in self.identifiers.items():

            # get rid of suffixes such as -1 or -PRO_xxx
            prefixes = [x.split("-")[0] if x is not None else x for x in identifiers]


            # You may have a complex with ['P12222', 'P33333-PRO1',
            # 'P33333-PRO2'], in which case P33333 is found only once and
            # thereofre the final number of found participants is not the length
            # of the complexes...so we need to get rid of the duplicates if any
            prefixes = list(set(prefixes))
            N = len(prefixes)
            found = [spec for spec in user_species if spec in prefixes]

            if len(found) == N:
                self.logging.info('Found entire complex %s ' % k)
                and_gates[k] = identifiers[:]
            elif len(found) >= 1:
                self.logging.info('Found partial complex %s with %s participants out of %s' % 
                        (k, len(found), len(identifiers)))
                candidates[k] = {'participants': identifiers, 'found': found}
        self.debugLevel = level[:]
        return and_gates, candidates

    def search(self, name):
        """Search for a unique identifier (e.g. uniprot) in all complexes

        :return: list of complex identifiers where the name was found
        """
        found = []
        for k, identifiers in self.identifiers.items():
            prefixes = [x.split("-")[0] if x is not None else x for x in identifiers ]
            if name in prefixes:
                self.logging.info("Found %s in complex %s (%s)" % (name, k,
                    identifiers))
                found.append(k)
        return found

    def chebi2name(self, name):
        """Return the ASCII name of a CHEBI identifier"""
        from bioservices import ChEBI
        c = ChEBI()
        name = dict(c.getLiteEntity(name)[0])['chebiAsciiName']
        return name

    def uniprot2genename(self, name):
        """Return the gene names of a UniProt identifier"""
        from bioservices import UniProt
        c = UniProt(cache=True)

        try:
            res = pd.read_csv(StringIO(c.search(name, limit=1)), sep='\t')
            return list(res['Gene names'].values)
        except:
            print("Could not find %s" % name)

    def report(self, species):
        complete, partial = self.search_complexes(species, verbose=False)
        res = {'Found':[], 'Participants':[], 'Complete':[],
                'Identifier':[], 'Number found':[], 'Number of participants':[],
                'Name':[]}

        for k, v in complete.items():
            res['Name'].append(self.complexes[k]['name'])
            res['Found'].append(";".join(v))
            res['Number found'].append(len(v))
            res['Participants'].append(";".join(self.identifiers[k]))
            res['Number of participants'].append(len(self.identifiers[k]))
            res['Complete'].append(True)
            res['Identifier'].append(k)

        for k, v in partial.items():
            res['Name'].append(self.complexes[k]['name'])
            res['Found'].append(";".join(v['found']))
            res['Number found'].append(len(v['found']))
            res['Participants'].append(";".join(self.identifiers[k]))
            res['Number of participants'].append(len(self.identifiers[k]))
            res['Complete'].append(False)
            res['Identifier'].append(k)

        res = pd.DataFrame(res, columns=['Found', 'Participants', 'Identifier', 'Name', 'Number found', 
            'Number of participants', 'Complete'])
        return res
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 11 00:40:21 2017

@author: moghb
"""
import json
import requests
import logging
from bioservices import IntactComplex
import os
your_project_directory="C:\\Users\\moghb\\OneDrive\\Documents\\Database\\expert_schema"

s = IntactComplex()
r=s.search('nup53')['elements'][0]['complexAC']

logger = logging.getLogger(__name__)
url ="https://www.ebi.ac.uk/intact/complex-ws/details/"+r+"?format=json"

def do_request_complexdb( ):
    resp = requests.get(url)
    logger.debug('Requested {}'.format(resp.url))

    if resp.status_code == 200:
        return resp
    else:
        raise Exception(resp)

j = json.loads(do_request_complexdb().text)

with open(os.path.join(your_project_directory, 'data.txt'), 'w') as outfile: