示例#1
0
def kegg_to_symbol_through_uniprot(unknown_genes):
    """Map KEGG gene identifiers to gene names through UniProt.

    The KEGG ids are first mapped to UniProt accessions in a single
    ``mapping`` call; every accession is then looked up individually with
    ``search`` and its preferred gene name is kept when the entry is
    flagged as ``reviewed``.

    :param unknown_genes: iterable of KEGG gene identifiers (it is iterated
        twice, so it must not be a one-shot iterator).
    :return: dict mapping a KEGG id to a gene name. Ids for which no name
        could be resolved are absent from the dict; they are counted and
        printed instead.
    """
    # create string to call uniprot for mapping
    search_string = '\t'.join(unknown_genes)
    kegg_to_gene_name = dict()
    missing = set()
    uniprot = UniProt(verbose=True)
    # This is where it gets tricky. Checking to see if there is a uniprot
    # mapping for the species, if not, trying from KEGG side. Sometimes
    # kegg links to a different uniprot, or uniprot links to a diff kegg.
    uni_dict = dict(uniprot.mapping("KEGG_ID", "ACC", query=search_string))
    for i in unknown_genes:
        for n in uni_dict.get(i, ()):
            x = uniprot.search("accession:{}".format(n),
                               columns='genes(PREFERRED),reviewed,id',
                               limit=1)
            try:
                # Expected shape: one header line followed by one data row.
                _, data = x.rstrip('\n').split('\n')
                name, review, entry = data.split('\t')
            except (AttributeError, ValueError):
                # Empty answer, extra rows, missing columns or a non-string
                # response: report it and carry on with the next accession
                # instead of aborting the whole run.
                print(i, n, x, "unexpected search result")
                continue
            if n != entry:
                print(i, n, entry, x, "dont match")
            elif review == 'reviewed':
                # A later reviewed accession overwrites an earlier one.
                kegg_to_gene_name[i] = name
        # Anything that did not end up with a name is reported as missing,
        # whether UniProt had no accession for it or none of its
        # accessions yielded a reviewed, matching entry.
        if i not in kegg_to_gene_name:
            missing.add(i)
    print("{} mappings not found from kegg to"
          " gene name".format(len(missing)))
    print(missing)
    return kegg_to_gene_name
示例#2
0
def kegg_to_symbol_through_uniprot(unknown_genes):
    """Translate KEGG gene ids into gene names using UniProt as a bridge.

    One ``mapping`` request converts all KEGG ids to UniProt accessions.
    Each accession is then queried with ``search``; the preferred gene name
    is stored when the returned entry matches the accession and is marked
    ``reviewed``.

    :param unknown_genes: iterable of KEGG gene identifiers (iterated
        twice, so a one-shot iterator will not work).
    :return: dict of KEGG id -> gene name. Ids without a resolved name are
        left out of the dict and reported on stdout.
    """
    # create string to call uniprot for mapping
    search_string = '\t'.join(unknown_genes)
    kegg_to_gene_name = dict()
    missing = set()
    uniprot = UniProt(verbose=True)
    # This is where it gets tricky. Checking to see if there is a uniprot
    # mapping for the species, if not, trying from KEGG side. Sometimes
    # kegg links to a different uniprot, or uniprot links to a diff kegg.
    uni_dict = dict(uniprot.mapping("KEGG_ID", "ACC", query=search_string))
    for i in unknown_genes:
        for n in uni_dict.get(i, ()):
            x = uniprot.search("accession:{}".format(n),
                               columns='genes(PREFERRED),reviewed,id',
                               limit=1)
            try:
                # The answer should be exactly a header line plus one row.
                header, data = x.rstrip('\n').split('\n')
                name, review, entry = data.split('\t')
            except (AttributeError, ValueError):
                # No hit, several rows, a short row or a non-string reply:
                # log it and move on rather than crash on the unpacking.
                print(i, n, x, "unexpected search result")
                continue
            if n != entry:
                print(i, n, entry, x, "dont match")
            elif review == 'reviewed':
                # The last reviewed accession wins when there are several.
                kegg_to_gene_name[i] = name
        # Report every id that ended up without a gene name, not only the
        # ones UniProt had no accession for.
        if i not in kegg_to_gene_name:
            missing.add(i)
    print("{} mappings not found from kegg to"
          " gene name".format(len(missing)))
    print(missing)
    return kegg_to_gene_name
示例#3
0
    def test_extract_protein_interactions_kgml(self, kgml_file,
                                               expected_no_rel):
        """Check how many relations are extracted from a KGML fixture.

        ``kgml_file`` is resolved relative to the directory of this test
        module; ``expected_no_rel`` is the expected length of the value
        returned by ``extract_protein_interactions_kgml``.
        """
        # Arrange
        extractor = KeggProteinInteractionsExtractor()
        module_dir = os.path.dirname(os.path.realpath(__file__))
        fixture_path = os.path.join(module_dir, kgml_file)
        with open(fixture_path, 'r') as fixture:
            kgml_string = fixture.read()

        # Stand-in for the KEGG service
        kegg_stub = KEGG()
        extractor.kegg = kegg_stub

        # Whatever is asked, answer with the ko numbers that map to hsa
        # numbers (tab-separated pairs, one per line).
        ko_to_hsa = "\n".join(["ko:K00922\thsa:5293",
                               "ko:K00922\thsa:5291",
                               "ko:K02649\thsa:5295"])
        kegg_stub.link = MagicMock(return_value=ko_to_hsa)

        # Whatever is asked, answer with the hsa -> uniprot conversion.
        hsa_to_uniprot = {"hsa:5293": "up:B0LPE5"}
        kegg_stub.conv = MagicMock(return_value=hsa_to_uniprot)

        # Stand-in for the UniProt service
        uniprot_stub = UniProt()
        extractor.uniprot = uniprot_stub
        uniprot_to_genes = {"B0LPE5": ["gene1", "gene2"]}
        uniprot_stub.mapping = MagicMock(return_value=uniprot_to_genes)

        # Act
        relations = extractor.extract_protein_interactions_kgml(kgml_string)

        # Assert
        self.assertEqual(expected_no_rel, len(relations))
示例#4
0
def find_gene(prot_id):
    """Look up gene names for an EMBL identifier through UniProt.

    ``prot_id`` is mapped from "EMBL" to "ACC"; the first accession of the
    first mapped key is then searched in the ``uniparc`` database and the
    function returns immediately with that result.

    :param prot_id: identifier passed as the mapping query.
    :return: ``(key, genes)`` where ``genes`` is a list of the distinct
        non-empty ``;``-separated pieces of the search answer, or the
        string ``'none'`` when there are none. When the mapping yields no
        accession at all, ``(prot_id, 'none')`` is returned.
    """
    service = UniProt(verbose=False)
    accessions_by_key = service.mapping("EMBL", "ACC", query=prot_id)
    for key, values in accessions_by_key.items():
        for value in values:
            table = service.search(value, frmt="tab", limit=3,
                                   columns="genes", database='uniparc')

            # The first 11 characters are dropped before splitting --
            # presumably the column header line; TODO confirm.
            pieces = set(table[11:].split(';'))
            genes = [piece for piece in pieces if piece and piece != '\n']

            # Only the very first accession is ever inspected.
            return key, (genes if genes else 'none')
    return prot_id, 'none'
示例#5
0
class PSICQUIC(REST):
    """Interface to the `PSICQUIC <http://code.google.com/p/psicquic/>`_ service

    There are 2 interfaces to the PSICQUIC service (REST and WSDL) but we use
    the REST one only.


    This service provides a common interface to more than 25 other services
    related to proteins, so we do not detail every possibility of this
    service. Here is an example that consists of looking for interactors of
    the protein ZAP70 within the IntAct database::

        >>> from bioservices import *
        >>> s = PSICQUIC()
        >>> res = s.query("intact", "zap70")
        >>> len(res) # there are 11 interactions found
        11
        >>> for x in res[1]:
        ...     print(x)
        uniprotkb:O95169
        uniprotkb:P43403
        intact:EBI-716238
        intact:EBI-1211276
        psi-mi:ndub8_human(display_long)|uniprotkb:NADH-ubiquinone oxidoreductase ASHI
        .
        .

    Here we have a list of entries. There are 15 of them (depending on
    the *output* parameter). The meaning of the entries is described on PSICQUIC
    website: https://code.google.com/p/psicquic/wiki/MITAB25Format . In short:


    #. Unique identifier for interactor A
    #. Unique identifier for interactor B.
    #. Alternative identifier for interactor A, for example the official gene
    #. Alternative identifier for interactor B.
    #. Aliases for A, separated by "|"
    #. Aliases for B.
    #. Interaction detection methods, taken from the corresponding PSI-MI
    #. First author surname(s) of the publication(s)
    #. Identifier of the publication
    #. NCBI Taxonomy identifier for interactor A.
    #. NCBI Taxonomy identifier for interactor B.
    #. Interaction types,
    #. Source databases and identifiers,
    #. Interaction identifier(s)
    #. Confidence score. Denoted as scoreType:value.



    Another example with reactome database::

        res = s.query("reactome", "Q9Y266")


    .. warning:: PSICQUIC gives access to 25 other services. We cannot create
        a dedicated parsing for all of them. So, the :meth:`query` method
        returns the raw data. Additional classes may provide dedicated
        parsing in the future.

    .. seealso:: :class:`bioservices.biogrid.BioGRID`
    """

    _formats = ["tab25", "tab26", "tab27", "xml25", "count", "biopax", "xgmml",
        "rdf-xml", "rdf-xml-abbrev", "rdf-n3", "rdf-turtle"]


    # Database prefixes found in MITAB identifiers -> database name handed to
    # the UniProt mapping call. A value of None means that no UniProt request
    # is made and the identifier is kept as "db:id".
    # note the typo in "genbank indentifier" (from bind DB)
    _mapping_uniprot = {"genbank indentifier": "P_GI",
        'entrezgene/locuslink':"P_ENTREZGENEID",
        'uniprotkb': "ACC+ID",
        'rcsb pdb':"PDB_ID",
        'ensembl':"ENSEMBL_ID",
        'refseq':"P_REFSEQ_AC",
        'hgnc':'HGNC_ID',
        "kegg": "KEGG_ID",
        "entrez gene/locuslink": "P_ENTREZGENEID",
        "ddbj/embl/genbank": "EMBL_ID",
        "dip": "DIP_ID",
        "ensemblgenomes": "ENSEMBLGENOME_ID",
        "omim":"MIM_ID",
        "chebi": None,
        # "chembl" used to be listed twice ("CHEMBL_ID", then None); the
        # later None always won, so only that effective value is kept.
        "chembl": None,
        #        "intact": None
        }

    # unknown: hprd, omim, bind, bind complexid, mdl,

    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: print informative messages

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()

        """
        urlStr = 'http://www.ebi.ac.uk/Tools/webservices/psicquic'
        super(PSICQUIC, self).__init__("PSICQUIC", verbose=verbose, url=urlStr)
        self._registry = None

        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            # Keep the attribute defined so that later failures are explicit
            # rather than a missing-attribute error.
            self.uniprot = None
            self.logging.warning("UniProt service could not be initialised")
        self.buffer = {}

    def _get_formats(self):
        return PSICQUIC._formats
    formats = property(_get_formats, doc="Returns the possible output formats")

    def _get_active_db(self):
        names = self.registry_names
        actives = self.registry_actives
        return [x.lower() for x, y in zip(names, actives) if y == "true"]
    activeDBs = property(_get_active_db, doc="returns the active DBs only")

    def read_registry(self):
        """Reads and returns the active registry

        :return: the text answer of the registry, split on whitespace.
        """
        url = 'registry/registry?action=ACTIVE&format=txt'
        res = self.http_get(url, frmt='txt')
        return res.split()

    def print_status(self):
        """Prints the services that are available

        :return: Nothing

        The output is tabulated, one service per line, sorted by service
        name. The columns are:

        * names
        * active
        * count
        * version
        * rest URL
        * soap URL
        * rest example
        * restricted

        .. seealso:: If you want the data into lists, see all attributes
            starting with registry such as :meth:`registry_names`
        """
        names = self.registry_names
        counts = self.registry_counts
        versions = self.registry_versions
        actives = self.registry_actives
        resturls = self.registry_resturls
        soapurls = self.registry_soapurls
        restexs = self.registry_restexamples
        restricted = self.registry_restricted

        # The sorted order was computed but never used before; apply it.
        indices = sorted(range(len(names)), key=lambda k: names[k])

        for i in indices:
            print("%s\t %s\t %s\t %s\t %s %s %s %s\n" % (names[i], actives[i],
                counts[i], versions[i], resturls[i], soapurls[i], restexs[i], restricted[i]))


    # todo a property for the version of PISCQUIC

    def _get_registry(self):
        # The registry is fetched once and cached on the instance.
        if self._registry is None:
            url = 'registry/registry?action=STATUS&format=xml'
            res = self.http_get(url, frmt="xml")
            res = self.easyXML(res)
            self._registry = res
        return self._registry
    registry = property(_get_registry, doc="returns the registry of psicquic")

    def _registry_field(self, tag):
        """Return the text of the first *tag* element of every registered service."""
        return [x.findAll(tag)[0].text for x in self.registry.findAll("service")]

    def _get_registry_names(self):
        return self._registry_field('name')
    registry_names = property(_get_registry_names,
            doc="returns all services available (names)")

    def _get_registry_restricted(self):
        return self._registry_field('restricted')
    registry_restricted = property(_get_registry_restricted,
            doc="returns restricted status of services")

    def _get_registry_resturl(self):
        return self._registry_field('resturl')
    registry_resturls = property(_get_registry_resturl,
            doc="returns URL of REST services")

    def _get_registry_restex(self):
        return self._registry_field('restexample')
    registry_restexamples = property(_get_registry_restex,
            doc="returns REST example for each service")

    def _get_registry_soapurl(self):
        return self._registry_field('soapurl')
    registry_soapurls = property(_get_registry_soapurl,
            doc="returns URL of WSDL service")

    def _get_registry_active(self):
        return self._registry_field('active')
    registry_actives = property(_get_registry_active,
            doc="returns active state of each service")

    def _get_registry_count(self):
        return self._registry_field('count')
    registry_counts = property(_get_registry_count,
            doc="returns number of entries in each service")

    def _get_registry_version(self):
        # Unlike the other fields, a service may have no <version> element;
        # None is reported for those.
        versions = []
        for service in self.registry.findAll("service"):
            found = service.findAll("version")
            versions.append(found[0].text if found else None)
        return versions
    registry_versions = property(_get_registry_version,
            doc="returns version of each service")

    def query(self, service, query, output="tab25", version="current", firstResult=None, maxResults=None):
        """Send a query to a specific database

        :param str service: a registered service. See :attr:`registry_names`.
        :param str query: a valid query. Can be `*` or a protein name.
        :param str output: a valid format. See s._formats
        :param version: currently not used by this method.
        :param firstResult: if given, sent as the ``firstResult`` parameter.
        :param maxResults: if given, sent as the ``maxResults`` parameter.
        :return: for the ``tab*`` formats, a list with one list of
            tab-separated fields per line; for the xml formats, whatever
            the HTTP layer returns; otherwise the list of lines.
        :raises ValueError: if *service* is not an active, registered
            database.

        ::

            s.query("intact", "brca2", "tab27")
            s.query("intact", "zap70", "xml25")
            s.query("matrixdb", "*", "xml25")

        This is the programmatic approach to this website:

        http://www.ebi.ac.uk/Tools/webservices/psicquic/view/main.xhtml


        Another example consists in accessing the *string* database for fetching
        protein-protein interaction data of a particular model organism. Here we
        restrict the query to 100 results::

            s.query("string", "species:10090", firstResult=0, maxResults=100, output="tab25")

        # spaces are automatically converted

            s.query("biogrid", "ZAP70 AND species:9606")

        .. warning:: AND must be in big caps. Some databases are more permissive
            than others (e.g., intact accepts "and"). species must be a valid ID number. Again, some DBs are more
            permissive and may accept the name (e.g., human)

        To obtain the number of interactions in intact for the human species::

            >>> len(p.query("intact", "species:9606"))


        """
        if service not in self.activeDBs:
            raise ValueError("database %s not in active databases" % service)

        params = {}
        if output is not None:
            self.devtools.check_param_in_list(output, self.formats)
            params['format'] = output
        else:
            # placeholder so that the string tests below keep working
            output = "none"

        names = [x.lower() for x in self.registry_names]
        try:
            index = names.index(service)
        except ValueError:
            msg = "The service you gave (%s) is not registered. See self.registry_names" % service
            self.logging.error(msg)
            raise ValueError(msg)

        # get the base url according to the service requested
        resturl = self.registry_resturls[index]

        if firstResult is not None:
            params['firstResult'] = firstResult
        if maxResults is not None:
            params['maxResults'] = maxResults

        url = resturl  + 'query/' + query

        if "xml" in output:
            res = self.http_get(url, frmt="xml", params=params)
        else:
            res = self.http_get(url, frmt="txt", params=params)
            res = res.strip().split("\n")

        if output.startswith("tab"):
            res = self._convert_tab2dict(res)

        return res


    def _convert_tab2dict(self, data):
        """Split every line of *data* on tabs and return the list of field lists.

        https://code.google.com/p/psicquic/wiki/MITAB26Format
        """
        return [line.split("\t") for line in data]


    def queryAll(self, query, databases=None, output="tab25", version="current", firstResult=None, maxResults=None):
        """Same as query but runs on all active database

        :param list databases: database to query. Queries all active DB if not provided
        :return: dictionary where keys correspond to databases and values to the output of the query.
        :raises ValueError: if one of *databases* is not an active database.

        ::

            res = s.queryAll("ZAP70 AND species:9606")
        """
        import copy

        results = {}
        # evaluate the property once; every access rebuilds the list
        active = self.activeDBs
        if databases is None:
            databases = [x.lower() for x in active]

        for x in databases:
            if x not in active:
                raise ValueError("database %s not in active databases" % x)


        for name in databases:
            self.logging.warning("Querying %s" % name)
            res = self.query(name, query, output=output, version=version, firstResult=firstResult, maxResults=maxResults)
            # output may legitimately be None (see query)
            if output is not None and output.startswith("tab25"):
                # drop the empty rows produced by empty answers
                results[name] = [x for x in res if x!=[""]]
            else:
                results[name] = copy.copy(res)
        for name in databases:
            self.logging.info("Found %s in %s" % (len(results[name]), name))
        return results



    def getInteractionCounter(self, query):
        """Returns a dictionary with database as key and results as values

        :param str query: a valid query
        :return: a dictionary with databases as keys and number of entries as values

        Consider only the active database.

        """
        # get the active names only
        activeDBs = self.activeDBs[:]
        res = [(str(name), int(self.query(name, query, output="count")[0])) for name in activeDBs]
        return dict(res)

    def getName(self, data):
        """Return the first and the second field of every entry as two lists."""
        idsA = [x[0] for x in data]
        idsB = [x[1] for x in data]
        return idsA, idsB

    def _simplify_ids(self, ids, debug_template):
        """Reduce, in place, each ``DB1:ID1|DB2:ID2`` item of *ids* to a single ``db:id``.

        *debug_template* is the debug message (with one ``%s``) logged when
        none of the databases of an entry is known.
        """
        for i, entry in enumerate(ids):
            try:
                # split on the first ":" only so that identifiers that
                # themselves contain ":" (e.g. chebi:CHEBI:29036) are not
                # truncated
                pairs = [x.split(":", 1) for x in entry.split("|")]
                dbs = [pair[0] for pair in pairs]
                IDs = [pair[1] for pair in pairs]
            except IndexError:
                # at least one alternative has no "db:" prefix
                self.logging.info("Could not extract name from %s" % entry)
                ids[i] = "??:" + entry  # we add a : so that we are sure that a split(":") will work
                continue
            valid_dbs = [(db, ID) for db, ID in zip(dbs, IDs) if db in self._mapping_uniprot]
            # search for an existing DB
            if valid_dbs:
                ids[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1]
            else:
                self.logging.debug(debug_template % (entry))
                ids[i] = "?" + dbs[0] + ":" + IDs[0]

    def knownName(self, data):
        """Scan all entries (MITAB) and returns simplified version


        Each item in the input list is a mitab entry.
        The output is made of 2 lists corresponding to
        interactor A and B found in the mitab entries.

        elements in the input list takes the following forms::

            DB1:ID1|DB2:ID2
            DB3:ID3

        The | sign separates equivalent IDs from different databases.

        We want to keep only one. The first known database is kept. If in the
        list of DB:ID pairs no known database is found, then we keep the first
        one whatsoever, prefixed with "?". Entries that cannot be parsed at
        all are returned as "??:" followed by the original text.

        known databases are those available in the uniprot mapping tools.

        chembl and chebi IDs are kept unchanged.

        :return: tuple ``(idsA, idsB)`` of two lists of ``db:id`` strings.
        """


        self.logging.info("converting data into known names")
        # remove the " character that can be found in a few cases (e.g,
        # chebi:"CHEBI:29036")
        idsA = [x[0].replace("\"","") for x in data]
        idsB = [x[1].replace("\"", "") for x in data]

        # special case:
        # in mint, there is an entry that ends with a | uniprotkb:P17844|
        idsA = [x.strip("|") for x in idsA]
        idsB = [x.strip("|") for x in idsB]

        # the first ID
        self._simplify_ids(idsA, "none of the DB for this entry (%s) are available")
        # the second ID
        self._simplify_ids(idsB, "none of the DB (%s) for this entry are available")

        countA = len([x for x in idsA if x.startswith("?")])
        countB = len([x for x in idsB if x.startswith("?")])
        if countA+countB > 0:
            self.logging.warning("%s ids out of %s were not identified" % (countA+countB, len(idsA)*2))
            print(set([x.split(":")[0] for x in idsA if x.startswith("?")]))
            print(set([x.split(":")[0] for x in idsB if x.startswith("?")]))
        self.logging.info("knownName done")
        return idsA, idsB

    def preCleaning(self, data):
        """remove entries where IdA or IdB is set to "-"

        """
        ret = [x for x in data if x[0] !="-" and x[1]!="-"]
        return ret

    def postCleaningAll(self,data, keep_only="HUMAN", flatten=True, verbose=True):
        """Apply :meth:`postCleaning` to every database of *data*

        :param dict data: database name -> list of converted entries
        :param str keep_only: pattern forwarded to :meth:`postCleaning`
        :param bool flatten: if True, return one flat list of entries
            instead of a dictionary
        :param bool verbose: forwarded to :meth:`postCleaning`
        :return: a dictionary holding only the databases that still have
            entries, or the flat list of all their entries if *flatten*

        even more cleaning by ignoring score, db and interaction
        len(set([(x[0],x[1]) for x in retnew]))
        """
        results = {}
        for k in data.keys():
            self.logging.info("Post cleaning %s" % k)
            # forward the caller's pattern (it used to be forced to "HUMAN")
            ret = self.postCleaning(data[k], keep_only=keep_only, verbose=verbose)
            if len(ret):
                results[k] = ret
        if flatten:
            results = [x for k in results.keys() for x in results[k]]
        return results

    def postCleaning(self, data, keep_only="HUMAN", remove_db=("chebi","chembl"),
        keep_self_loop=False, verbose=True):
        """Remove entries with a None and keep only those with the keep pattern

        :param data: list of entries whose two first items are the
            identifiers of interactor A and B
        :param str keep_only: both identifiers must contain this text
        :param remove_db: entries with an identifier starting with one of
            these prefixes are removed
        :param bool keep_self_loop: keep entries where A equals B
        :param bool verbose: print the number of entries after each step
        :return: list of the remaining, de-duplicated entries (order is not
            preserved since a set is used)
        """
        if verbose:print("Before removing anything: ", len(data))

        data = [x for x in data if x[0] is not None and x[1] is not None]
        if verbose:print("After removing the None: ", len(data))

        # identifiers starting with "!" are those UniProt could not map
        data = [x for x in data if not x[0].startswith("!") and not x[1].startswith("!")]
        if verbose:print("After removing the !: ", len(data))


        for db in remove_db:
            data = [x for x in data if not x[0].startswith(db)]
            data = [x for x in data if not x[1].startswith(db)]
            if verbose:print("After removing entries that match %s : " % db, len(data))

        data = [x for x in data if keep_only in x[0] and keep_only in x[1]]
        if verbose:print("After removing entries that don't match %s : " % keep_only, len(data))

        if keep_self_loop is False:
            data = [x for x in data if x[0]!=x[1]]
            if verbose:print("After removing self loop : ", len(data))

        data = list(set(data))
        if verbose:print("After removing identical entries", len(data))

        return data

    def convertAll(self, data):
        """Apply :meth:`convert` to every database of the *data* dictionary."""
        results = {}
        for k in data.keys():
            self.logging.info("Analysing %s" % k)
            results[k] = self.convert(data[k], db=k)
        return results

    @staticmethod
    def _field_or_unknown(entry, index):
        """Return ``entry[index]`` or "?" when the entry is too short."""
        try:
            return entry[index]
        except IndexError:
            return "?"

    def convert(self, data, db=None):
        """Convert the identifiers of MITAB entries with :meth:`mappingOneDB`

        :param data: list of MITAB entries (lists of fields)
        :param db: label stored as the last item of every result
        :return: list of tuples ``(idA, idB, score, interaction, ref, db)``
            where score, interaction and ref are fields 14, 11 and 8 of the
            entry, or "?" when the entry has no such field.
        """
        self.logging.debug("converting the database %s" % db)
        idsA, idsB = self.knownName(data)
        mapping = self.mappingOneDB(data)
        results = []
        for entry, idA, idB in zip(data, idsA, idsB):
            xp = mapping[idA.split(":",1)[1]]
            yp = mapping[idB.split(":",1)[1]]
            ref = self._field_or_unknown(entry, 8)
            score = self._field_or_unknown(entry, 14)
            interaction = self._field_or_unknown(entry, 11)
            results.append((xp, yp, score, interaction, ref, db))
        return results

    def _flush_query(self, k, pending, mapping, counter, N):
        """Resolve the *pending* ids of database *k* and store them in *mapping*."""
        this_query = list(pending)
        DBname = self._mapping_uniprot[k]

        if DBname is None:
            # no UniProt equivalent: keep the identifier as it is
            for x in this_query:
                mapping[x] = k + ":" + x
            return

        self.logging.warning("Request sent to uniprot for %s database (%s/%s)" % (DBname, counter, N))
        res = self.uniprot.mapping(fr=DBname, to="ID", query=" ".join(this_query))
        for x in this_query:
            if x not in res: #was not found
                mapping[x] = "!" + k+":"+x
            else:
                if len(res[x]) > 1:
                    self.logging.warning("psicquic mapping found more than 1 id. keep first one")
                mapping[x] = res[x][0]

    def mappingOneDB(self, data):
        """Build a dictionary that maps every identifier of *data* to its converted form

        Identifiers are first simplified with :meth:`knownName`. Then:

        * identifiers of an unknown database (prefix starting with "?")
          are mapped to their own ``db:id`` text;
        * identifiers of a database whose UniProt name is None are mapped
          to ``db:id``;
        * the others are sent to the UniProt mapping service in batches
          (a batch is sent once more than 2000 ids are pending and at the
          last entry). Ids that are not in the answer are mapped to
          ``"!" + db:id``; otherwise the first returned id is kept.

        :param data: list of MITAB entries
        :return: dictionary id (without database prefix) -> converted id
        """
        self.logging.debug("converting IDs with proper DB name (knownName function)")
        entriesA, entriesB = self.knownName(data) # idsA and B contains list of a single identifier of the form db:id
        # the db is known from _mapping.uniprot otherwise it starts with "?"

        # One bucket of pending ids per known database. Unknown ("?") DBs
        # are left out from the start: deleting them afterwards while
        # iterating over the dictionary fails on Python 3.
        query = {}
        for entry in entriesA + entriesB:
            db = entry.split(":", 1)[0]
            if not db.startswith("?"):
                query[db] = set()

        # the data to store
        mapping = {}
        N = len(data)

        # scan all entries
        for counter, pair in enumerate(zip(entriesA, entriesB), 1):
            for entry in pair:
                # split once only: the id part may itself contain ":"
                db, ID = entry.split(":", 1)
                if ID in mapping:
                    continue
                if db.startswith("?"):
                    mapping[ID] = entry
                else:
                    query[db].add(ID)

            for k in query:
                pending = query[k]
                # nothing to send for this database (avoids empty requests)
                if not pending:
                    continue
                if len(pending)>2000 or counter == N:
                    self._flush_query(k, pending, mapping, counter, N)
                    query[k] = set()

        # sanity check: everything must have been flushed at the last entry
        for k in query.keys():
            assert len(query[k])==0
        return mapping
示例#6
0
class PSICQUIC(RESTService):
    """Interface to the `PSICQUIC <http://code.google.com/p/psicquic/>`_ service

    There are 2 interfaces to the PSICQUIC service (REST and WSDL) but we used
    the REST only.


    This service provides a common interface to more than 25 other services
    related to proteins, so we do not detail all the possibilities of this service.
    Here is an example that consists of looking for interactors of the
    protein ZAP70 within the IntAct database::

        >>> from bioservices import *
        >>> s = PSICQUIC()
        >>> res = s.query("intact", "zap70")
        >>> len(res) # there are 11 interactions found
        11
        >>> # Let us look at the second one in particular:
        >>> for x in res[1].split("\t"): 
        ...     print x
        uniprotkb:O95169
        uniprotkb:P43403
        intact:EBI-716238
        intact:EBI-1211276
        psi-mi:ndub8_human(display_long)|uniprotkb:NADH-ubiquinone oxidoreductase ASHI
        .
        .

    Here we have a list of entries. There are 15 of them (depending on
    the *output* parameter). The meaning of the entries is described on PSICQUIC
    website: https://code.google.com/p/psicquic/wiki/MITAB25Format . In short:

    
    #. Unique identifier for interactor A
    #. Unique identifier for interactor B.
    #. Alternative identifier for interactor A, for example the official gene
    #. Alternative identifier for interactor B.
    #. Aliases for A, separated by "|".
    #. Aliases for B.
    #. Interaction detection methods, taken from the corresponding PSI-MI
    #. First author surname(s) of the publication(s) 
    #. Identifier of the publication 
    #. NCBI Taxonomy identifier for interactor A. 
    #. NCBI Taxonomy identifier for interactor B.
    #. Interaction types, 
    #. Source databases and identifiers, 
    #. Interaction identifier(s) i
    #. Confidence score. Denoted as scoreType:value. 



    Another example with reactome database::

        res = s.query("reactome", "Q9Y266")


    .. warning:: PSICQUIC gives access to 25 other services. We cannot create
        a dedicated parser for all of them, so the :meth:`query` method returns
        the raw data. Additional classes may provide dedicated parsing in the
        future.

    .. seealso:: :class:`bioservices.biogrid.BioGRID`
    """

    _formats = ["tab25", "tab26", "tab27", "xml25", "count", "biopax", "xgmml",
        "rdf-xml", "rdf-xml-abbrev", "rdf-n3", "rdf-turtle"]


    # Maps the database prefix found in MITAB identifiers (the text before the
    # ":") to the code passed as ``fr=`` to ``self.uniprot.mapping`` in
    # mappingOneDB. A value of None means the identifier is not sent to UniProt
    # and is stored as "db:id" unchanged.
    # Note the typo in "genbank indentifier" (from the BIND DB): the key is
    # spelled that way on purpose.
    _mapping_uniprot = {"genbank indentifier": "P_GI",
        'entrezgene/locuslink':"P_ENTREZGENEID",
        'uniprotkb': "ACC+ID",
        'rcsb pdb':"PDB_ID",
        'ensembl':"ENSEMBL_ID",
        'refseq':"P_REFSEQ_AC",
        'hgnc':'HGNC_ID',
        "kegg": "KEGG_ID",
        "entrez gene/locuslink": "P_ENTREZGENEID",
        "chembl": "CHEMBL_ID",  # NOTE(review): dead entry, overridden by the later "chembl": None
        "ddbj/embl/genbank": "EMBL_ID",
        "dip": "DIP_ID",
        "ensemblgenomes": "ENSEMBLGENOME_ID",
        "omim":"MIM_ID",
        "chebi": None,
        "chembl": None,  # duplicate key: this value is the one that takes effect
#        "intact": None
    }

# unknown: hprd, omim, bind, bind complexid, mdl, 

    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: print informative messages

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()

        A UniProt service is created for the identifier mapping done in
        :meth:`mappingOneDB`. If it cannot be created, a warning is logged and
        ``self.uniprot`` is set to ``None`` so the attribute always exists.
        """
        urlStr = 'http://www.ebi.ac.uk/Tools/webservices/psicquic'
        super(PSICQUIC, self).__init__("PSICQUIC", verbose=verbose, url=urlStr)
        # The registry document is fetched lazily by the ``registry`` property.
        self._registry = None

        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            # Best effort: the PSICQUIC queries themselves do not need UniProt.
            self.uniprot = None
            self.logging.warning("UniProt service could not be initialised")

        self.buffer = {}

    def _get_formats(self):
        return PSICQUIC._formats
    formats = property(_get_formats, doc="Returns the possible output formats")

    def _get_active_db(self):
        names = self.registry_names[:]
        actives = self.registry_actives[:]
        names = [x.lower() for x,y in zip(names, actives) if y=="true"]
        return names
    activeDBs = property(_get_active_db, doc="returns the active DBs only")


    def read_registry(self):
        """Reads and returns the active registry 

        """
        url = self.url + '/registry/registry?action=ACTIVE&format=txt'
        res = self.request(url, format='txt')
        return res.split()

    def print_status(self):
        """Prints the services that are available

        :return: Nothing

        The output is tabulated. The columns are:

        * names
        * active
        * count
        * version
        * rest URL
        * soap URL
        * rest example
        * restricted

        .. seealso:: If you want the data into lists, see all attributes
            starting with registry such as :meth:`registry_names`
        """
        url = self.url +  '/registry/registry?action=STATUS&format=xml'
        res = self.request(url)
        names = self.registry_names
        counts = self.registry_counts
        versions = self.registry_versions
        actives = self.registry_actives
        resturls = self.registry_resturls
        soapurls = self.registry_soapurls
        restexs = self.registry_restexamples
        restricted = self.registry_restricted
        N = len(names)

        indices = sorted(range(0,N), key=lambda k: names[k])

        for i in range(0,N):
            print("%s\t %s\t %s\t %s\t %s %s %s %s\n" % (names[i], actives[i], 
                counts[i], versions[i], resturls[i], soapurls[i], restexs[i], restricted[i]))


    # todo a property for the version of PISCQUIC

    def _get_registry(self):
        if self._registry == None:
            url = self.url +  '/registry/registry?action=STATUS&format=xml'
            res = self.request(url, format="xml")
            self._registry = res
        return self._registry
    registry = property(_get_registry, doc="returns the registry of psicquic")

    def _get_registry_names(self):
        res = self.registry
        return [x.findAll('name')[0].text for x in res.findAll("service")]
    registry_names = property(_get_registry_names, doc="returns all services available (names)")

    def _get_registry_restricted(self):
        res = self.registry
        return [x.findAll('restricted')[0].text for x in res.findAll("service")]
    registry_restricted = property(_get_registry_restricted, doc="returns restricted status of services" )

    def _get_registry_resturl(self):
        res = self.registry
        data = [x.findAll('resturl')[0].text for x in res.findAll("service")]
        return data
    registry_resturls = property(_get_registry_resturl, doc="returns URL of REST services")

    def _get_registry_restex(self):
        res = self.registry
        data = [x.findAll('restexample')[0].text for x in res.findAll("service")]
        return data
    registry_restexamples = property(_get_registry_restex, doc="retuns REST example for each service")

    def _get_registry_soapurl(self):
        res = self.registry
        return  [x.findAll('soapurl')[0].text for x in res.findAll("service")]
    registry_soapurls = property(_get_registry_soapurl, doc="returns URL of WSDL service")

    def _get_registry_active(self):
        res = self.registry
        return  [x.findAll('active')[0].text for x in res.findAll("service")]
    registry_actives = property(_get_registry_active, doc="returns active state of each service")

    def _get_registry_count(self):
        res = self.registry
        return  [x.findAll('count')[0].text for x in res.findAll("service")]
    registry_counts = property(_get_registry_count, doc="returns number of entries in each service")

    def _get_registry_version(self):
        res = self.registry
        names = [x.findAll('name')[0].text for x in res.findAll("service")]
        N = len(names)
        version = [0] * N
        for i in range(0,N):
            x = res.findAll("service")[i]
            if x.findAll("version"):
                version[i] = x.findAll("version")[0].text
            else:
                version[i] = None 
        return  version
    registry_versions = property(_get_registry_version, doc="returns version of each service")

    def query(self, service, query, output="tab25", version="current", firstResult=None, maxResults=None):
        """Send a query to a specific database 

        :param str service: a registered service. See :attr:`registry_names`.
        :param str query: a valid query. Can be `*` or a protein name.
        :param str output: a valid format. See s._formats

        ::

            s.query("intact", "brca2", "tab27")
            s.query("intact", "zap70", "xml25")
            s.query("matrixdb", "*", "xml25")

        This is the programmatic approach to this website:

        http://www.ebi.ac.uk/Tools/webservices/psicquic/view/main.xhtml


        Another example consist in accessing the *string* database for fetching 
        protein-protein interaction data of a particular model organism. Here we
        restrict the query to 100 results::

            s.query("string", "species:10090", firstResult=0, maxResults=100, output="tab25")

        # spaces are automatically converted

            s.query("biogrid", "ZAP70 AND species:9606")

        .. warning:: AND must be in big caps. Some database are ore permissive
            than other (e.g., intact accepts "and"). species must be a valid ID number. Again, some DB are more
            permissive and may accept the name (e.g., human)

        To obtain the number of interactions in intact for the human specy:: 

            >>> len(p.query("intact", "species:9606"))


        """
        if service not in self.activeDBs:
            raise ValueError("database %s not in active databases" % service)

        params = {}
        if output!=None:
            self.checkParam(output, self.formats)
            params['format'] = output
        else: output="none"

        names = [x.lower() for x in self.registry_names]
        try:
            index = names.index(service)
        except ValueError:
            print("The service you gave (%s) is not registered. See self.registery_names" % service)
            raise ValueError

        # get the base url according to the service requested
        resturl = self.registry_resturls[index]

        if firstResult != None:
            params['firstResult'] = firstResult
        if maxResults != None:
            params['maxResults'] = maxResults

        postData = self.urlencode(params)

        url = resturl  + 'query/' + query.replace(" ", "%20")
        if params:
            url += "?" + postData


        if "xml" in output:
            res = self.request(url, format="xml", baseUrl=False)
        else:
            res = self.request(url, format="txt",baseUrl=False)
            res = res.strip().split("\n")

        if output.startswith("tab"):
            res = self._convert_tab2dict(res)

        return res


    def _convert_tab2dict(self, data):
        """

        https://code.google.com/p/psicquic/wiki/MITAB26Format
        """
        results = []
        for line in data:
            results.append(line.split("\t"))

        return results


    def queryAll(self, query, databases=None, output="tab25", version="current", firstResult=None, maxResults=None):
        """Same as query but runs on all active database

        :param list databases: database to query. Queries all active DB if not provided
        :return: dictionary where keys correspond to databases and values to the output of the query.

        ::

            res = s.queryAll("ZAP70 AND species:9606")
        """

        results = {}
        if databases == None:
             databases = [x.lower() for x in self.activeDBs]

        for x in databases:
            if x not in self.activeDBs:
                raise ValueError("database %s not in active databases" % x)


        for name in databases:
            self.logging.warning("Querying %s" % name),
            res = self.query(name, query, output=output, version=version, firstResult=firstResult, maxResults=maxResults)
            if output.startswith("tab25"):
                results[name] = [x for x in res if x!=[""]]
            else:
                import copy
                results[name] = copy.copy(res)
        for name in databases:
            self.logging.info("Found %s in %s" % (len(results[name]), name))
        return results



    def getInteractionCounter(self, query):
        """Returns a dictionary with database as key and results as values

        :param str query: a valid query
        :return: a dictionary which key as database and value as number of entries 

        Consider only the active database.

        """
        # get the active names only
        activeDBs = self.activeDBs[:] 
        res = [(str(name), int(self.query(name, query, output="count")[0])) for name in activeDBs]
        return dict(res)

    def getName(self, data):
        """Return two lists: the first column and the second column of every
        row of *data*."""
        idsA, idsB = [], []
        for row in data:
            idsA.append(row[0])
            idsB.append(row[1])
        return idsA, idsB

    def knownName(self, data):
        """Scan all entries (MITAB) and returns simplified version


        Each item in the input list of mitab entry
        The output is made of 2 lists corresponding to 
        interactor A and B found in the mitab entries.

        elements in the input list takes the following forms::

            DB1:ID1|DB2:ID2
            DB3:ID3

        The | sign separates equivalent IDs from different databases. 

        We want to keep only one. The first known databae is kept. If in the list of DB:ID pairs no known
        database is found, then we keep the first one whatsover.

        known databases are those available in the uniprot mapping tools. 

        chembl and chebi IDs are kept unchanged.


        """


        self.logging.info("converting data into known names")
        idsA = [x[0].replace("\"","") for x in data]
        idsB = [x[1].replace("\"", "") for x in data]
        # extract the first and second ID but let us check if it is part of a
        # known uniprot mapping.Otherwise no conversion will be possible.
        # If so, we set the ID to "unknown"
        # remove the " character that can be found in a few cases (e.g,
        # chebi:"CHEBI:29036")
        #idsA = [x.replace("chebi:CHEBI:","chebi:") for x in idsA]
        #idsB = [x.replace("chebi:CHEBI:", "chebi:") for x in idsB]

        # special case:
        # in mint, there is an entry that ends with a | uniprotkb:P17844|
        idsA = [x.strip("|") for x in idsA]
        idsB = [x.strip("|") for x in idsB]


        # the first ID
        for i, entry in enumerate(idsA):
            try:
                dbs = [x.split(":")[0] for x in entry.split("|")]
                IDs = [x.split(":")[1] for x in entry.split("|")]
                valid_dbs = [(db,ID) for db,ID in zip(dbs,IDs) if db in self._mapping_uniprot.keys()]
                # search for an existing DB
                if len(valid_dbs)>=1:
                    idsA[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1]
                else:
                    self.logging.debug("none of the DB for this entry (%s) are available" % (entry))
                    idsA[i] = "?" + dbs[0] + ":" + IDs[0]
            except:
                self.logging.info("Could not extract name from %s" % entry)
                idsA[i] = "??:" + entry  # we add a : so that we are sure that a split(":") will work
        # the second ID
        for i, entry in enumerate(idsB):
            try:
                dbs = [x.split(":")[0] for x in entry.split("|")]
                IDs = [x.split(":")[1] for x in entry.split("|")]
                valid_dbs = [(db,ID) for db,ID in zip(dbs,IDs) if db in self._mapping_uniprot.keys()]
                # search for an existing DB
                if len(valid_dbs)>=1:
                    idsB[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1]
                else:
                    self.logging.debug("none of the DB (%s) for this entry are available" % (entry))
                    idsB[i] = "?" + dbs[0] + ":" + IDs[0]
            except:
                self.logging.info("Could not extract name from %s" % entry)
                idsB[i] = "??:" + entry

        countA = len([x for x in idsA if x.startswith("?")])
        countB = len([x for x in idsB if x.startswith("?")])
        if countA+countB > 0:
            self.logging.warning("%s ids out of %s were not identified" % (countA+countB, len(idsA)*2))
            print (set([x.split(":")[0] for x in idsA if x.startswith("?")]))
            print (set([x.split(":")[0] for x in idsB if x.startswith("?")]))
        self.logging.info("knownName done")
        return idsA, idsB

    def preCleaning(self, data):
        """Remove the entries where IdA or IdB is set to "-"

        """
        kept = []
        for row in data:
            if row[0] != "-" and row[1] != "-":
                kept.append(row)
        return kept

    def postCleaningAll(self,data, keep_only="HUMAN", flatten=True, verbose=True):
        """
    
        even more cleaing by ignoring score, db and interaction
        len(set([(x[0],x[1]) for x in retnew]))
        """
        results = {}
        for k in data.keys():
            self.logging.info("Post cleaning %s" % k)
            ret = self.postCleaning(data[k], keep_only="HUMAN", verbose=verbose)
            if len(ret):
                results[k] = ret
        if flatten:
            results = [x for k in results.keys() for x in results[k]]
        return results

    def postCleaning(self, data, keep_only="HUMAN", remove_db=["chebi","chembl"], 
        keep_self_loop=False, verbose=True):
        """Filter converted entries and remove duplicates

        Entries are dropped, in this order, when one of their first two items
        is None, starts with "!", starts with one of the *remove_db* prefixes,
        does not contain *keep_only*, or (unless *keep_self_loop* is true)
        when both items are equal. The remaining entries are de-duplicated
        through a set, so the order of the returned list is not defined.

        When *verbose* is true the number of entries left is printed after
        each step.
        """
        def report(label, rows):
            if verbose:
                print(label, len(rows))

        report("Before removing anything: ", data)

        data = [row for row in data if not (row[0] is None or row[1] is None)]
        report("After removing the None: ", data)

        data = [row for row in data
                if not row[0].startswith("!") and not row[1].startswith("!")]
        report("After removing the !: ", data)

        for db in remove_db:
            data = [row for row in data
                    if not row[0].startswith(db) and not row[1].startswith(db)]
            report("After removing entries that match %s : " % db, data)

        data = [row for row in data if keep_only in row[0] and keep_only in row[1]]
        report("After removing entries that don't match %s : " % keep_only, data)

        if not keep_self_loop:
            data = [row for row in data if row[0] != row[1]]
            report("After removing self loop : ", data)

        data = list(set(data))
        report("After removing identical entries", data)

        return data


    def convertAll(self, data):
        results = {}
        for k in data.keys():
            self.logging.info("Analysing %s" % k)
            results[k] = self.convert(data[k], db=k)
        return results

    def convert(self, data, db=None):
        self.logging.debug("converting the database %s" % db)
        idsA, idsB = self.knownName(data)
        mapping = self.mappingOneDB(data)
        results = []
        for i, entry in enumerate(data):
            x = idsA[i].split(":",1)[1]
            y = idsB[i].split(":",1)[1]
            xp = mapping[x]
            yp = mapping[y]
            try:ref = entry[8]
            except:ref="?"
            try:score = entry[14]
            except:score = "?"
            try:interaction = entry[11]
            except:interaction="?"
            results.append((xp, yp, score, interaction, ref, db))
        return results


    def mappingOneDB(self, data):
        query = {}
        self.logging.debug("converting IDs with proper DB name (knownName function)")
        entriesA, entriesB = self.knownName(data) # idsA and B contains list of a single identifier of the form db:id
        # the db is known from _mapping.uniprot otherwise it is called "unknown"

        # get unique DBs to build the query dictionary
        dbsA = [x.split(":")[0] for x in entriesA]
        dbsB = [x.split(":")[0] for x in entriesB]
        for x in set(dbsA):
            query[x] = set()
        for x in set(dbsB):
            query[x] = set()
        for k in query.keys():
            if k.startswith("?"):
                del query[k]

        # the data to store
        mapping = {}
        N = len(data)

        # scan all entries
        counter = 0
        for entryA, entryB in zip(entriesA, entriesB):
            counter += 1
            dbA, idA = entryA.split(":")
            try:
                dbB, idB = entryB.split(":")
            except:
                print entryB
            if idA not in mapping.keys():
                if dbA.startswith("?"):
                    mapping[idA] = entryA
                else:
                    query[dbA].add(idA)
            if idB not in mapping.keys():
                if dbB.startswith("?"):
                    mapping[idB] = entryB
                else:
                    query[dbB].add(idB)

            for k in query.keys():
                if len(query[k])>2000 or counter == N:
                    this_query = list(query[k])
                    DBname = self._mapping_uniprot[k]

                    if DBname != None:
                        self.logging.warning("Request sent to uniprot for %s database (%s/%s)" % (DBname, counter, N))
                        res = self.uniprot.mapping(fr=DBname, to="ID", query=" ".join(this_query))
                        for x in this_query:
                            if x not in res: #was not found
                                mapping[x] = "!" + k+":"+x
                            else:
                                # we should be here since the queries are populated
                                # if not already in the mapping dictionary
                                if x == mapping.keys():
                                    raise ValueError(x)
                                index = res.index(x)
                                mapping[x] = res[index+1]
                    else:
                        for x in this_query:
                            mapping[x] = k + ":" + x
                    query[k] = set()

        for k in query.keys():
            assert len(query[k])==0
        return mapping
示例#7
0
def main():
    """Annotate the records of a VEP-annotated VCF with UniProt and PDB ids.

    Reads ``args.vcf`` and writes one tab-separated row to ``args.out`` per
    CSQ annotation, or a single row for a record without a CSQ field. For each
    non-empty ENSP identifier (CSQ field 26) UniProt is searched, and the first
    returned protein that has a PDB mapping provides the UniProt and PDB
    columns. Empty fields are written as the text ``None``.

    The VCF is read in a single pass. Iterating the same reader twice yields
    nothing the second time, which would leave the output with only its header.
    """
    args = parse_args()
    if args.log:
        logging.basicConfig(filename=args.log, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('Start.')
    logging.info('Command line: {}'.format(' '.join(sys.argv)))

    # Interface to the UniProt service
    u = UniProt(verbose=False)

    # ENSP id -> (UniProt ids text, PDB ids text), or None when nothing was
    # found. Caching avoids repeating the requests for a repeated ENSP id.
    ensp_cache = {}

    def as_text(value):
        # The output row is built with "\t".join, which needs strings.
        if isinstance(value, (list, tuple, set, frozenset)):
            return ",".join(str(item) for item in value)
        return str(value)

    def lookup(ensp):
        """Return (uniprot ids, pdb ids) as text for *ensp*, or None."""
        if ensp not in ensp_cache:
            ensp_cache[ensp] = None
            # to get Protein ID given ENSP ID
            for protein in u.search(ensp, frmt="list").split("\n"):
                if protein == "":
                    continue
                # to get PDB ID given protein id
                pdb_by_protein = u.mapping(fr="ID", to="PDB_ID", query=str(protein))
                if pdb_by_protein:
                    # Only the first non-empty mapping is used, and only the
                    # first of its values goes to the PDB column.
                    # NOTE(review): assumes the mapping is dict-like with
                    # list values -- confirm against the bioservices version.
                    first_pdb = next(iter(pdb_by_protein.values()))
                    ensp_cache[ensp] = (as_text(list(pdb_by_protein.keys())),
                                        as_text(first_pdb))
                    break
        return ensp_cache[ensp]

    with open(args.out, "w") as outputfile, open(args.vcf, 'r') as vcf_handle:

        def write_row(fields):
            outputfile.write("\t".join(x or 'None' for x in fields))
            outputfile.write("\n")

        # Output header
        # NOTE(review): the header names 14 columns but every row has 15
        # fields (the ENSP id written after swissprotid has no header) --
        # confirm which side downstream consumers expect.
        outputfile.write("chr\tpos\tid\tref\talt\tgene\tfeature\tfeature_type\tconsequence\tswissprotid\tuniprotid\tpdbid\tprotein_position\tamino_acid\n")

        for record in vcf.Reader(vcf_handle):
            variant = [record.CHROM, str(record.POS), str(record.ID), record.REF,
                       ','.join(str(v) for v in record.ALT)]

            if "CSQ" not in record.INFO:
                # No VEP annotation: the ten VEP-derived fields are empty.
                write_row(variant + [''] * 10)
                continue

            # Go through the annotations of all transcripts
            for current_csq_element in record.INFO['CSQ']:
                current_csq = current_csq_element.split('|')
                current_consequence = current_csq[1]
                current_gene = current_csq[4]
                current_feature_type = current_csq[5]
                current_feature = current_csq[6]
                current_protein_position = current_csq[14]
                current_amino_acid = current_csq[15]
                current_ENSP = current_csq[26]
                current_swissport = current_csq[27]

                found = lookup(current_ENSP) if current_ENSP != "" else None
                current_protein, current_pdbid = found if found else ('', '')

                write_row(variant + [
                    current_gene, current_feature, current_feature_type,
                    current_consequence, current_swissport, current_ENSP,
                    current_protein, current_pdbid, current_protein_position,
                    current_amino_acid])
示例#8
0
class Mapper(Logging):
    """Helpers to cross-reference identifiers using several web services

    The constructor creates the UniProt, KEGG, HGNC, UniChem and BioDBNet
    services used by the methods.

    Accepted code:

        uniprot


    m = Mapper()
    # HGNC
    df_hgnc = m.get_all_hgnc_into_df()
    df_hgnc.to_pickle("mapper_hgnc.dat")

    # KEGG
    df_kegg1 = m.get_all_kegg_into_df1()
    df_kegg2 = m.get_all_kegg_into_df2()

    uniq_keggid = 

    """
    kegg_dblinks = [
        "IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM", "NCBI-GeneID",
        "UniProt", "Vega"
    ]
    hgnc_dblink = [
        'EC', 'Ensembl', 'EntrezGene', 'GDB', 'GENATLAS', 'GeneCards',
        'GeneTests', 'GoPubmed', 'H-InvDB', 'HCDM', 'HCOP', 'HGNC', 'HORDE',
        'IMGT_GENE_DB', 'INTERFIL', 'IUPHAR', 'KZNF', 'MEROPS', 'Nucleotide',
        'OMIM', 'PubMed', 'RefSeq', 'Rfam', 'Treefam', 'UniProt', 'Vega',
        'miRNA', 'snoRNABase'
    ]

    def __init__(self, verbosity="INFO"):
        """Create the web-service clients.

        :param str verbosity: logging level passed to the base class.
        """
        super(Mapper, self).__init__(level=verbosity)
        self.logging.info("Initialising the services")
        self.logging.info("... uniprots")
        self._uniprot_service = UniProt()

        self.logging.info("... KEGG")
        self._kegg_service = KeggParser(verbose=False)

        self.logging.info("... HGNC")
        self._hgnc_service = HGNC()

        self.logging.info("... UniChem")
        self._unichem_service = UniChem()

        self.logging.info("...BioDBNet")
        self._biodbnet = BioDBNet()

    def _uniprot2refseq(self, name):
        """Map the UniProt accession *name* to RefSeq with the UniProt service

        There are 2 refseq alias: REFSEQ_NT_ID and P_REFSEQ_AC.

        Here, we use the first one to agree with wikipedia
        http://en.wikipedia.org/wiki/Protein_Kinase_B
        (e.g. ``name="P31749"``).

        :param str name: accession sent as the query.
        :return: the result of the UniProt ``mapping`` call.
        """
        # The accession given by the caller is queried (it used to be ignored
        # in favour of a fixed accession).
        return self._uniprot_service.mapping(fr="ACC",
                                             to="REFSEQ_NT_ID",
                                             query=name)

    def _update_uniprot_xref(self,
                             df,
                             xref=("HGNC_ID", "ENSEMBL_ID",
                                   "P_ENTREZGENEID")):
        """Update the dataframe using Uniprot to map indices onto cross
        reference databases

        :param df: dataframe whose index holds the accessions to map.
        :param xref: UniProt mapping codes; one ``<code>__uniprot_mapping``
            column is added or updated per code.
        :return: the updated dataframe. Joining a new column creates a new
            object, so callers must use the returned value.
        """
        for ref in xref:
            print("Processing %s " % ref)
            res = self._uniprot_service.multi_mapping("ACC",
                                                      ref,
                                                      list(df.index),
                                                      timeout=10,
                                                      ntrials=5)
            column = "%s__uniprot_mapping" % ref
            if column not in df.columns:
                thisdf = pd.DataFrame({column: list(res.values())},
                                      index=list(res.keys()))
                df = df.join(thisdf)
            else:
                for index in df.index:
                    if index in res:
                        # Single-cell assignment; chained indexing would
                        # write to a temporary copy.
                        # NOTE(review): assumes the column has object dtype
                        # when the mapped values are lists -- confirm.
                        df.at[index, column] = res[index]
        return df

    def get_data_from_biodbnet(self, df_hgnc):
        """keys are unique Gene names
        
        input is made of the df based on HGNC data web services

        uniprot accessions are duplicated sometimes. If so, this is actually the
        primary accession entry and all secondary ones.


        e.g. ,
        
        ABHD11 >>>> Q8N723;Q8NFV2;Q8NFV3;Q6PJU0;Q8NFV4;H7BYM8;Q8N722;Q9HBS8 ABHDB_HUMAN Alpha/beta hydrolase domain-containing protein 11
        correspond actually to the primary one : Q8NFV4

        """
        # NOTE(review): this method is left unchanged because it cannot run as
        # written: ``res`` is never defined here and ``df_hgnc`` is never used
        # (presumably the gene symbols should come from ``df_hgnc`` --
        # confirm). ``biodbnet`` and the Python 2 only ``StringIO`` module
        # must also be importable.
        b = biodbnet.BioDBNet()
        res2 = b.db2db("Gene Symbol", [
            "HGNC ID", "UniProt Accession", "UniProt Entry Name",
            "UniProt Protein Name", "KEGG Gene ID", "Ensembl Gene ID"
        ],
                       res.keys()[0:2000])

        import pandas as pd
        import StringIO
        c = pd.read_csv(StringIO.StringIO(res2),
                        delimiter="\t",
                        index_col="Gene Symbol")
        return c
示例#9
0
def get_more_node_ids(the_network, **kwargs):
    """ Script to add more identifiers to model notes
    based on the node.id

    The ids of all nodes are sent to the BioServices UniProt mapping
    service in batches. For every requested target type the answer is
    stored in the_node.notes[target_type]; nodes without an answer
    get an empty list.

    Arguments:
     the_network: a Network object, modified in place

    kwargs:
     node_id_type: current type of ids used for the nodes.
      Currently can be Entrez Gene (GeneID) or any of the
      options in the BioServices UniProt mappings.
      Defaults to "Entrez Gene (GeneID)".
     mapping_types: a list of target mapping id types to include.
      Defaults to default_mapping_target_list.
     verbose: if true, report each finished target type.
      Defaults to True.

    Returns:
     the_network (unchanged when bioservices is unavailable or the
     network has no nodes)

    TODO: determine the best source db/module
    for pairings from bioservices

    """
    continue_flag = True

    try:
        from bioservices import UniProt
        u = UniProt(verbose=False)
    except Exception:
        # Covers a missing package as well as a failure while setting up
        # the service; KeyboardInterrupt/SystemExit are left to propagate.
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    if len(the_network.get_node_locations()) == 0:
        print('The network has no nodes, exiting...')
        continue_flag = False

    node_id_type = kwargs.get('node_id_type', "Entrez Gene (GeneID)")

    # The module-level default is only looked up when it is needed.
    if 'mapping_types' in kwargs:
        mapping_types = kwargs['mapping_types']
    else:
        mapping_types = default_mapping_target_list

    verbose = kwargs.get('verbose', True)

    # Maximum number of items to
    # query at a time
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most
    # id's that can be queried are
    # between 100 and 1000
    max_query_length = 500

    if not continue_flag:
        return the_network

    model_node_ids = []
    for the_nodetype in the_network.nodetypes:
        model_node_ids += [x.id for x in the_nodetype.nodes]

    # One space-separated query string per batch of ids; no query is
    # built at all when there are no ids.
    query_string_list = [
        ' '.join(model_node_ids[start:start + max_query_length])
        for start in range(0, len(model_node_ids), max_query_length)
    ]

    for the_target_type in mapping_types:
        the_result = {}
        for the_query_string in query_string_list:
            the_result.update(
                u.mapping(fr=available_mapping_source[node_id_type],
                          to=available_mapping_target[the_target_type],
                          query=the_query_string))
        if verbose:
            print("**Finished mapping for %s to %s.**" % (node_id_type, the_target_type))
        for the_nodetype in the_network.nodetypes:
            for the_node in the_nodetype.nodes:
                mapped = the_result.get(the_node.id)
                # A fresh list per node so notes never share one object.
                the_node.notes[the_target_type] = mapped if mapped else []

    return the_network
示例#10
0
def get_more_node_ids(the_network, **kwargs):
    """ Script to add more identifiers to model notes
    based on the node.id

    Node ids are mapped through the BioServices UniProt mapping
    service in batches and the answers are stored in
    the_node.notes[target_type] (an empty list when nothing was
    returned). 'Symbol' is resolved afterwards from Entrez ids through
    get_entrez_annotation.

    Arguments:
     the_network: a Network object, modified in place

    kwargs:
     node_id_type: current type of ids used for the nodes.
      Currently can be 'Entrez Gene (GeneID)' or any of the
      options in the BioServices UniProt mappings.
      'Symbol' is rejected as a source type.
     mapping_types: a list of target mapping id types to include.
      Options can be viewed in core.parameters.py
      Note "Symbol" is an additional option for the
      official gene nomenclature symbol. Entries that are not
      valid targets are silently dropped.
     email: optional, for NCBI queries.
     verbose: [True (default), False]

    Returns:
     the_network

    TODO: determine the best source db/module
    for pairings from bioservices

    """
    entrez_key = 'Entrez Gene (GeneID)'

    continue_flag = True
    # list() so the concatenation also works when keys() is a view.
    valid_mapping_targets = list(available_mapping_target.keys()) + ['Symbol']
    # presumably returns kwargs['verbose'] when it is one of the listed
    # values and the first listed value otherwise -- verify in test_kwarg
    verbose = test_kwarg('verbose', kwargs, [True, False])

    try:
        from bioservices import UniProt
        # Don't want verbosity at this low of a level
        u = UniProt(verbose=False)
    except Exception:
        # Covers a missing package as well as a failure while setting up
        # the service; KeyboardInterrupt/SystemExit are left to propagate.
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    if len(the_network.get_node_locations()) == 0:
        print('The network has no nodes, exiting...')
        continue_flag = False

    if 'node_id_type' in kwargs:
        node_id_type = kwargs['node_id_type']
        if node_id_type == 'Symbol':
            print("'Symbol' is a special case, not yet able to query with this option, exiting...")
            continue_flag = False
    else:
        print("No node id type specified, attempting to use 'Entrez Gene (GeneID)'")
        node_id_type = entrez_key

    if 'mapping_types' in kwargs:
        mapping_types = [x for x in kwargs['mapping_types'] if x in valid_mapping_targets]
        if len(mapping_types) == 0:
            print('No valid mapping_types selected, exiting...')
            continue_flag = False
        elif 'Symbol' in mapping_types:
            # Symbols are looked up from Entrez ids, so those must be
            # either the node ids themselves or one of the targets.
            if (entrez_key not in mapping_types) and (node_id_type != entrez_key):
                print("'Symbol' mapping type needs 'Entrez Gene (GeneID)', exiting...")
                continue_flag = False
    else:
        mapping_types = default_mapping_target_list

    email = kwargs.get('email', '')

    # Maximum number of items to
    # query at a time
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most
    # id's that can be queried are
    # between 100 and 1000
    max_query_length = 500

    if not continue_flag:
        return the_network

    model_nodes = []
    for the_nodetype in the_network.nodetypes:
        model_nodes += list(the_nodetype.nodes)
    model_node_ids = [the_node.id for the_node in model_nodes]

    # One space-separated query string per batch of ids.
    query_string_list = [
        ' '.join(model_node_ids[start:start + max_query_length])
        for start in range(0, len(model_node_ids), max_query_length)
    ]

    for the_target_type in mapping_types:
        if the_target_type == 'Symbol':
            # Handled separately below.
            continue
        the_result = {}
        for the_query_string in query_string_list:
            the_result.update(
                u.mapping(fr=available_mapping_source[node_id_type],
                          to=available_mapping_target[the_target_type],
                          query=the_query_string))
        if verbose:
            print("**Finished mapping for %s to %s.**" % (node_id_type, the_target_type))
        for the_node in model_nodes:
            mapped = the_result.get(the_node.id)
            # A fresh list per node so notes never share one object.
            the_node.notes[the_target_type] = mapped if mapped else []

    # To avoid a loss of information, we should also make
    # sure queried IDs are returned in the appropriate
    # field in case they weren't available in the database.
    if node_id_type in mapping_types:
        # Not yet supported anyway, but can leave this here.
        if node_id_type != 'Symbol':
            for the_node in model_nodes:
                if the_node.id not in the_node.notes[node_id_type]:
                    the_node.notes[node_id_type].append(the_node.id)

    if "Symbol" in mapping_types:
        if (node_id_type == entrez_key) or (entrez_key in mapping_types):
            # node id -> Entrez ids to resolve for that node
            entrez_by_node_id = {}
            the_entrez_to_query = []
            for the_node in model_nodes:
                entrez_ids = []
                if node_id_type == entrez_key:
                    entrez_ids.append(the_node.id)
                if entrez_key in mapping_types:
                    for the_entrez_id in the_node.notes[entrez_key]:
                        if the_entrez_id not in entrez_ids:
                            entrez_ids.append(the_entrez_id)
                entrez_by_node_id[the_node.id] = entrez_ids
                the_entrez_to_query += entrez_ids
            the_entrez_to_query = list(set(the_entrez_to_query))
            the_symbol_dict = get_entrez_annotation(the_entrez_to_query, email=email, verbose=verbose)
            for the_node in model_nodes:
                the_node.notes["Symbol"] = []
                for the_entrez_id in entrez_by_node_id[the_node.id]:
                    the_symbol_id = the_symbol_dict[the_entrez_id]['NomenclatureSymbol']
                    if len(the_symbol_id) > 0:
                        the_node.notes["Symbol"].append(the_symbol_id)
            print("**Finished mapping for %s to %s.**" % (node_id_type, "Symbol"))
        elif verbose:
            print("'Entrez Gene (GeneID)' mappings are needed first in order to query symbols, skipping...")

    return the_network
示例#11
0
def convert_gene_ids_bt(xml_file_in, id_identity=None, id_formatter=None,
                        translate_file=None, ncbi_gene_file=None):
    """
    Replace all found instances of old gene IDs to new IDs.
    N.B. It will only look at 'GENE ASSOCIATION' lines.
    'translate_file' should be 2 column tsv file.

    The converted model is written next to the input, with '.xml'
    replaced by '_out.xml'; every line without 'GENE ASSOCIATION' is
    copied unchanged.

    Arguments:
     xml_file_in: path of the model file to convert; must contain '.xml'.
     id_identity: regex matching the old gene ids
      (default: model_metacyc_identifier).
     id_formatter: callable turning a matched old id into the key used
      in the translation table (default: model_metacyc_gene_2_biocyc).
     translate_file: 2 column tsv, old id -> new id.
     ncbi_gene_file: NCBI gene list used to map gene names to BT_ loci
      (default: the original hard-coded location).

    Raises:
     ValueError: if the output path would be the same as the input path,
      which would truncate the input while it is being read.
    """

    ## Create ID conversion dictionary for MetaCyc

    translate_file = translate_file or '/Users/wbryant/Dropbox/Bacteroides/BioCyc_-_Protein-Gene-relations/BioCyc_BT_-_Protein-Gene-relations.txt'
    id_dict = {}
    with open(translate_file, 'r') as trans_in:
        for line in trans_in:
            ids = line.split("\t")
            # Lines without a second column carry no translation.
            if len(ids) > 1 and len(ids[1]) > 0:
                id_dict[ids[0]] = ids[1].strip()

    id_identity = id_identity or model_metacyc_identifier
    id_formatter = id_formatter or model_metacyc_gene_2_biocyc

    ## Create gene -> locus dictionary from NCBI file

    ncbi_gene_file = ncbi_gene_file or '/Users/wbryant/work/BTH/data/NCBI/gene_list.dat'
    ncbi_id_dict = {}
    # Name taken from the most recent numbered entry line; stays None
    # until the first such line has been seen.
    ncbi_id = None
    with open(ncbi_gene_file, 'r') as ncbi_in:
        for line in ncbi_in:
            if re.search(r'[0-9]+\.[ ].+', line):
                ncbi_id = line.strip().split(" ")[-1]
            elif 'Other Aliases' in line and ncbi_id is not None:
                # When several BT_ ids are listed the last one wins.
                for bt_id in re.findall(r'BT\_[0-9]+', line):
                    ncbi_id_dict[ncbi_id] = bt_id

    ## Some specific UniProt IDs do not map - so put them here manually:

    uniprot_manual_dict = {
        'Q8A1G3_BACTN': 'BT_3698',
        'G8JZS4_BACTN': 'BT_3703',
        'Q8A1G0_BACTN': 'BT_3704',
        'Q89YR9_BACTN': 'BT_4662',
    }

    ### Run through lines of input file replacing relevant gene IDs with new gene IDs

    u = UniProt(verbose=False)
    xml_file_out = re.sub(r'\.xml', '_out.xml', xml_file_in)
    if xml_file_out == xml_file_in:
        raise ValueError(
            "Input file name '%s' does not contain '.xml'; writing the "
            "output would overwrite the input." % xml_file_in)

    with open(xml_file_in, 'r') as f_in, open(xml_file_out, 'w') as f_out:
        for line in f_in:
            if 'GENE ASSOCIATION' not in line:
                f_out.write(line)
                continue

            ## Look for genes fitting id_identity, convert and replace

            # Put a space in front of the closing tag at the end of the line.
            line = re.sub(r'(\<[^\>]+\>[ \n]*$)', r' \g<1>', line)

            for old_id in re.findall(id_identity, line):
                old_id_formatted = id_formatter(old_id)
                try:
                    new_id = id_dict[old_id_formatted]
                except KeyError:
                    new_id = old_id_formatted
                    print("ID '%s' not found ..." % new_id)
                line = line.replace(old_id, new_id, 1)

            ## Remove extraneous gene surrounds
            line = re.sub(r'\(gene\:([^\)]+)_i\)', r'\g<1>', line)

            ## Look for UniProt genes and convert
            if 'uniprot' in line:
                for uniprot_entry in re.findall(r'\(uniprot\:[^\)]+\)', line):
                    ## Map IDs
                    uniprot_id = re.sub(r'\(uniprot\:([^\)]+)\)', r'\g<1>', uniprot_entry)

                    try:
                        new_entry = u.mapping(fr='ACC', to='KEGG_ID', query=uniprot_id)[uniprot_id][0]
                    except Exception:
                        # Remote lookup failed or returned nothing usable;
                        # fall back to the manual table, then to the id itself.
                        print("Protein ID '%s' not found in mapping, trying local ..." % uniprot_id)
                        if uniprot_id in uniprot_manual_dict:
                            new_entry = uniprot_manual_dict[uniprot_id]
                        else:
                            print("Protein ID '%s' not found in local ..." % uniprot_id)
                            new_entry = uniprot_id

                    # Strip the 'bth:' organism prefix of KEGG ids.
                    new_id = re.sub(r'bth\:([^\)]+)', r'\g<1>', new_entry)
                    line = line.replace(uniprot_entry, new_id, 1)

            ## Get gene string
            line_groups = re.search(r'(.+GENE ASSOCIATION\:[ ]*)(.+)([ ]*\<.+)', line)
            if line_groups is None:
                # Not in the expected 'GENE ASSOCIATION: ... <tag>' shape;
                # keep what has been converted so far.
                f_out.write(line)
                continue
            gene_string = line_groups.group(2)

            if '_BACTN' in gene_string:
                print(gene_string)

            ## Look for NCBI IDs (like susG) and replace with BT IDs
            potential_ncbis = re.findall(r'[a-zA-Z0-9\_]+', gene_string)
            if '_BACTN' in gene_string:
                print(", ".join(potential_ncbis))
            for potential_ncbi in potential_ncbis:
                if potential_ncbi in ncbi_id_dict:
                    new_id = ncbi_id_dict[potential_ncbi]
                    gene_string = gene_string.replace(potential_ncbi, new_id, 1)
                elif potential_ncbi in uniprot_manual_dict:
                    new_id = uniprot_manual_dict[potential_ncbi]
                    gene_string = gene_string.replace(potential_ncbi, new_id, 1)

            ## Remove duplicates (first occurrence kept, so the output
            ## order is deterministic)
            gene_list = []
            for gene in gene_string.split(" or "):
                if gene not in gene_list:
                    gene_list.append(gene)

            ## Reconstitute line
            line = line_groups.group(1)
            line += " or ".join(gene_list)
            line += line_groups.group(3)

            f_out.write(line)
示例#12
0
def batch_map(accessions,
              fr='ACC+ID',
              allow_download=False,
              cache=False,
              session=None,
              keep_unreviewed=True,
              match_taxon_id=9606,
              verbose=False):
    """
    Resolve accessions to current UniProt accessions via batch mapping.

    Parameters
    ----------
    accessions : list
        Accessions to map.

    fr : str, optional
        Source database code. See :class:`bioservices.UniProt`.

    keep_unreviewed : bool, optional
        When True, unreviewed target accessions are kept in the result.

    allow_download : bool, optional
        When True, a target accession missing from the database is
        downloaded and saved.

    cache : bool, optional
        Passed to the mapper; when True the `bioservices` cache is used.
        Leave False for the most up-to-date mappings.

    session : `scoped_session`, optional
        Session used to save downloaded proteins when `allow_download`
        is True.

    match_taxon_id : int, optional
        Targets whose protein does not have this taxon id are dropped.
        Use None to disable the check.

    verbose : bool, optional
        Log info/warning messages.

    Returns
    -------
    `dict`
        Maps each queried accession to a sorted list of accepted target
        accessions.

    Raises
    ------
    ValueError
        If the mapping is still empty after all download attempts.
    """
    mapper = UniProtMapper(cache=cache)
    mapping = mapper.mapping(fr=fr, to='ACC', query=accessions)

    # Nothing came back: retry up to four more times, pausing in between.
    if mapping == {}:
        attempt = 2
        while attempt <= 5:
            mapping = mapper.mapping(fr=fr, to='ACC', query=accessions)
            if mapping:
                break
            if verbose:
                logger.warning(
                    "Could not download map from uniprot server. "
                    "Attempt {}/5. Re-attempt in 3 seconds.".format(attempt))
            time.sleep(3)
            attempt += 1
    if mapping == {}:
        raise ValueError("Could not download map from uniprot server.")

    filtered_mapping = {}
    for source_accession, mapped in mapping.items():
        # First pass: collect the targets that cannot be used.
        rejected = []
        for accession in mapped:
            entry = Protein.get_by_uniprot_id(accession)

            if entry is not None:
                # Known protein: only the taxon can disqualify it.
                wrong_taxon = (match_taxon_id is not None
                               and entry.taxon_id != match_taxon_id)
                if wrong_taxon:
                    rejected.append(accession)
                continue

            if not allow_download:
                rejected.append(accession)
                continue

            if verbose:
                logger.info(
                    "Mapping to {}, but entry not found in database. "
                    "Attempting download.".format(accession))
            record = download_record(accession,
                                     verbose=True,
                                     taxon_id=match_taxon_id)
            protein = parse_record_into_protein(record)
            if protein is not None:
                protein.save(session, commit=True)
                continue

            if verbose:
                logger.info(
                    "No valid record for {} was found".format(accession))
            rejected.append(accession)

        # Second pass: split the usable targets by review status.
        usable = [a for a in mapped if a not in rejected]
        flags = [Protein.get_by_uniprot_id(a).reviewed for a in usable]
        reviewed = [a for (a, flag) in zip(usable, flags) if flag is True]
        unreviewed = [a for (a, flag) in zip(usable, flags) if flag is False]

        targets = reviewed
        if keep_unreviewed:
            targets += unreviewed
        targets = list(set(targets))

        if match_taxon_id is not None:
            taxon_ids = [
                Protein.get_by_uniprot_id(a).taxon_id for a in targets
            ]
            targets = [
                t for (t, taxon_id) in zip(targets, taxon_ids)
                if match_taxon_id == taxon_id
            ]
        filtered_mapping[source_accession] = list(sorted(targets))

    return filtered_mapping
示例#13
0
def keggid_to_uniprot(interactions, verbose=False, trembl=False, cache=False):
    """
    Map KEGG_ID accessions into uniprot. Performs a product operation
    to produce multiple new interactions from a single interaction if
    multiple possible mappings are found.

    For each KEGG id the reviewed accessions found in the database are
    used; when there are none and `trembl` is True the unreviewed ones
    are used instead. Interactions whose source or target has no mapping
    are dropped.

    Parameters
    ----------
    interactions : :class:`pd.DataFrame`
        DataFrame with 'source', 'target', 'label', 'pubmed', and
        'experiment_type' columns.

    trembl : bool, optional, default: False
        If True, during the mapping process, keeps mapped rows containing
        TrEMBL accessions in either `source` or `target`. Otherwise, these
        rows are deleted.

    verbose : bool, optional, default: False
        If True, logs messages regarding mapping warnings and other information.

    cache : bool, optional, default: False
        If True, HTTP responses are cached by `bioservices`. This can save
        time but you will eventually miss out on new database releases if
        your cache is old.

    Returns
    -------
    `pd.DataFrame`
        DataFrame with 'source', 'target', 'label', 'pubmed', and
        'experiment_type' columns.
    """
    filtered_map = {}
    sources = list(interactions.source.values)
    targets = list(interactions.target.values)
    unique_ids = list(set(sources) | set(targets))

    mapper = UniProt(cache=cache)
    mapping = mapper.mapping(fr='KEGG_ID', to='ACC', query=unique_ids)

    for kegg_id, uniprot_ls in mapping.items():
        # Check that the accessions are actually in the database.
        # If not, ignore them and warn the user. The lists returned by
        # the mapper are left untouched.
        reviewed = []
        unreviewed = []
        for accession in uniprot_ls:
            protein = Protein.get_by_uniprot_id(accession)
            if protein is None:
                if verbose:
                    logger.warning(
                        "No protein for '{}' found in the database. Consider "
                        "downloading the latest UniProt dat files and "
                        "updating the database.".format(accession)
                    )
                continue
            if protein.reviewed is True:
                reviewed.append(accession)
            elif protein.reviewed is False:
                unreviewed.append(accession)

        if len(reviewed) > 0:
            if len(reviewed) > 1 and verbose:
                logger.warning(
                    'More that one reviewed '
                    'acc found for {}: {}'.format(kegg_id, reviewed)
                )
            filtered_map[kegg_id] = reviewed
            continue

        if verbose:
            logger.warning(
                'No reviewed acc found for {}.'.format(kegg_id)
            )
        if trembl and len(unreviewed) > 0:
            # Check the unreviewed list here: `reviewed` is always empty
            # in this branch, so testing it could never trigger the warning.
            if len(unreviewed) > 1 and verbose:
                logger.warning(
                    'More that one unreviewed '
                    'acc found for {}: {}'.format(kegg_id, unreviewed)
                )
            filtered_map[kegg_id] = unreviewed
        elif verbose:
            logger.warning('Could not map {}.'.format(kegg_id))

    # KEGG ids without a mapping resolve to an empty list, so the product
    # below yields nothing and the interaction is dropped.
    zipped = list(
        zip(
            interactions[SOURCE].values,
            interactions[TARGET].values,
            interactions[LABEL].values,
            interactions[PUBMED].values,
            interactions[EXPERIMENT_TYPE].values
        )
    )
    sources = []
    targets = []
    labels = []
    pmids = []
    psimis = []
    for source, target, label, pmid, psimi in zipped:
        source_acc = filtered_map.get(source, [])
        target_acc = filtered_map.get(target, [])

        # Some Kegg_Ids genuinely map to more than 1 distinct uniprot
        # accession, so we use a list product to account for this.
        for (s, t) in product(source_acc, target_acc):
            sources.append(s)
            targets.append(t)
            labels.append(label)
            pmids.append(pmid)
            psimis.append(psimi)

    interactions = make_interaction_frame(
        sources, targets, labels, pmids, psimis
    )
    return interactions
示例#14
0
def get_more_source_dict_ids(source_dict, primary_key, **kwargs):
    """ Script to add more ids to source dict nodes
    to facilitate pairing to a network

    The keys of source_dict are mapped through the BioServices UniProt
    mapping service in batches. Every value that is not already a dict
    is wrapped as {'value': original_value}, and the mapped ids are
    stored under source_dict[key][target_type] (an empty list when
    nothing was returned).

    Arguments:
     source_dict: id_key: value

     primary_key: current type of ids used for the nodes.
      Currently can be 'Entrez Gene (GeneID)' or any of the options
      in the BioServices UniProt mappings.

    kwargs:
     mapping_types: a list of mapping types to include.
      Defaults to default_mapping_target_list.
     verbose: if true, report each finished target type.
      Defaults to True.
     node_id_type: accepted for compatibility; it has no effect.

    Returns:
     source_dict, also modified in place (left untouched when the
     primary_key is invalid or bioservices is unavailable)

    """

    continue_flag = True

    file_key = primary_key
    if primary_key not in available_mapping_source:
        continue_flag = False
        print("Error, you must specify a valid primary_key descriptor to match to in the available database, exiting...")

    # The module-level default is only looked up when it is needed.
    if 'mapping_types' in kwargs:
        mapping_types = kwargs['mapping_types']
    else:
        mapping_types = default_mapping_target_list

    try:
        from bioservices import UniProt
        u = UniProt(verbose=False)
    except Exception:
        # Covers a missing package as well as a failure while setting up
        # the service; KeyboardInterrupt/SystemExit are left to propagate.
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    verbose = kwargs.get('verbose', True)

    # Maximum number of items to
    # query at a time
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most
    # id's that can be queried are
    # between 100 and 1000
    max_query_length = 500

    if not continue_flag:
        return source_dict

    # Snapshot of the keys: values are replaced below while iterating.
    query_ids = list(source_dict.keys())

    # One space-separated query string per batch of ids.
    the_query_string_list = [
        ' '.join(query_ids[start:start + max_query_length])
        for start in range(0, len(query_ids), max_query_length)
    ]

    # Make every entry a dict so the mapped ids can be stored next to
    # the original value.
    for the_key in query_ids:
        if not isinstance(source_dict[the_key], dict):
            source_dict[the_key] = {'value': source_dict[the_key]}

    for the_target_type in mapping_types:
        the_result = {}
        for the_query_string in the_query_string_list:
            the_result.update(
                u.mapping(fr=available_mapping_source[file_key],
                          to=available_mapping_target[the_target_type],
                          query=the_query_string))
        if verbose:
            print("** Finished mapping for %s to %s. **" % (file_key, the_target_type))
        for the_query_id in query_ids:
            mapped = the_result.get(the_query_id)
            # A fresh list per entry so entries never share one object.
            source_dict[the_query_id][the_target_type] = mapped if mapped else []

    return source_dict
示例#15
0
class Mapper(Logging):
    """Map identifiers between databases using several web services.

    Accepted code:

        uniprot

    Example::

        m = Mapper()
        # HGNC
        df_hgnc = m.get_all_hgnc_into_df()
        df_hgnc.to_pickle("mapper_hgnc.dat")

        # KEGG
        df_kegg1 = m.get_all_kegg_into_df1()
        df_kegg2 = m.get_all_kegg_into_df2()

    """
    kegg_dblinks  = ["IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM", "NCBI-GeneID", "UniProt", "Vega"]
    hgnc_dblink =  ['EC','Ensembl', 'EntrezGene', 'GDB', 'GENATLAS',
            'GeneCards', 'GeneTests', 'GoPubmed', 'H-InvDB', 'HCDM', 'HCOP',
            'HGNC', 'HORDE', 'IMGT_GENE_DB', 'INTERFIL', 'IUPHAR', 'KZNF',
            'MEROPS', 'Nucleotide', 'OMIM', 'PubMed', 'RefSeq', 'Rfam',
            'Treefam', 'UniProt', 'Vega', 'miRNA', 'snoRNABase']


    def __init__(self, verbosity="INFO"):
        """Create one instance of every service used by the mapper.

        :param verbosity: logging level passed on to the Logging base class.
        """
        super(Mapper, self).__init__(level=verbosity)
        self.logging.info("Initialising the services")
        self.logging.info("... uniprots")
        self._uniprot_service = UniProt()

        self.logging.info("... KEGG")
        self._kegg_service = KeggParser(verbose=False)

        self.logging.info("... HGNC")
        self._hgnc_service = HGNC()

        self.logging.info("... UniChem")
        self._unichem_service = UniChem()

        self.logging.info("...BioDBNet")
        self._biodbnet = BioDBNet()

    def _uniprot2refseq(self, name):
        """Map a UniProt accession onto RefSeq with the UniProt service.

        There are 2 refseq alias: REFSEQ_NT_ID and P_REFSEQ_AC.

        Here, we use the first one to agree with wikipedia
        http://en.wikipedia.org/wiki/Protein_Kinase_B

        :param name: UniProt accession to map (e.g. "P31749").
        :return: whatever the UniProt service ``mapping`` call returns.
        """
        # The accession given by the caller is queried; it used to be
        # ignored in favour of the hard-coded example P31749.
        return self._uniprot_service.mapping(fr="ACC", to="REFSEQ_NT_ID", query=name)

    def _update_uniprot_xref(self, df,
            xref=("HGNC_ID", "ENSEMBLE_ID",  "P_ENTREZGENEID")):
        """Update the dataframe using Uniprot to map indices onto cross
        reference databases

        For every code in `xref` the index of `df` is mapped from "ACC"
        and the answer is stored in the column "<code>__uniprot_mapping":
        the column is joined onto `df` when missing, otherwise the cells
        of the indices that got an answer are overwritten.

        :param df: dataframe whose index holds the accessions to map.
        :param xref: iterable of target database codes.
            NOTE(review): "ENSEMBLE_ID" looks like a typo for UniProt's
            "ENSEMBL_ID" code -- confirm before relying on that column.
        :return: the updated dataframe. Joining creates a new object, so
            callers must use the returned value rather than `df`.
        """
        for ref in xref:
            print("Processing %s " % ref)
            res = self._uniprot_service.multi_mapping("ACC", ref,
                    list(df.index), timeout=10, ntrials=5)
            column = "%s__uniprot_mapping" % ref
            if column not in df.columns:
                # The column name must carry the code; an unformatted
                # "%s__uniprot_mapping" key would collide on the next join.
                thisdf = pd.DataFrame({column: list(res.values())},
                        index=list(res.keys()))
                df = df.join(thisdf)
            else:
                for index in df.index:
                    if index in res:
                        # Single-cell assignment; chained indexing may
                        # write to a temporary copy instead of df.
                        df.at[index, column] = res[index]
        return df

    def get_data_from_biodbnet(self, df_hgnc):
        """keys are unique Gene names

        input is made of the df based on HGNC data web services

        uniprot accession are duplicated sometimes. If so, this is actually
        the primary accession entry and all secondary ones.


        e.g. ,

        ABHD11 >>>> Q8N723;Q8NFV2;Q8NFV3;Q6PJU0;Q8NFV4;H7BYM8;Q8N722;Q9HBS8 ABHDB_HUMAN Alpha/beta hydrolase domain-containing protein 11
        correspond actually to the primary one : Q8NFV4

        :return: DataFrame parsed from the tab-delimited answer of
            BioDBNet ``db2db``, indexed by "Gene Symbol".
        """
        # NOTE(review): `res` is neither a parameter nor assigned in this
        # method and `df_hgnc` is never read; unless a module-level `res`
        # exists the call below raises NameError. Presumably the gene
        # symbols of `df_hgnc` were intended -- TODO confirm. The code is
        # left as found because the intended input cannot be told from here.
        b = biodbnet.BioDBNet()
        res2 = b.db2db("Gene Symbol", ["HGNC ID", "UniProt Accession", "UniProt Entry Name", "UniProt Protein Name", "KEGG Gene ID", "Ensembl Gene ID"], 
                res.keys()[0:2000])

        import pandas as pd
        # NOTE: `StringIO` is the Python 2 module name (io.StringIO on
        # Python 3).
        import StringIO
        c = pd.read_csv(StringIO.StringIO(res2), delimiter="\t", index_col="Gene Symbol")
        return c
示例#16
0
def get_more_source_dict_ids(source_dict, primary_key_type, **kwargs):
    """Add more ids to the nodes of ``source_dict`` to ease pairing to a network.

    Arguments:
     source_dict: id_key: value.  Values that are not dicts are wrapped as
      ``{'value': original_value}`` so that the new ids can be stored
      next to them.

     primary_key_type: current type of ids used for the top level dict key.
      Currently can be 'Entrez Gene (GeneID)' or any of the options
      in the BioServices UniProt mappings.

    kwargs:
     mapping_types: a list of mapping types to include.
      See core.parameters for the full list.  Note
      'Symbol' is a special case for querying that depends
      on Entrez ID availability.
     verbose: [False (default), True]
     email: optional, for NCBI if querying for 'Symbol'

    Returns:
     source_dict, also modified in place.  When the arguments are invalid
     or BioServices cannot be imported, a message is printed and
     source_dict is returned untouched.
    """
    entrez_key = 'Entrez Gene (GeneID)'

    continue_flag = True
    verbose = test_kwarg('verbose', kwargs, [False, True])
    valid_mapping_targets = list(available_mapping_target.keys()) + ['Symbol']

    if primary_key_type not in available_mapping_source:
        if primary_key_type == 'Symbol':
            print("'Symbol' is a special case, not yet able to query with this as a primary key.")
        # Report every invalid key type, not only 'Symbol'; otherwise the
        # function gives up without telling the caller why.
        print("Error, you must specify a valid primary_key_type descriptor to match to in the available database, exiting...")
        continue_flag = False

    if 'mapping_types' in kwargs:
        mapping_types = [x for x in kwargs['mapping_types'] if x in valid_mapping_targets]
        if len(mapping_types) == 0:
            print('No valid mapping_types selected, exiting...')
            continue_flag = False
        elif ('Symbol' in mapping_types
              and entrez_key not in mapping_types
              and primary_key_type != entrez_key):
            print("'Symbol' mapping type needs 'Entrez Gene (GeneID)', exiting...")
            continue_flag = False
    else:
        mapping_types = default_mapping_target_list

    email = kwargs.get('email', '')

    try:
        from bioservices import UniProt
        # Don't want verbosity at this low of a level
        u = UniProt(verbose = False)
    except ImportError:
        print("No BioServices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    if not continue_flag:
        return source_dict

    # Maximum number of items to query at a time.
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most id's that can be
    # queried are between 100 and 1000.
    max_query_length = 500

    # One space-separated query string per chunk of ids.  An empty
    # source_dict yields no chunk, hence no web query at all.
    the_query_ids = list(source_dict.keys())
    the_query_string_list = [
        ' '.join(the_query_ids[start:start + max_query_length])
        for start in range(0, len(the_query_ids), max_query_length)
    ]

    # Make sure every node is a dict able to hold the extra ids.
    for the_key in the_query_ids:
        if not isinstance(source_dict[the_key], dict):
            source_dict[the_key] = {'value': source_dict[the_key]}

    for the_target_type in mapping_types:
        # 'Symbol' is resolved separately below.
        if the_target_type == 'Symbol':
            continue
        the_result = {}
        for the_query_string in the_query_string_list:
            the_result.update(u.mapping(fr = available_mapping_source[primary_key_type], to = available_mapping_target[the_target_type], query = the_query_string))
        if verbose:
            print("** Finished mapping for %s to %s. **" % (primary_key_type, the_target_type))
        for the_query_id in source_dict:
            # Ids missing from the answer, or mapped to nothing, get [].
            source_dict[the_query_id][the_target_type] = the_result.get(the_query_id) or []

    # To avoid a loss of information, we should also make
    # sure queried IDs are returned in the appropriate
    # field in case they weren't available in the database.
    # 'Symbol' is not yet supported as a primary key but we can
    # check to avoid breaking this.
    if primary_key_type in mapping_types and primary_key_type != 'Symbol':
        for the_source_dict_id in source_dict:
            if the_source_dict_id not in source_dict[the_source_dict_id][primary_key_type]:
                source_dict[the_source_dict_id][primary_key_type].append(the_source_dict_id)

    if 'Symbol' in mapping_types and (primary_key_type == entrez_key or entrez_key in mapping_types):
        # Collect, per node, the Entrez ids to look up.  The primary key is
        # included explicitly in case "Entrez Gene (GeneID)" was the
        # primary_key_type but not in mapping_types.
        query_dict = {}
        the_entrez_to_query = set()
        for the_source_dict_id in source_dict:
            the_entrez_ids = []
            if primary_key_type == entrez_key:
                the_entrez_ids.append(the_source_dict_id)
            if entrez_key in mapping_types:
                for the_entrez_id in source_dict[the_source_dict_id][entrez_key]:
                    if the_entrez_id not in the_entrez_ids:
                        the_entrez_ids.append(the_entrez_id)
            query_dict[the_source_dict_id] = the_entrez_ids
            the_entrez_to_query.update(the_entrez_ids)

        the_symbol_dict = get_entrez_annotation(list(the_entrez_to_query), email = email, verbose = verbose)
        for the_source_dict_id in source_dict:
            source_dict[the_source_dict_id]["Symbol"] = []
            for the_entrez_id in query_dict[the_source_dict_id]:
                the_symbol_id = the_symbol_dict[the_entrez_id]['NomenclatureSymbol']
                if len(the_symbol_id) > 0:
                    source_dict[the_source_dict_id]["Symbol"].append(the_symbol_id)
        print("**Finished mapping for %s to %s.**" % (primary_key_type, "Symbol"))

    return source_dict
示例#17
0
def _id_map_(from_annotation, to_annotation, psm_protein_id, psm_hash, species,
             decoy_annotation, database_v):
    '''
    :param from_annotation: supplied annotation (i.e. swissprot)
    :param to_annotation: target annotation (i.e. ENSEMBL)
    :param psm_protein_id: list of protein IDS
    :param psm_hash: dictionairy of protein IDs mapped onto ENSEMBL
    :param species: species name
    :param decoy_annotation: list of decoy annotations
    :param database_v: database version
    :return: dictionairy of protein ID coversion
    '''
    #psm_hash.reset()
    new_psm_protein_id = []
    psm_protein_id = list(set(psm_protein_id))
    print "Commencing ID conversion from " + str(
        from_annotation) + " to " + str(to_annotation)
    map = {}

    # Convert RefSeq to ENSEMBL ID's
    if to_annotation == "ENSEMBL":
        if from_annotation == 'REFSEQ':
            temp_map = {}
            refseq_mrna = []
            refseq_pred_mrna = []
            refseq_ncrna = []
            refseq_pred_ncrna = []
            refseq_prot = []
            refseq_pred_prot = []

            for id in psm_protein_id:
                if "NM_" in id:
                    refseq_mrna.append(id)
                if "XM_" in id:
                    refseq_pred_mrna.append(id)
                if "NR_" in id:
                    refseq_ncrna.append(id)
                if "XR_" in id:
                    refseq_pred_ncrna.append(id)
                if "NP_" in id:
                    refseq_prot.append(id)
                if "XP_" in id:
                    refseq_pred_prot.append(id)
            mapped_id = []
            if refseq_mrna != []:
                print "Identified refseq mRNA ID's, converting:"
                mapped_id.append(
                    proBAM_biomart.id_map_ensembl("refseq_mrna", database_v,
                                                  species, refseq_mrna))
            if refseq_pred_mrna != []:
                print "Identified predicted refseq mRNA ID's, converting:"
                mapped_id.append(
                    proBAM_biomart.id_map_ensembl("refseq_mrna_predicted",
                                                  database_v, species,
                                                  refseq_pred_mrna))
            if refseq_ncrna != []:
                print "Identified refseq ncRNA ID's, converting:"
                mapped_id.append(
                    proBAM_biomart.id_map_ensembl("refseq_ncrna", database_v,
                                                  species, refseq_ncrna))
            if refseq_pred_ncrna != []:
                print "Identified refseq predicted ncRNA ID's, converting:"
                mapped_id.append(
                    proBAM_biomart.id_map_ensembl("refseq_ncrna_predicted",
                                                  database_v, species,
                                                  refseq_pred_ncrna))
            if refseq_prot != []:
                print "Identified refseq protein ID's, converting:"
                mapped_id.append(
                    proBAM_biomart.id_map_ensembl("refseq_peptide", database_v,
                                                  species, refseq_prot))
            if refseq_pred_prot != []:
                print "Identified refseq predicted protein ID's, converting:"
                mapped_id.append(
                    proBAM_biomart.id_map_ensembl("refseq_peptide_predicted",
                                                  database_v, species,
                                                  refseq_pred_prot))
            for row in mapped_id:
                if row[0] != "":
                    if row[2] in temp_map:
                        temp_map[row[2]].append(row)
                    else:
                        temp_map[row[2]] = [row]
            for key in temp_map:
                map[key] = temp_map[key][0]

        #Convert UNIPROT accession ID's to ENSEMBL
        if from_annotation == "UNIPROT":
            temp_map = {}
            #map uniprot/swissprot
            mapped_id = proBAM_biomart.id_map_ensembl("uniprot_swissprot",
                                                      database_v, species,
                                                      psm_protein_id)
            if len(mapped_id) > 1:
                for row in mapped_id:
                    if row[0] != "":
                        if row[2] in temp_map:
                            temp_map[row[2]].append(row)
                        else:
                            temp_map[row[2]] = [row]
                for key in temp_map:
                    map[key] = temp_map[key][0]

            #map remaining on uniprot/trembl
            unmapped_id_for_trmbl = []
            for id in psm_protein_id:
                if id not in map:
                    unmapped_id_for_trmbl.append(id)
            if unmapped_id_for_trmbl != []:
                mapped_id = proBAM_biomart.id_map_ensembl(
                    "uniprot_sptrembl", database_v, species,
                    unmapped_id_for_trmbl)
                if len(mapped_id) > 1:
                    for row in mapped_id:
                        if row[0] != "":
                            if row[2] in temp_map:
                                temp_map[row[2]].append(row)
                            else:
                                temp_map[row[2]] = [row]
                    for key in temp_map:
                        map[key] = temp_map[key][0]

        #Convert UNIPROT ENTRIES to ENSEMBL ID's
        if from_annotation == "UNIPROT_ENTRY":
            u = UniProt()
            to_translate = []
            for id in psm_protein_id:
                if re.findall(
                        "[A-Z0-9]{1,10}" + "_" +
                        _get_uniprot_postfix_(species), id) != []:
                    to_translate.append(
                        re.findall(
                            "[A-Z0-9]{1,10}" + "_" +
                            _get_uniprot_postfix_(species), id)[0])
            to_translate = list(set(to_translate))
            if len(to_translate) > 1000:
                nr_chunks = len(to_translate) / 1000
                chunks = chunkIt(to_translate, nr_chunks)
            else:
                chunks = [to_translate]
            # map uniprot_entries to up-to-date accession
            accession_update_hash = {}
            for chunk in chunks:
                chunk_accession_update_hash = u.mapping('ACC+ID', 'ACC', chunk)
                accession_update_hash.update(chunk_accession_update_hash)

            tot_count = 0
            for entry in to_translate:
                if entry not in accession_update_hash:
                    tot_count += 1

            print "\tFound "+str(tot_count)+" depreciated UniProt Entries (UniProt Entries are unstable)\n" \
                  "\tAttempting to map depreciated entries unto new entry ID's..."
            count = 0
            found = 0
            tracker = 9.99
            print "\t",
            for entry in to_translate:
                if entry not in accession_update_hash:
                    count += 1
                    if (float(count * 100) / float(tot_count)) > tracker:
                        print str(int(tracker + 0.01)) + "% ",
                        tracker = tracker + 10
                    try:
                        accession_update_hash[entry] = [
                            get_updated_entry_name(entry)
                        ]
                        found += 1
                    except urllib2.HTTPError:
                        pass
            print ' '
            to_translate = []
            print "\tRetrieved " + str(found) + " of the " + str(
                tot_count) + " depreciated UniProt Entries"

            #map accession to Ensembl transcript
            for value in accession_update_hash.values():
                to_translate += value
            to_translate = list(set(to_translate))
            if len(to_translate) > 1000:
                nr_chunks = len(to_translate) / 1000
                chunks = chunkIt(to_translate, nr_chunks)
            else:
                chunks = [to_translate]
            # map uniprot_entries to up-to-date accession
            temp_map = {}
            for chunk in chunks:
                # remap Ensembl transcript to uniprot entries
                chunk_temp_map = u.mapping('ACC', 'ENSEMBL_TRS_ID', chunk)
                temp_map.update(chunk_temp_map)
            for accession in accession_update_hash:
                for i in accession_update_hash[accession]:
                    if i in temp_map:
                        if accession in map:
                            map[accession] += temp_map[i]
                        else:
                            map[accession] = temp_map[i]
        #Get ENSEMBL ID's in correct form
        if from_annotation == "ENSEMBL":
            for id in psm_protein_id:
                map[id] = [id]
    return map
# Last modified: 17 December 2013
#
# Program that retrieves the UniRef entry corresponding to a GenBank accession number

from bioservices import UniProt
import sys
import os
from BeautifulSoup import BeautifulSoup

# Directory receiving one XML result file per contig.
UNIREF_PATH = "../uniref/"

u = UniProt()
# uniref_mapping.txt gets one tab-separated line per contig telling whether
# the UniProt search returned something.
with open("uniref_mapping.txt", "w") as r:
    with open("resultatNBCI.txt", "r") as f:
        for line in f:
            # Fields are separated by "-|-": the first one is the contig,
            # the third one the accession to look up.
            temp = line.split("-|-")
            print("Traitement du contig " + temp[0])
            accession = temp[2].strip(" \t\n\r")
            # NOTE(review): the result of this mapping call is discarded;
            # only the search below is used -- confirm whether it is needed.
            u.mapping(fr='EMBL_ID', to='NF100', query=accession)
            res = u.search(accession, format='xml', limit=10)
            # Compare by value: "is" tests object identity and only matches
            # an empty string by interpreter accident.
            if res == '':
                r.write(temp[0] + "\tNone\n")
                print("aucun résultat pour ce contig")
            else:
                contig = temp[0].strip(" \t\n\r")
                with open(UNIREF_PATH+"result"+contig+".xml", "w") as xml:
                    xml.write(res)
                #xml = BeautifulSoup(res)
                r.write(contig + "\t Result\n")