def kegg_to_symbol_through_uniprot(unknown_genes):
    """Map KEGG gene identifiers to gene names by way of UniProt.

    Each KEGG identifier is first mapped to UniProt accessions
    (``KEGG_ID`` -> ``ACC``).  Every accession is then looked up and, when
    the returned entry id matches the accession and the entry is flagged
    ``reviewed``, its preferred gene name is recorded for the KEGG id.

    :param unknown_genes: iterable of KEGG gene identifiers.
    :return: dict mapping KEGG identifier -> preferred gene name.  KEGG ids
        without any UniProt mapping are only reported on stdout, they are
        not part of the returned dictionary.
    """
    # create string to call uniprot for mapping
    search_string = '\t'.join(unknown_genes)
    kegg_to_gene_name = dict()
    missing = set()
    uniprot = UniProt(verbose=True)
    # This is where it gets tricky. Checking to see if there is a uniprot
    # mapping for the species, if not, trying from KEGG side. Sometimes
    # kegg links to a different uniprot, or uniprot links to a diff kegg.
    uni_dict = dict(uniprot.mapping("KEGG_ID", "ACC", query=search_string))
    for i in unknown_genes:
        if i not in uni_dict:
            missing.add(i)
            continue
        for n in uni_dict[i]:
            x = uniprot.search("accession:{}".format(n),
                               columns='genes(PREFERRED),reviewed,id',
                               limit=1)
            # The answer is expected to be a header line followed by exactly
            # one tab separated data line.  An empty or otherwise unexpected
            # answer (no hit, non-text reply) must not abort the whole batch,
            # so the accession is skipped and reported instead.
            try:
                _, data = x.rstrip('\n').split('\n')
                name, review, entry = data.split('\t')
            except (AttributeError, ValueError):
                print(i, n, x, "could not be parsed")
                continue
            if n != entry:
                print(i, n, entry, x, "dont match")
            elif review == 'reviewed':
                kegg_to_gene_name[i] = name
    print("{} mappings not found from kegg to"
          " gene name".format(len(missing)))
    print(missing)
    return kegg_to_gene_name
def kegg_to_symbol_through_uniprot(unknown_genes):
    """Translate KEGG gene ids into gene names using UniProt as a bridge.

    The KEGG ids are mapped to UniProt accessions in one request
    (``KEGG_ID`` -> ``ACC``); each accession is then searched individually.
    A gene name is kept only when the entry id returned by the search
    equals the accession that was asked for and the entry is ``reviewed``.

    :param unknown_genes: iterable of KEGG gene identifiers.
    :return: dict of KEGG identifier -> preferred gene name.  Identifiers
        that have no UniProt mapping are printed, not returned.
    """
    # create string to call uniprot for mapping
    search_string = '\t'.join(unknown_genes)
    kegg_to_gene_name = dict()
    missing = set()
    uniprot = UniProt(verbose=True)
    # This is where it gets tricky. Checking to see if there is a uniprot
    # mapping for the species, if not, trying from KEGG side. Sometimes
    # kegg links to a different uniprot, or uniprot links to a diff kegg.
    uni_dict = dict(uniprot.mapping("KEGG_ID", "ACC", query=search_string))
    for i in unknown_genes:
        if i not in uni_dict:
            missing.add(i)
            continue
        for n in uni_dict[i]:
            x = uniprot.search("accession:{}".format(n),
                               columns='genes(PREFERRED),reviewed,id',
                               limit=1)
            # Expected shape: "<header>\n<name>\t<reviewed>\t<id>\n".
            # Anything else (empty answer, several lines, non-text reply)
            # is reported and skipped rather than raising out of the loop.
            try:
                lines = x.rstrip('\n').split('\n')
            except AttributeError:
                lines = []
            fields = lines[1].split('\t') if len(lines) == 2 else []
            if len(fields) != 3:
                print(i, n, x, "could not be parsed")
                continue
            name, review, entry = fields
            if n != entry:
                print(i, n, entry, x, "dont match")
            elif review == 'reviewed':
                kegg_to_gene_name[i] = name
    print("{} mappings not found from kegg to"
          " gene name".format(len(missing)))
    print(missing)
    return kegg_to_gene_name
def test_extract_protein_interactions_kgml(self, kgml_file, expected_no_rel):
    """Check the number of relations extracted from a KGML fixture.

    :param kgml_file: fixture file name, resolved relative to the directory
        that contains this test module.
    :param expected_no_rel: number of relations the extractor is expected
        to return for that fixture.
    """
    # Arrange
    sut = KeggProteinInteractionsExtractor()
    fixture_dir = os.path.dirname(os.path.realpath(__file__))
    fixture_path = os.path.join(fixture_dir, kgml_file)
    with open(fixture_path, 'r') as handle:
        kgml_string = handle.read()

    # Mock Kegg ops
    kegg_stub = KEGG()
    sut.kegg = kegg_stub
    # No matter what the input is, return the ko numbers that map to
    # hsa numbers
    ko_to_hsa = ("ko:K00922 hsa:5293\n"
                 "ko:K00922 hsa:5291\n"
                 "ko:K02649 hsa:5295")
    kegg_stub.link = MagicMock(return_value=ko_to_hsa)
    # No matter what the input is, return the hsa numbers that map to
    # uniprot numbers
    hsa_to_uniprot = {"hsa:5293": "up:B0LPE5"}
    kegg_stub.conv = MagicMock(return_value=hsa_to_uniprot)

    # Mock Uni Prot
    uniprot_stub = UniProt()
    sut.uniprot = uniprot_stub
    accession_to_genes = {"B0LPE5": ["gene1", "gene2"]}
    uniprot_stub.mapping = MagicMock(return_value=accession_to_genes)

    # Act
    relations = sut.extract_protein_interactions_kgml(kgml_string)

    # Assert
    self.assertEqual(expected_no_rel, len(relations))
def find_gene(prot_id):
    """Look up the gene names attached to an EMBL protein identifier.

    The identifier is mapped to UniProt accessions (``EMBL`` -> ``ACC``) and
    the first accession found is searched in the ``uniparc`` database,
    asking for the ``genes`` column only.

    :param prot_id: identifier handed to the EMBL -> ACC mapping.
    :return: ``(key, genes)`` where ``key`` is the mapping key the accession
        was found under and ``genes`` is a list of the distinct non-empty
        ``;``-separated tokens of the answer (in no particular order), or
        the string ``'none'`` when there is none.  ``(prot_id, 'none')`` is
        returned when the mapping yields no accession at all.
    """
    client = UniProt(verbose=False)
    accessions_by_key = client.mapping("EMBL", "ACC", query=prot_id)
    for key, values in accessions_by_key.items():
        for value in values:
            answer = client.search(value, frmt="tab", limit=3,
                                   columns="genes", database='uniparc')
            # NOTE(review): the first 11 characters are dropped, presumably
            # the "Gene names\n" header line of the tabular answer - confirm.
            tokens = set(answer[11:].split(';'))
            genes = [tok for tok in tokens if tok and tok != '\n']
            if not genes:
                genes = 'none'
            # only the first accession is ever looked at
            return key, genes
    return prot_id, 'none'
class PSICQUIC(REST):
    """Interface to the `PSICQUIC <http://code.google.com/p/psicquic/>`_ service

    There are 2 interfaces to the PSICQUIC service (REST and WSDL) but we
    use the REST one only.

    This service provides a common interface to more than 25 other services
    related to proteins, so we do not detail all the possibilities of this
    service. Here is an example that consists of looking for interactors of
    the protein ZAP70 within the IntAct database::

        >>> from bioservices import *
        >>> s = PSICQUIC()
        >>> res = s.query("intact", "zap70")
        >>> len(res) # there are 11 interactions found
        11
        >>> for x in res[1]:
        ...     print(x)
        uniprotkb:O95169
        uniprotkb:P43403
        intact:EBI-716238
        intact:EBI-1211276
        psi-mi:ndub8_human(display_long)|uniprotkb:NADH-ubiquinone oxidoreductase ASHI
        .
        .

    Here we have a list of entries. There are 15 of them (depending on the
    *output* parameter). The meaning of the entries is described on the
    PSICQUIC website: https://code.google.com/p/psicquic/wiki/MITAB25Format .
    In short:

    #. Unique identifier for interactor A
    #. Unique identifier for interactor B.
    #. Alternative identifier for interactor A, for example the official gene
    #. Alternative identifier for interactor B.
    #. Aliases for A, separated by "|"
    #. Aliases for B.
    #. Interaction detection methods, taken from the corresponding PSI-MI
    #. First author surname(s) of the publication(s)
    #. Identifier of the publication
    #. NCBI Taxonomy identifier for interactor A.
    #. NCBI Taxonomy identifier for interactor B.
    #. Interaction types,
    #. Source databases and identifiers,
    #. Interaction identifier(s)
    #. Confidence score. Denoted as scoreType:value.

    Another example with the reactome database::

        res = s.query("reactome", "Q9Y266")

    .. warning:: PSICQUIC gives access to 25 other services. We cannot
        create a dedicated parsing for all of them. So, the :meth:`query`
        method returns the raw data. Additional classes may provide
        dedicated parsing in the future.

    .. seealso:: :class:`bioservices.biogrid.BioGRID`
    """

    _formats = ["tab25", "tab26", "tab27", "xml25", "count", "biopax",
                "xgmml", "rdf-xml", "rdf-xml-abbrev", "rdf-n3", "rdf-turtle"]

    # Database prefix found in MITAB identifiers -> name of that database in
    # the UniProt mapping tool.  A value of None means "known database whose
    # IDs are kept unchanged".
    # note the typo in "genbank indentifier" from the bind DB
    _mapping_uniprot = {
        "genbank indentifier": "P_GI",
        'entrezgene/locuslink': "P_ENTREZGENEID",
        'uniprotkb': "ACC+ID",
        'rcsb pdb': "PDB_ID",
        'ensembl': "ENSEMBL_ID",
        'refseq': "P_REFSEQ_AC",
        'hgnc': 'HGNC_ID',
        "kegg": "KEGG_ID",
        "entrez gene/locuslink": "P_ENTREZGENEID",
        "ddbj/embl/genbank": "EMBL_ID",
        "dip": "DIP_ID",
        "ensemblgenomes": "ENSEMBLGENOME_ID",
        "omim": "MIM_ID",
        "chebi": None,
        # "chembl" used to be listed twice ("CHEMBL_ID", then None); the
        # later entry silently won, so None is the value kept here.
        "chembl": None,
        # "intact": None
    }
    # unknown: hprd, omim, bind, bind complexid, mdl,

    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: print informative messages

        .. doctest::

            >>> from bioservices import PSICQUIC
            >>> s = PSICQUIC()

        """
        urlStr = 'http://www.ebi.ac.uk/Tools/webservices/psicquic'
        super(PSICQUIC, self).__init__("PSICQUIC", verbose=verbose, url=urlStr)
        self._registry = None

        try:
            self.uniprot = UniProt(verbose=False)
        except Exception:
            # Best effort: everything except the ID conversion methods works
            # without UniProt, so only warn and leave the attribute defined.
            self.uniprot = None
            self.logging.warning("UniProt service could not be initialised")
        self.buffer = {}

    def _get_formats(self):
        return PSICQUIC._formats
    formats = property(_get_formats, doc="Returns the possible output formats")

    def _get_active_db(self):
        pairs = zip(self.registry_names, self.registry_actives)
        return [name.lower() for name, active in pairs if active == "true"]
    activeDBs = property(_get_active_db, doc="returns the active DBs only")

    def read_registry(self):
        """Reads and returns the active registry"""
        url = 'registry/registry?action=ACTIVE&format=txt'
        res = self.http_get(url, frmt='txt')
        return res.split()

    def print_status(self):
        """Prints the services that are available

        :return: Nothing

        The output is tabulated. The columns are:

        * names
        * active
        * count
        * version
        * rest URL
        * soap URL
        * rest example
        * restricted

        .. seealso:: If you want the data into lists, see all attributes
            starting with registry such as :meth:`registry_names`
        """
        # Everything printed here comes from the (cached) registry
        # properties, so no request of its own is needed.
        rows = zip(self.registry_names, self.registry_actives,
                   self.registry_counts, self.registry_versions,
                   self.registry_resturls, self.registry_soapurls,
                   self.registry_restexamples, self.registry_restricted)
        for row in rows:
            print("%s\t %s\t %s\t %s\t %s %s %s %s\n" % tuple(row))

    # todo a property for the version of PISCQUIC

    def _get_registry(self):
        if self._registry is None:
            url = 'registry/registry?action=STATUS&format=xml'
            res = self.http_get(url, frmt="xml")
            res = self.easyXML(res)
            self._registry = res
        return self._registry
    registry = property(_get_registry, doc="returns the registry of psicquic")

    def _get_registry_field(self, tag):
        """Return the text of the first *tag* element of every service."""
        return [x.findAll(tag)[0].text
                for x in self.registry.findAll("service")]

    def _get_registry_names(self):
        return self._get_registry_field('name')
    registry_names = property(_get_registry_names,
                              doc="returns all services available (names)")

    def _get_registry_restricted(self):
        return self._get_registry_field('restricted')
    registry_restricted = property(_get_registry_restricted,
                                   doc="returns restricted status of services")

    def _get_registry_resturl(self):
        return self._get_registry_field('resturl')
    registry_resturls = property(_get_registry_resturl,
                                 doc="returns URL of REST services")

    def _get_registry_restex(self):
        return self._get_registry_field('restexample')
    registry_restexamples = property(_get_registry_restex,
                                     doc="retuns REST example for each service")

    def _get_registry_soapurl(self):
        return self._get_registry_field('soapurl')
    registry_soapurls = property(_get_registry_soapurl,
                                 doc="returns URL of WSDL service")

    def _get_registry_active(self):
        return self._get_registry_field('active')
    registry_actives = property(_get_registry_active,
                                doc="returns active state of each service")

    def _get_registry_count(self):
        return self._get_registry_field('count')
    registry_counts = property(_get_registry_count,
                               doc="returns number of entries in each service")

    def _get_registry_version(self):
        # The version element is optional: None is reported when a service
        # does not have one.
        versions = []
        for service in self.registry.findAll("service"):
            found = service.findAll("version")
            versions.append(found[0].text if found else None)
        return versions
    registry_versions = property(_get_registry_version,
                                 doc="returns version of each service")

    def query(self, service, query, output="tab25", version="current",
              firstResult=None, maxResults=None):
        """Send a query to a specific database

        :param str service: a registered service. See :attr:`registry_names`.
        :param str query: a valid query. Can be `*` or a protein name.
        :param str output: a valid format. See s._formats
        :param version: accepted for compatibility; not used by this method.
        :param firstResult: sent as the ``firstResult`` request parameter
            when not None.
        :param maxResults: sent as the ``maxResults`` request parameter
            when not None.
        :return: for the tab formats, a list of rows, each row being the list
            of tab separated fields; for the XML formats, whatever the XML
            request returns; otherwise the list of lines of the answer.
        :raises ValueError: if *service* is not an active, registered service

        ::

            s.query("intact", "brca2", "tab27")
            s.query("intact", "zap70", "xml25")
            s.query("matrixdb", "*", "xml25")

        This is the programmatic approach to this website:

        http://www.ebi.ac.uk/Tools/webservices/psicquic/view/main.xhtml

        Another example consists in accessing the *string* database for
        fetching protein-protein interaction data of a particular model
        organism. Here we restrict the query to 100 results::

            s.query("string", "species:10090", firstResult=0, maxResults=100, output="tab25")

            # spaces are automatically converted
            s.query("biogrid", "ZAP70 AND species:9606")

        .. warning:: AND must be in big caps. Some databases are more
            permissive than others (e.g., intact accepts "and"). species must
            be a valid ID number. Again, some DBs are more permissive and may
            accept the name (e.g., human)

        To obtain the number of interactions in intact for the human
        species::

            >>> len(p.query("intact", "species:9606"))

        """
        if service not in self.activeDBs:
            raise ValueError("database %s not in active databases" % service)

        params = {}
        if output is not None:
            self.devtools.check_param_in_list(output, self.formats)
            params['format'] = output
        else:
            output = "none"

        names = [x.lower() for x in self.registry_names]
        try:
            index = names.index(service)
        except ValueError:
            msg = ("The service you gave (%s) is not registered. "
                   "See self.registry_names" % service)
            self.logging.error(msg)
            raise ValueError(msg)

        # get the base url according to the service requested
        resturl = self.registry_resturls[index]

        if firstResult is not None:
            params['firstResult'] = firstResult
        if maxResults is not None:
            params['maxResults'] = maxResults

        url = resturl + 'query/' + query

        if "xml" in output:
            res = self.http_get(url, frmt="xml", params=params)
        else:
            res = self.http_get(url, frmt="txt", params=params)
            res = res.strip().split("\n")

        if output.startswith("tab"):
            res = self._convert_tab2dict(res)

        return res

    def _convert_tab2dict(self, data):
        """Split every line of *data* on tabs.

        https://code.google.com/p/psicquic/wiki/MITAB26Format
        """
        return [line.split("\t") for line in data]

    def queryAll(self, query, databases=None, output="tab25",
                 version="current", firstResult=None, maxResults=None):
        """Same as query but runs on all active database

        :param list databases: database to query. Queries all active DB if
            not provided
        :return: dictionary where keys correspond to databases and values to
            the output of the query.
        :raises ValueError: if one of *databases* is not an active database

        ::

            res = s.queryAll("ZAP70 AND species:9606")
        """
        import copy

        results = {}
        # activeDBs is rebuilt from the registry on every access: read it once
        active = self.activeDBs
        if databases is None:
            databases = [x.lower() for x in active]

        active = set(active)
        for x in databases:
            if x not in active:
                raise ValueError("database %s not in active databases" % x)

        for name in databases:
            self.logging.warning("Querying %s" % name)
            res = self.query(name, query, output=output, version=version,
                             firstResult=firstResult, maxResults=maxResults)
            # output may legitimately be None (see query)
            if output is not None and output.startswith("tab25"):
                # drop the rows that come from empty lines
                results[name] = [x for x in res if x != [""]]
            else:
                results[name] = copy.copy(res)
        for name in databases:
            self.logging.info("Found %s in %s" % (len(results[name]), name))
        return results

    def getInteractionCounter(self, query):
        """Returns a dictionary with database as key and results as values

        :param str query: a valid query
        :return: a dictionary which key as database and value as number of
            entries

        Consider only the active database.
        """
        counts = {}
        for name in self.activeDBs:
            counts[str(name)] = int(self.query(name, query, output="count")[0])
        return counts

    def getName(self, data):
        """Return the first and the second field of every entry as 2 lists"""
        idsA = [x[0] for x in data]
        idsB = [x[1] for x in data]
        return idsA, idsB

    def _simplify_id(self, entry):
        """Reduce one ``DB1:ID1|DB2:ID2`` field to a single ``db:id`` string.

        Returns the first pair whose database is a key of
        ``_mapping_uniprot``; ``"?" + first pair`` when no database is known;
        ``"??:" + entry`` when one of the items has no ``:`` separator.
        """
        pairs = []
        for item in entry.split("|"):
            # Split on the first ":" only so that identifiers that contain a
            # colon themselves (e.g. chebi:CHEBI:29036) are kept whole.
            db, sep, identifier = item.partition(":")
            if not sep:
                self.logging.info("Could not extract name from %s" % entry)
                # we add a : so that we are sure that a split(":") will work
                return "??:" + entry
            pairs.append((db, identifier))

        for db, identifier in pairs:
            if db in self._mapping_uniprot:
                return db + ":" + identifier

        self.logging.debug("none of the DB for this entry (%s) are available" % (entry))
        return "?" + pairs[0][0] + ":" + pairs[0][1]

    def knownName(self, data):
        """Scan all entries (MITAB) and returns simplified version

        Each item in the input list is a mitab entry. The output is made of
        2 lists corresponding to interactor A and B found in the mitab
        entries.

        Elements in the input list take the following forms::

            DB1:ID1|DB2:ID2
            DB3:ID3

        The | sign separates equivalent IDs from different databases.

        We want to keep only one. The first known database is kept. If in the
        list of DB:ID pairs no known database is found, then we keep the
        first one whatsoever, prefixed with a "?". Entries that cannot be
        parsed are returned as "??:" followed by the entry.

        Known databases are those available in the uniprot mapping tools.

        chembl and chebi IDs are kept unchanged.
        """
        self.logging.info("converting data into known names")

        # Remove the " character that can be found in a few cases (e.g,
        # chebi:"CHEBI:29036") and the trailing | seen in mint
        # (uniprotkb:P17844|).
        idsA = [x[0].replace("\"", "").strip("|") for x in data]
        idsB = [x[1].replace("\"", "").strip("|") for x in data]

        idsA = [self._simplify_id(entry) for entry in idsA]
        idsB = [self._simplify_id(entry) for entry in idsB]

        countA = len([x for x in idsA if x.startswith("?")])
        countB = len([x for x in idsB if x.startswith("?")])
        if countA+countB > 0:
            self.logging.warning("%s ids out of %s were not identified" % (countA+countB, len(idsA)*2))
            print(set([x.split(":")[0] for x in idsA if x.startswith("?")]))
            print(set([x.split(":")[0] for x in idsB if x.startswith("?")]))
        self.logging.info("knownName done")
        return idsA, idsB

    def preCleaning(self, data):
        """remove entries where IdA or IdB is set to "-" """
        return [x for x in data if x[0] != "-" and x[1] != "-"]

    def postCleaningAll(self, data, keep_only="HUMAN", flatten=True, verbose=True):
        """Apply :meth:`postCleaning` to every database of *data*

        :param dict data: database name -> list of converted entries
        :param str keep_only: forwarded to :meth:`postCleaning`
        :param bool flatten: return one flat list instead of a dictionary
        :param bool verbose: forwarded to :meth:`postCleaning`
        :return: the cleaned entries; databases left empty are dropped

        even more cleaning by ignoring score, db and interaction
        len(set([(x[0],x[1]) for x in retnew]))
        """
        results = {}
        for k in data.keys():
            self.logging.info("Post cleaning %s" % k)
            # keep_only used to be hard-coded to "HUMAN" here, which made the
            # parameter of this method a no-op
            ret = self.postCleaning(data[k], keep_only=keep_only, verbose=verbose)
            if len(ret):
                results[k] = ret
        if flatten:
            results = [x for k in results.keys() for x in results[k]]
        return results

    def postCleaning(self, data, keep_only="HUMAN",
                     remove_db=("chebi", "chembl"), keep_self_loop=False,
                     verbose=True):
        """Remove entries with a None and keep only those with the keep pattern

        :param data: list of hashable entries whose 2 first items are the
            interactors
        :param str keep_only: substring that both interactors must contain
        :param remove_db: interactors starting with one of these prefixes are
            removed
        :param bool keep_self_loop: keep entries where A and B are equal
        :param bool verbose: print the number of entries after each step
        :return: list of the remaining distinct entries (unordered)
        """
        if verbose:print("Before removing anything: ", len(data))

        data = [x for x in data if x[0] is not None and x[1] is not None]
        if verbose:print("After removing the None: ", len(data))

        data = [x for x in data if not x[0].startswith("!") and not x[1].startswith("!")]
        if verbose:print("After removing the !: ", len(data))

        for db in remove_db:
            data = [x for x in data if not x[0].startswith(db)]
            data = [x for x in data if not x[1].startswith(db)]
            if verbose:print("After removing entries that match %s : " % db, len(data))

        data = [x for x in data if keep_only in x[0] and keep_only in x[1]]
        if verbose:print("After removing entries that don't match %s : " % keep_only, len(data))

        if keep_self_loop is False:
            data = [x for x in data if x[0] != x[1]]
            if verbose:print("After removing self loop : ", len(data))

        data = list(set(data))
        if verbose:print("After removing identical entries", len(data))

        return data

    def convertAll(self, data):
        """Apply :meth:`convert` to every database of *data* (a dictionary)"""
        results = {}
        for k in data.keys():
            self.logging.info("Analysing %s" % k)
            results[k] = self.convert(data[k], db=k)
        return results

    def convert(self, data, db=None):
        """Convert MITAB entries into simple tuples

        :param data: list of MITAB entries (lists of fields)
        :param db: label stored as last item of every tuple
        :return: list of ``(A, B, score, interaction, ref, db)`` where A and
            B come from :meth:`mappingOneDB` and the 3 other values are the
            fields 14, 11 and 8 of the entry ("?" when the entry is too
            short).
        """
        self.logging.debug("converting the database %s" % db)
        idsA, idsB = self.knownName(data)
        mapping = self.mappingOneDB(data)

        def field(entry, index):
            # short entries do not have all the fields
            try:
                return entry[index]
            except IndexError:
                return "?"

        results = []
        for i, entry in enumerate(data):
            x = idsA[i].split(":", 1)[1]
            y = idsB[i].split(":", 1)[1]
            results.append((mapping[x], mapping[y], field(entry, 14),
                            field(entry, 11), field(entry, 8), db))
        return results

    def _flush_uniprot_query(self, db, identifiers, mapping, counter, N):
        """Resolve *identifiers* of database *db* and store them in *mapping*

        Identifiers unknown to uniprot are stored as ``"!" + db:id``; those
        of databases that have no uniprot equivalent are stored as ``db:id``.
        """
        DBname = self._mapping_uniprot[db]
        if DBname is None:
            for x in identifiers:
                mapping[x] = db + ":" + x
            return

        self.logging.warning("Request sent to uniprot for %s database (%s/%s)" % (DBname, counter, N))
        res = self.uniprot.mapping(fr=DBname, to="ID", query=" ".join(identifiers))
        for x in identifiers:
            found = res[x] if x in res else None
            if not found:
                # was not found
                mapping[x] = "!" + db + ":" + x
                continue
            if len(found) > 1:
                self.logging.warning("psicquic mapping found more than 1 id. keep first one")
            mapping[x] = found[0]

    def mappingOneDB(self, data):
        """Map the identifiers of the interactors A and B onto uniprot IDs

        :param data: list of MITAB entries (lists of fields)
        :return: dictionary whose keys are the identifiers (without database
            prefix) returned by :meth:`knownName` and whose values are

            * the first uniprot ID found for that identifier,
            * ``"!db:id"`` if uniprot does not know the identifier,
            * ``"db:id"`` for databases without uniprot equivalent (None in
              ``_mapping_uniprot``),
            * the ``"?..."`` string of :meth:`knownName` for unknown
              databases.

        Requests to uniprot are grouped by database and sent once more than
        2000 identifiers are pending, or when the last entry is reached.
        """
        self.logging.debug("converting IDs with proper DB name (knownName function)")
        entriesA, entriesB = self.knownName(data)

        # entriesA and B contain a single identifier of the form db:id; the
        # db is known from _mapping_uniprot, otherwise it starts with "?".
        # Build the dictionary of pending queries from the known DBs only
        # (the unknown ones are never sent to uniprot).
        query = {}
        for entry in entriesA + entriesB:
            db = entry.split(":", 1)[0]
            if not db.startswith("?"):
                query[db] = set()

        # the data to store
        mapping = {}
        N = len(data)

        # scan all entries
        counter = 0
        for entryA, entryB in zip(entriesA, entriesB):
            counter += 1
            for entry in (entryA, entryB):
                # split on the first ":" only (as in convert): the identifier
                # itself may contain colons
                db, identifier = entry.split(":", 1)
                if identifier in mapping:
                    continue
                if db.startswith("?"):
                    mapping[identifier] = entry
                else:
                    query[db].add(identifier)

            for k in query:
                pending = query[k]
                if pending and (len(pending) > 2000 or counter == N):
                    self._flush_uniprot_query(k, list(pending), mapping, counter, N)
                    query[k] = set()

        # sanity check: everything must have been flushed at the last entry
        for k in query:
            assert len(query[k]) == 0
        return mapping
class PSICQUIC(RESTService): """Interface to the `PSICQUIC <http://code.google.com/p/psicquic/>`_ service There are 2 interfaces to the PSICQUIC service (REST and WSDL) but we used the REST only. This service provides a common interface to more than 25 other services related to protein. So, we won't detail all the possiblity of this service. Here is an example that consists of looking for interactors of the protein ZAP70 within the IntAct database:: >>> from bioservices import * >>> s = PSICQUIC() >>> res = s.query("intact", "zap70") >>> len(res) # there are 11 interactions found 11 >>> # Let us look at the second one in particular: >>> for x in res[1].split("\t"): ... print x uniprotkb:O95169 uniprotkb:P43403 intact:EBI-716238 intact:EBI-1211276 psi-mi:ndub8_human(display_long)|uniprotkb:NADH-ubiquinone oxidoreductase ASHI . . Here we have a list of entries. There are 15 of them (depending on the *output* parameter). The meaning of the entries is described on PSICQUIC website: https://code.google.com/p/psicquic/wiki/MITAB25Format . In short: #. Unique identifier for interactor A #. Unique identifier for interactor B. #. Alternative identifier for interactor A, for example the official gene #. Alternative identifier for interactor B. #. Aliases for A, separated by "| #. Aliases for B. #. Interaction detection methods, taken from the corresponding PSI-MI #. First author surname(s) of the publication(s) #. Identifier of the publication #. NCBI Taxonomy identifier for interactor A. #. NCBI Taxonomy identifier for interactor B. #. Interaction types, #. Source databases and identifiers, #. Interaction identifier(s) i #. Confidence score. Denoted as scoreType:value. Another example with reactome database:: res = s.query("reactome", "Q9Y266") .. warning:: PSICQUIC gives access to 25 other services. We cannot create a dedicated parsing for all of them. So, the ::`query` method returns the raw data. Addition class may provide dedicated parsing in the future. .. 
seealso:: :class:`bioservices.biogrid.BioGRID` """ _formats = ["tab25", "tab26", "tab27", "xml25", "count", "biopax", "xgmml", "rdf-xml", "rdf-xml-abbrev", "rdf-n3", "rdf-turtle"] # note the typo in "genbank indentifier from bind DB _mapping_uniprot = {"genbank indentifier": "P_GI", 'entrezgene/locuslink':"P_ENTREZGENEID", 'uniprotkb': "ACC+ID", 'rcsb pdb':"PDB_ID", 'ensembl':"ENSEMBL_ID", 'refseq':"P_REFSEQ_AC", 'hgnc':'HGNC_ID', "kegg": "KEGG_ID", "entrez gene/locuslink": "P_ENTREZGENEID", "chembl": "CHEMBL_ID", "ddbj/embl/genbank": "EMBL_ID", "dip": "DIP_ID", "ensemblgenomes": "ENSEMBLGENOME_ID", "omim":"MIM_ID", "chebi": None, "chembl": None, # "intact": None } # unknown: hprd, omim, bind, bind complexid, mdl, def __init__(self, verbose=True): """.. rubric:: Constructor :param bool verbose: print informative messages .. doctest:: >>> from bioservices import PSICQUIC >>> s = PSICQUIC() """ urlStr = 'http://www.ebi.ac.uk/Tools/webservices/psicquic' super(PSICQUIC, self).__init__("PSICQUIC", verbose=verbose, url=urlStr) self._registry = None try: self.uniprot = UniProt(verbose=False) except: self.logging.warning("UniProt service could be be initialised") self.buffer = {} def _get_formats(self): return PSICQUIC._formats formats = property(_get_formats, doc="Returns the possible output formats") def _get_active_db(self): names = self.registry_names[:] actives = self.registry_actives[:] names = [x.lower() for x,y in zip(names, actives) if y=="true"] return names activeDBs = property(_get_active_db, doc="returns the active DBs only") def read_registry(self): """Reads and returns the active registry """ url = self.url + '/registry/registry?action=ACTIVE&format=txt' res = self.request(url, format='txt') return res.split() def print_status(self): """Prints the services that are available :return: Nothing The output is tabulated. The columns are: * names * active * count * version * rest URL * soap URL * rest example * restricted .. 
seealso:: If you want the data into lists, see all attributes starting with registry such as :meth:`registry_names` """ url = self.url + '/registry/registry?action=STATUS&format=xml' res = self.request(url) names = self.registry_names counts = self.registry_counts versions = self.registry_versions actives = self.registry_actives resturls = self.registry_resturls soapurls = self.registry_soapurls restexs = self.registry_restexamples restricted = self.registry_restricted N = len(names) indices = sorted(range(0,N), key=lambda k: names[k]) for i in range(0,N): print("%s\t %s\t %s\t %s\t %s %s %s %s\n" % (names[i], actives[i], counts[i], versions[i], resturls[i], soapurls[i], restexs[i], restricted[i])) # todo a property for the version of PISCQUIC def _get_registry(self): if self._registry == None: url = self.url + '/registry/registry?action=STATUS&format=xml' res = self.request(url, format="xml") self._registry = res return self._registry registry = property(_get_registry, doc="returns the registry of psicquic") def _get_registry_names(self): res = self.registry return [x.findAll('name')[0].text for x in res.findAll("service")] registry_names = property(_get_registry_names, doc="returns all services available (names)") def _get_registry_restricted(self): res = self.registry return [x.findAll('restricted')[0].text for x in res.findAll("service")] registry_restricted = property(_get_registry_restricted, doc="returns restricted status of services" ) def _get_registry_resturl(self): res = self.registry data = [x.findAll('resturl')[0].text for x in res.findAll("service")] return data registry_resturls = property(_get_registry_resturl, doc="returns URL of REST services") def _get_registry_restex(self): res = self.registry data = [x.findAll('restexample')[0].text for x in res.findAll("service")] return data registry_restexamples = property(_get_registry_restex, doc="retuns REST example for each service") def _get_registry_soapurl(self): res = self.registry return 
[x.findAll('soapurl')[0].text for x in res.findAll("service")] registry_soapurls = property(_get_registry_soapurl, doc="returns URL of WSDL service") def _get_registry_active(self): res = self.registry return [x.findAll('active')[0].text for x in res.findAll("service")] registry_actives = property(_get_registry_active, doc="returns active state of each service") def _get_registry_count(self): res = self.registry return [x.findAll('count')[0].text for x in res.findAll("service")] registry_counts = property(_get_registry_count, doc="returns number of entries in each service") def _get_registry_version(self): res = self.registry names = [x.findAll('name')[0].text for x in res.findAll("service")] N = len(names) version = [0] * N for i in range(0,N): x = res.findAll("service")[i] if x.findAll("version"): version[i] = x.findAll("version")[0].text else: version[i] = None return version registry_versions = property(_get_registry_version, doc="returns version of each service") def query(self, service, query, output="tab25", version="current", firstResult=None, maxResults=None): """Send a query to a specific database :param str service: a registered service. See :attr:`registry_names`. :param str query: a valid query. Can be `*` or a protein name. :param str output: a valid format. See s._formats :: s.query("intact", "brca2", "tab27") s.query("intact", "zap70", "xml25") s.query("matrixdb", "*", "xml25") This is the programmatic approach to this website: http://www.ebi.ac.uk/Tools/webservices/psicquic/view/main.xhtml Another example consist in accessing the *string* database for fetching protein-protein interaction data of a particular model organism. Here we restrict the query to 100 results:: s.query("string", "species:10090", firstResult=0, maxResults=100, output="tab25") # spaces are automatically converted s.query("biogrid", "ZAP70 AND species:9606") .. warning:: AND must be in big caps. Some database are ore permissive than other (e.g., intact accepts "and"). 
species must be a valid ID number. Again, some DB are more permissive and may accept the name (e.g., human) To obtain the number of interactions in intact for the human specy:: >>> len(p.query("intact", "species:9606")) """ if service not in self.activeDBs: raise ValueError("database %s not in active databases" % service) params = {} if output!=None: self.checkParam(output, self.formats) params['format'] = output else: output="none" names = [x.lower() for x in self.registry_names] try: index = names.index(service) except ValueError: print("The service you gave (%s) is not registered. See self.registery_names" % service) raise ValueError # get the base url according to the service requested resturl = self.registry_resturls[index] if firstResult != None: params['firstResult'] = firstResult if maxResults != None: params['maxResults'] = maxResults postData = self.urlencode(params) url = resturl + 'query/' + query.replace(" ", "%20") if params: url += "?" + postData if "xml" in output: res = self.request(url, format="xml", baseUrl=False) else: res = self.request(url, format="txt",baseUrl=False) res = res.strip().split("\n") if output.startswith("tab"): res = self._convert_tab2dict(res) return res def _convert_tab2dict(self, data): """ https://code.google.com/p/psicquic/wiki/MITAB26Format """ results = [] for line in data: results.append(line.split("\t")) return results def queryAll(self, query, databases=None, output="tab25", version="current", firstResult=None, maxResults=None): """Same as query but runs on all active database :param list databases: database to query. Queries all active DB if not provided :return: dictionary where keys correspond to databases and values to the output of the query. 
:: res = s.queryAll("ZAP70 AND species:9606") """ results = {} if databases == None: databases = [x.lower() for x in self.activeDBs] for x in databases: if x not in self.activeDBs: raise ValueError("database %s not in active databases" % x) for name in databases: self.logging.warning("Querying %s" % name), res = self.query(name, query, output=output, version=version, firstResult=firstResult, maxResults=maxResults) if output.startswith("tab25"): results[name] = [x for x in res if x!=[""]] else: import copy results[name] = copy.copy(res) for name in databases: self.logging.info("Found %s in %s" % (len(results[name]), name)) return results def getInteractionCounter(self, query): """Returns a dictionary with database as key and results as values :param str query: a valid query :return: a dictionary which key as database and value as number of entries Consider only the active database. """ # get the active names only activeDBs = self.activeDBs[:] res = [(str(name), int(self.query(name, query, output="count")[0])) for name in activeDBs] return dict(res) def getName(self, data): idsA = [x[0] for x in data] idsB = [x[1] for x in data] return idsA, idsB def knownName(self, data): """Scan all entries (MITAB) and returns simplified version Each item in the input list of mitab entry The output is made of 2 lists corresponding to interactor A and B found in the mitab entries. elements in the input list takes the following forms:: DB1:ID1|DB2:ID2 DB3:ID3 The | sign separates equivalent IDs from different databases. We want to keep only one. The first known databae is kept. If in the list of DB:ID pairs no known database is found, then we keep the first one whatsover. known databases are those available in the uniprot mapping tools. chembl and chebi IDs are kept unchanged. 
""" self.logging.info("converting data into known names") idsA = [x[0].replace("\"","") for x in data] idsB = [x[1].replace("\"", "") for x in data] # extract the first and second ID but let us check if it is part of a # known uniprot mapping.Otherwise no conversion will be possible. # If so, we set the ID to "unknown" # remove the " character that can be found in a few cases (e.g, # chebi:"CHEBI:29036") #idsA = [x.replace("chebi:CHEBI:","chebi:") for x in idsA] #idsB = [x.replace("chebi:CHEBI:", "chebi:") for x in idsB] # special case: # in mint, there is an entry that ends with a | uniprotkb:P17844| idsA = [x.strip("|") for x in idsA] idsB = [x.strip("|") for x in idsB] # the first ID for i, entry in enumerate(idsA): try: dbs = [x.split(":")[0] for x in entry.split("|")] IDs = [x.split(":")[1] for x in entry.split("|")] valid_dbs = [(db,ID) for db,ID in zip(dbs,IDs) if db in self._mapping_uniprot.keys()] # search for an existing DB if len(valid_dbs)>=1: idsA[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1] else: self.logging.debug("none of the DB for this entry (%s) are available" % (entry)) idsA[i] = "?" + dbs[0] + ":" + IDs[0] except: self.logging.info("Could not extract name from %s" % entry) idsA[i] = "??:" + entry # we add a : so that we are sure that a split(":") will work # the second ID for i, entry in enumerate(idsB): try: dbs = [x.split(":")[0] for x in entry.split("|")] IDs = [x.split(":")[1] for x in entry.split("|")] valid_dbs = [(db,ID) for db,ID in zip(dbs,IDs) if db in self._mapping_uniprot.keys()] # search for an existing DB if len(valid_dbs)>=1: idsB[i] = valid_dbs[0][0] + ":" + valid_dbs[0][1] else: self.logging.debug("none of the DB (%s) for this entry are available" % (entry)) idsB[i] = "?" 
+ dbs[0] + ":" + IDs[0] except: self.logging.info("Could not extract name from %s" % entry) idsB[i] = "??:" + entry countA = len([x for x in idsA if x.startswith("?")]) countB = len([x for x in idsB if x.startswith("?")]) if countA+countB > 0: self.logging.warning("%s ids out of %s were not identified" % (countA+countB, len(idsA)*2)) print (set([x.split(":")[0] for x in idsA if x.startswith("?")])) print (set([x.split(":")[0] for x in idsB if x.startswith("?")])) self.logging.info("knownName done") return idsA, idsB def preCleaning(self, data): """remove entries ehre IdA or IdB is set to "-" """ ret = [x for x in data if x[0] !="-" and x[1]!="-"] return ret def postCleaningAll(self,data, keep_only="HUMAN", flatten=True, verbose=True): """ even more cleaing by ignoring score, db and interaction len(set([(x[0],x[1]) for x in retnew])) """ results = {} for k in data.keys(): self.logging.info("Post cleaning %s" % k) ret = self.postCleaning(data[k], keep_only="HUMAN", verbose=verbose) if len(ret): results[k] = ret if flatten: results = [x for k in results.keys() for x in results[k]] return results def postCleaning(self, data, keep_only="HUMAN", remove_db=["chebi","chembl"], keep_self_loop=False, verbose=True): """Remove entries with a None and keep only those with the keep pattern """ if verbose:print("Before removing anything: ", len(data)) data = [x for x in data if x[0]!=None and x[1]!=None] if verbose:print("After removing the None: ", len(data)) data = [x for x in data if x[0].startswith("!")==False and x[1].startswith("!")==False] if verbose:print("After removing the !: ", len(data)) for db in remove_db: data = [x for x in data if x[0].startswith(db)==False] data = [x for x in data if x[1].startswith(db)==False] if verbose:print("After removing entries that match %s : " % db, len(data)) data = [x for x in data if keep_only in x[0] and keep_only in x[1]] if verbose:print("After removing entries that don't match %s : " % keep_only, len(data)) if keep_self_loop == 
False: data = [x for x in data if x[0]!=x[1]] if verbose:print("After removing self loop : ", len(data)) data = list(set(data)) if verbose:print("After removing identical entries", len(data)) return data def convertAll(self, data): results = {} for k in data.keys(): self.logging.info("Analysing %s" % k) results[k] = self.convert(data[k], db=k) return results def convert(self, data, db=None): self.logging.debug("converting the database %s" % db) idsA, idsB = self.knownName(data) mapping = self.mappingOneDB(data) results = [] for i, entry in enumerate(data): x = idsA[i].split(":",1)[1] y = idsB[i].split(":",1)[1] xp = mapping[x] yp = mapping[y] try:ref = entry[8] except:ref="?" try:score = entry[14] except:score = "?" try:interaction = entry[11] except:interaction="?" results.append((xp, yp, score, interaction, ref, db)) return results def mappingOneDB(self, data): query = {} self.logging.debug("converting IDs with proper DB name (knownName function)") entriesA, entriesB = self.knownName(data) # idsA and B contains list of a single identifier of the form db:id # the db is known from _mapping.uniprot otherwise it is called "unknown" # get unique DBs to build the query dictionary dbsA = [x.split(":")[0] for x in entriesA] dbsB = [x.split(":")[0] for x in entriesB] for x in set(dbsA): query[x] = set() for x in set(dbsB): query[x] = set() for k in query.keys(): if k.startswith("?"): del query[k] # the data to store mapping = {} N = len(data) # scan all entries counter = 0 for entryA, entryB in zip(entriesA, entriesB): counter += 1 dbA, idA = entryA.split(":") try: dbB, idB = entryB.split(":") except: print entryB if idA not in mapping.keys(): if dbA.startswith("?"): mapping[idA] = entryA else: query[dbA].add(idA) if idB not in mapping.keys(): if dbB.startswith("?"): mapping[idB] = entryB else: query[dbB].add(idB) for k in query.keys(): if len(query[k])>2000 or counter == N: this_query = list(query[k]) DBname = self._mapping_uniprot[k] if DBname != None: 
self.logging.warning("Request sent to uniprot for %s database (%s/%s)" % (DBname, counter, N)) res = self.uniprot.mapping(fr=DBname, to="ID", query=" ".join(this_query)) for x in this_query: if x not in res: #was not found mapping[x] = "!" + k+":"+x else: # we should be here since the queries are populated # if not already in the mapping dictionary if x == mapping.keys(): raise ValueError(x) index = res.index(x) mapping[x] = res[index+1] else: for x in this_query: mapping[x] = k + ":" + x query[k] = set() for k in query.keys(): assert len(query[k])==0 return mapping
def main():
    """Annotate the VEP consequences of a VCF with UniProt and PDB identifiers.

    Command line arguments come from ``parse_args()``; ``args.vcf``,
    ``args.out`` and the optional ``args.log`` are used.  The VCF is read
    twice:

    1. the first pass collects, for every Ensembl protein ID found in field 26
       of the ``CSQ`` annotations, the UniProt entry and the PDB IDs reported
       by the UniProt service;
    2. the second pass writes one tab-separated row per ``CSQ`` annotation
       (or a single row for records without ``CSQ``) to ``args.out``.  Empty
       fields are written as ``None``.

    The file is opened again for the second pass because the VCF reader is a
    one-shot iterator: looping over the same reader twice yields nothing the
    second time, which left the output with only its header.
    """
    args = parse_args()
    if args.log:
        logging.basicConfig(filename=args.log, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('Start.')
    logging.info('Command line: {}'.format(' '.join(sys.argv)))

    # Interface to the UniProt service
    u = UniProt(verbose=False)

    def _as_field(values):
        """Render one identifier, or a collection of them, as one TSV field."""
        if isinstance(values, (list, tuple, set, frozenset)):
            return ','.join(str(v) for v in values)
        return str(values)

    # ---- pass 1: ENSP -> (UniProt IDs, PDB IDs) ---------------------------
    ensp_to_ids = {}
    queried = set()  # every ENSP is looked up at most once
    with open(args.vcf, 'r') as vcf_handle:
        for record in vcf.Reader(vcf_handle):
            if "CSQ" not in record.INFO:
                continue
            # Go through the annotations of every transcript
            for current_csq_element in record.INFO['CSQ']:
                curr_ENSP = str(current_csq_element.split('|')[26])
                if curr_ENSP == "" or curr_ENSP in queried:
                    continue
                queried.add(curr_ENSP)
                # to get Protein ID given ENSP ID
                current_protein_list = u.search(curr_ENSP, frmt="list")
                for curr_protein in current_protein_list.split("\n"):
                    if curr_protein == "":
                        continue
                    # to get PDB ID given protein id
                    mapping = u.mapping(fr="ID", to="PDB_ID",
                                        query=str(curr_protein))
                    if mapping:
                        # Only the first protein with a PDB mapping is kept,
                        # and of its values only the first one is reported.
                        first_pdb = list(mapping.values())[0]
                        ensp_to_ids[curr_ENSP] = (
                            _as_field(list(mapping.keys())),
                            _as_field(first_pdb),
                        )
                        break

    # ---- pass 2: write the annotated table --------------------------------
    # NOTE(review): the header names 14 columns but every row has 15 fields
    # (the ENSP value between swissprotid and uniprotid has no header) --
    # confirm which side is intended before relying on column names.
    header = ("chr\tpos\tid\tref\talt\tgene\tfeature\tfeature_type\t"
              "consequence\tswissprotid\tuniprotid\tpdbid\t"
              "protein_position\tamino_acid\n")
    with open(args.out, "w") as outputfile:
        outputfile.write(header)
        with open(args.vcf, 'r') as vcf_handle:
            for record in vcf.Reader(vcf_handle):
                prefix = [
                    record.CHROM,
                    str(record.POS),
                    str(record.ID),
                    record.REF,
                    ','.join(str(v) for v in record.ALT),
                ]
                if "CSQ" in record.INFO:
                    rows = []
                    for current_csq_element in record.INFO['CSQ']:
                        current_csq = current_csq_element.split('|')
                        current_ENSP = current_csq[26]
                        current_protein, current_pdbid = ensp_to_ids.get(
                            current_ENSP, ('', ''))
                        rows.append([
                            current_csq[4],    # gene
                            current_csq[6],    # feature
                            current_csq[5],    # feature type
                            current_csq[1],    # consequence
                            current_csq[27],   # SwissProt ID
                            current_ENSP,
                            current_protein,
                            current_pdbid,
                            current_csq[14],   # protein position
                            current_csq[15],   # amino acid
                        ])
                else:
                    # No VEP annotation: one row with empty VEP fields
                    rows = [[''] * 10]
                for row in rows:
                    out_str = [x or 'None' for x in prefix + row]
                    outputfile.write("\t".join(out_str))
                    outputfile.write("\n")
class Mapper(Logging):
    """Bundle of web-service clients used to map identifiers.

    Accepted code: uniprot

    Usage notes kept from the original docstring::

        m = Mapper()
        # HGNC
        df_hgnc = m.get_all_hgnc_into_df()
        df_hgnc.to_pickle("mapper_hgnc.dat")
        # KEGG
        df_kegg1 = m.get_all_kegg_into_df1()
        df_kegg2 = m.get_all_kegg_into_df2()
        uniq_keggid =
    """
    # Cross-reference database names (KEGG side)
    kegg_dblinks = [
        "IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM", "NCBI-GeneID",
        "UniProt", "Vega",
    ]

    # Cross-reference database names (HGNC side)
    hgnc_dblink = [
        'EC', 'Ensembl', 'EntrezGene', 'GDB', 'GENATLAS', 'GeneCards',
        'GeneTests', 'GoPubmed', 'H-InvDB', 'HCDM', 'HCOP', 'HGNC', 'HORDE',
        'IMGT_GENE_DB', 'INTERFIL', 'IUPHAR', 'KZNF', 'MEROPS', 'Nucleotide',
        'OMIM', 'PubMed', 'RefSeq', 'Rfam', 'Treefam', 'UniProt', 'Vega',
        'miRNA', 'snoRNABase',
    ]

    def __init__(self, verbosity="INFO"):
        """Create one client per remote service.

        :param str verbosity: logging level forwarded to the ``Logging`` base.
        """
        super(Mapper, self).__init__(level=verbosity)
        self.logging.info("Initialising the services")
        self.logging.info("... uniprots")
        self._uniprot_service = UniProt()

        self.logging.info("... KEGG")
        self._kegg_service = KeggParser(verbose=False)

        self.logging.info("... HGNC")
        self._hgnc_service = HGNC()

        self.logging.info("... UniChem")
        self._unichem_service = UniChem()

        self.logging.info("...BioDBNet")
        self._biodbnet = BioDBNet()

    def _uniprot2refseq(self, name):
        """Map a UniProt accession onto RefSeq through the UniProt service.

        There are 2 refseq alias: REFSEQ_NT_ID and P_REFSEQ_AC. Here, we use
        the first one to agree with wikipedia
        http://en.wikipedia.org/wiki/Protein_Kinase_B

        :param str name: UniProt accession to convert (e.g. ``"P31749"``).
        :return: whatever ``UniProt.mapping`` returns for that accession.
        """
        # The accession used to be hard-coded to "P31749", ignoring ``name``.
        return self._uniprot_service.mapping(fr="ACC", to="REFSEQ_NT_ID",
                                             query=name)

    def _update_uniprot_xref(self, df,
                             xref=["HGNC_ID", "ENSEMBLE_ID", "P_ENTREZGENEID"]):
        """Update the dataframe using Uniprot to map indices onto cross
        reference databases.

        For every name in ``xref`` a ``<name>__uniprot_mapping`` column is
        created, or refreshed when it already exists.

        :param df: dataframe whose index is sent to the service as ``ACC``
            identifiers.
        :param list xref: target database names understood by the service.
        :return: the updated dataframe.  Adding a column goes through
            ``join``, which builds a new object, so use the returned value.
        """
        for ref in xref:
            print("Processing %s " % ref)
            column = "%s__uniprot_mapping" % ref
            res = self._uniprot_service.multi_mapping("ACC", ref,
                                                      list(df.index),
                                                      timeout=10, ntrials=5)
            if column not in df.columns:
                thisdf = pd.DataFrame({column: list(res.values())},
                                      index=list(res.keys()))
                df = df.join(thisdf)
            else:
                for index in df.index:
                    if index in res:
                        # ``.at`` writes the cell in place; chained
                        # ``df.ix[index][column] = ...`` may write to a copy.
                        df.at[index, column] = res[index]
        return df

    def get_data_from_biodbnet(self, df_hgnc):
        """Query BioDBNet for cross references of gene symbols.

        keys are unique Gene names

        input is made of the df based on HGNC data web services

        uniprot accession are duplicated sometimes. If so, this is actually
        the primary accession entry and all secondary ones.

        e.g. ,
        ABHD11 >>>> Q8N723;Q8NFV2;Q8NFV3;Q6PJU0;Q8NFV4;H7BYM8;Q8N722;Q9HBS8
        ABHDB_HUMAN Alpha/beta hydrolase domain-containing protein 11
        correspond actually to the primary one : Q8NFV4

        :param df_hgnc: dataframe built from the HGNC service.
        :return: dataframe parsed from the tab-separated BioDBNet answer,
            indexed by ``Gene Symbol``.
        """
        import pandas as pd
        try:  # Python 2
            from StringIO import StringIO
        except ImportError:  # Python 3
            from io import StringIO

        # NOTE(review): the original read the symbols from an undefined name
        # ``res``.  The docstring says the keys are gene names taken from the
        # HGNC dataframe, so its index is used here -- confirm with callers.
        gene_symbols = list(df_hgnc.index)[0:2000]

        res2 = self._biodbnet.db2db(
            "Gene Symbol",
            ["HGNC ID", "UniProt Accession", "UniProt Entry Name",
             "UniProt Protein Name", "KEGG Gene ID", "Ensembl Gene ID"],
            gene_symbols)

        c = pd.read_csv(StringIO(res2), delimiter="\t",
                        index_col="Gene Symbol")
        return c
def get_more_node_ids(the_network, **kwargs):
    """
    Script to add more identifiers to model notes based on the node.id

    Arguments:
     the_network: a Network object, modified in place
     kwargs:
      node_id_type: current type of ids used for the nodes.
       Currently can be Entrez Gene (GeneID) or any of the options
       in the BioServices UniProt mappings
      mapping_types: a list of target mapping id types to include
      verbose: print a line after each finished mapping (default True)

    Returns:
     the_network; for every requested mapping type each node gets
     ``node.notes[mapping_type]`` set to the list returned by the service,
     or to ``[]`` when nothing was found.

    TODO: determine the best source db/module for pairings from bioservices
    """
    continue_flag = True
    try:
        from bioservices import UniProt
        u = UniProt(verbose=False)
    except Exception:
        # Missing package or failing constructor: report and leave the
        # network untouched.
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    the_node_locations = the_network.get_node_locations()
    if len(the_node_locations) == 0:
        print('The network has no nodes, exiting...')
        continue_flag = False

    node_id_type = kwargs.get('node_id_type', "Entrez Gene (GeneID)")

    # Not kwargs.get(): the module-level default must only be looked up
    # when it is actually needed.
    if 'mapping_types' in kwargs:
        mapping_types = kwargs['mapping_types']
    else:
        mapping_types = default_mapping_target_list

    verbose = kwargs.get('verbose', True)

    # Maximum number of items to query at a time.
    # Note there is a length limit in bioservices 1.2.1 for the web-based
    # query string.  Trial-and-error suggests the most id's that can be
    # queried are between 100 and 1000.
    max_query_length = 500

    if not continue_flag:
        return the_network

    model_nodes = [the_node
                   for the_nodetype in the_network.nodetypes
                   for the_node in the_nodetype.nodes]
    model_node_ids = [the_node.id for the_node in model_nodes]

    # One space-separated query string per batch of ids
    query_string_list = [
        ' '.join(model_node_ids[start:start + max_query_length])
        for start in range(0, len(model_node_ids), max_query_length)
    ]

    for the_target_type in mapping_types:
        the_result = {}
        for the_query_string in query_string_list:
            the_result.update(
                u.mapping(fr=available_mapping_source[node_id_type],
                          to=available_mapping_target[the_target_type],
                          query=the_query_string))
        if verbose:
            print("**Finished mapping for %s to %s.**"
                  % (node_id_type, the_target_type))
        for the_node in model_nodes:
            # Missing or empty answers are both stored as an empty list
            the_node.notes[the_target_type] = the_result.get(the_node.id) or []

    return the_network
def get_more_node_ids(the_network, **kwargs):
    """
    Script to add more identifiers to model notes based on the node.id

    Arguments:
     the_network: a Network object, modified in place
     kwargs:
      node_id_type: current type of ids used for the nodes.
       Currently can be 'Entrez Gene (GeneID)' or any of the options
       in the BioServices UniProt mappings
      mapping_types: a list of target mapping id types to include.
       Options can be viewed in core.parameters.py
       Note "Symbol" is an additional option for the official gene
       nomenclature symbol.
      email: optional, for NCBI queries.
      verbose: [True (default), False]

    Returns:
     the_network

    TODO: determine the best source db/module for pairings from bioservices
    """
    entrez_key = 'Entrez Gene (GeneID)'

    continue_flag = True
    # list(...) so the concatenation also works when keys() is a view
    valid_mapping_targets = list(available_mapping_target.keys()) + ['Symbol']
    verbose = test_kwarg('verbose', kwargs, [True, False])

    try:
        from bioservices import UniProt
        # Don't want verbosity at this low of a level
        u = UniProt(verbose=False)
    except Exception:
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    the_node_locations = the_network.get_node_locations()
    if len(the_node_locations) == 0:
        print('The network has no nodes, exiting...')
        continue_flag = False

    if 'node_id_type' in kwargs:
        node_id_type = kwargs['node_id_type']
        if node_id_type == 'Symbol':
            print("'Symbol' is a special case, not yet able to query with this option, exiting...")
            continue_flag = False
    else:
        print("No node id type specified, attempting to use 'Entrez Gene (GeneID)'")
        node_id_type = entrez_key

    if 'mapping_types' in kwargs:
        mapping_types = [x for x in kwargs['mapping_types']
                         if x in valid_mapping_targets]
        if len(mapping_types) == 0:
            print('No valid mapping_types selected, exiting...')
            continue_flag = False
        elif 'Symbol' in mapping_types:
            # Symbols are looked up from Entrez ids, so those must either be
            # the node ids themselves or one of the requested mappings.
            if entrez_key not in mapping_types and node_id_type != entrez_key:
                print("'Symbol' mapping type needs 'Entrez Gene (GeneID)', exiting...")
                continue_flag = False
    else:
        mapping_types = default_mapping_target_list

    email = kwargs.get('email', '')

    # Maximum number of items to query at a time.
    # Note there is a length limit in bioservices 1.2.1 for the web-based
    # query string.  Trial-and-error suggests the most id's that can be
    # queried are between 100 and 1000.
    max_query_length = 500

    if not continue_flag:
        return the_network

    model_nodes = [the_node
                   for the_nodetype in the_network.nodetypes
                   for the_node in the_nodetype.nodes]
    model_node_ids = [the_node.id for the_node in model_nodes]

    # One space-separated query string per batch of ids
    query_string_list = [
        ' '.join(model_node_ids[start:start + max_query_length])
        for start in range(0, len(model_node_ids), max_query_length)
    ]

    for the_target_type in mapping_types:
        if the_target_type == 'Symbol':
            # Handled below, through the Entrez annotation query
            continue
        the_result = {}
        for the_query_string in query_string_list:
            the_result.update(
                u.mapping(fr=available_mapping_source[node_id_type],
                          to=available_mapping_target[the_target_type],
                          query=the_query_string))
        if verbose:
            print("**Finished mapping for %s to %s.**"
                  % (node_id_type, the_target_type))
        for the_node in model_nodes:
            # Missing or empty answers are both stored as an empty list
            the_node.notes[the_target_type] = the_result.get(the_node.id) or []

    # To avoid a loss of information, we should also make sure queried IDs
    # are returned in the appropriate field in case they weren't available
    # in the database.
    if node_id_type in mapping_types:
        # Not yet supported anyway, but can leave this here.
        if node_id_type != 'Symbol':
            for the_node in model_nodes:
                if the_node.id not in the_node.notes[node_id_type]:
                    the_node.notes[node_id_type].append(the_node.id)

    if "Symbol" in mapping_types:
        if node_id_type == entrez_key or entrez_key in mapping_types:
            # Entrez ids to look up, per node, without duplicates
            entrez_by_node = {}
            for the_node in model_nodes:
                entrez_ids = []
                if node_id_type == entrez_key:
                    entrez_ids.append(the_node.id)
                if entrez_key in mapping_types:
                    for the_entrez_id in the_node.notes[entrez_key]:
                        if the_entrez_id not in entrez_ids:
                            entrez_ids.append(the_entrez_id)
                entrez_by_node[the_node.id] = entrez_ids

            the_entrez_to_query = list(
                {the_entrez_id
                 for entrez_ids in entrez_by_node.values()
                 for the_entrez_id in entrez_ids})
            the_symbol_dict = get_entrez_annotation(
                the_entrez_to_query, email=email, verbose=verbose)

            for the_node in model_nodes:
                the_node.notes["Symbol"] = []
                for the_entrez_id in entrez_by_node[the_node.id]:
                    # An id without annotation is skipped instead of
                    # aborting the whole run with a KeyError.
                    annotation = the_symbol_dict.get(the_entrez_id) or {}
                    the_symbol_id = annotation.get('NomenclatureSymbol', '')
                    if len(the_symbol_id) > 0:
                        the_node.notes["Symbol"].append(the_symbol_id)
            if verbose:
                print("**Finished mapping for %s to %s.**"
                      % (node_id_type, "Symbol"))
        elif verbose:
            print("'Entrez Gene (GeneID)' mappings are needed first in order to query symbols, skipping...")

    return the_network
def convert_gene_ids_bt(xml_file_in, id_identity = None, id_formatter = None, translate_file = None):
    """
    Replace all found instances of old gene IDs to new IDs.

    N.B. It will only look at 'GENE ASSOCIATION' lines; every other line is
    copied unchanged.  The result is written next to the input, with '.xml'
    replaced by '_out.xml'.

    Arguments:
     xml_file_in: path of the model file to convert.
     id_identity: regular expression matching the old gene IDs
      (defaults to ``model_metacyc_identifier``).
     id_formatter: callable turning a matched old ID into the key used in
      the translation table (defaults to ``model_metacyc_gene_2_biocyc``).
     translate_file: 2 column tsv file (old ID, new ID).

    Returns:
     None
    """

    ## Create ID conversion dictionary for MetaCyc
    translate_file = translate_file or '/Users/wbryant/Dropbox/Bacteroides/BioCyc_-_Protein-Gene-relations/BioCyc_BT_-_Protein-Gene-relations.txt'
    id_dict = {}
    with open(translate_file, 'r') as trans_in:
        for line in trans_in:
            ids = line.split("\t")
            # Lines without a second column are ignored
            if len(ids) > 1 and len(ids[1]) > 0:
                id_dict[ids[0]] = ids[1].strip()

    id_identity = id_identity or model_metacyc_identifier
    id_formatter = id_formatter or model_metacyc_gene_2_biocyc

    ## Create gene -> locus dictionary from NCBI file
    ncbi_gene_file = '/Users/wbryant/work/BTH/data/NCBI/gene_list.dat'
    ncbi_id_dict = {}
    ncbi_id = None
    with open(ncbi_gene_file, 'r') as ncbi_in:
        for line in ncbi_in:
            if re.search(r'[0-9]+\.[ ].+', line):
                # Numbered entry line: the gene name is its last word
                ncbi_id = line.strip().split(" ")[-1]
            elif 'Other Aliases' in line and ncbi_id is not None:
                for bt_id in re.findall(r'BT\_[0-9]+', line):
                    ncbi_id_dict[ncbi_id] = bt_id

    ## Some specific UniProt IDs do not map - so put them here manually:
    uniprot_manual_dict = {
        'Q8A1G3_BACTN': 'BT_3698',
        'G8JZS4_BACTN': 'BT_3703',
        'Q8A1G0_BACTN': 'BT_3704',
        'Q89YR9_BACTN': 'BT_4662',
    }

    ### Run through lines of input file replacing relevant gene IDs with new gene IDs
    u = UniProt(verbose=False)

    xml_file_out = re.sub(r'\.xml', '_out.xml', xml_file_in)
    with open(xml_file_in, 'r') as f_in, open(xml_file_out, 'w') as f_out:
        for line in f_in:
            if 'GENE ASSOCIATION' not in line:
                f_out.write(line)
                continue

            ## Look for genes fitting id_identity, convert and replace
            # Put a space in front of the closing tag first
            line = re.sub(r'(\<[^\>]+\>[ \n]*$)', r' \g<1>', line)
            for old_id in re.findall(id_identity, line):
                old_id_formatted = id_formatter(old_id)
                try:
                    new_id = id_dict[old_id_formatted]
                except KeyError:
                    new_id = old_id_formatted
                    print("ID '%s' not found ..." % new_id)
                line = line.replace(old_id, new_id, 1)

            ## Remove extraneous gene surrounds
            line = re.sub(r'\(gene\:([^\)]+)_i\)', r'\g<1>', line)

            ## Look for UniProt genes and convert
            if 'uniprot' in line:
                for uniprot_entry in re.findall(r'\(uniprot\:[^\)]+\)', line):
                    ## Map IDs
                    uniprot_id = re.sub(r'\(uniprot\:([^\)]+)\)', r'\g<1>', uniprot_entry)
                    try:
                        new_entry = u.mapping(fr='ACC', to='KEGG_ID', query=uniprot_id)[uniprot_id][0]
                    except Exception:
                        # Service failure or missing entry: fall back to the
                        # manual table, then to the untouched ID.
                        print("Protein ID '%s' not found in mapping, trying local ..." % uniprot_id)
                        try:
                            new_entry = uniprot_manual_dict[uniprot_id]
                        except KeyError:
                            print("Protein ID '%s' not found in local ..." % uniprot_id)
                            new_entry = uniprot_id
                    new_id = re.sub(r'bth\:([^\)]+)', r'\g<1>', new_entry)
                    line = line.replace(uniprot_entry, new_id, 1)

            ## Get gene string
            line_groups = re.search(r'(.+GENE ASSOCIATION\:[ ]*)(.+)([ ]*\<.+)', line)
            if line_groups is None:
                # Not in the expected "GENE ASSOCIATION: ... <tag>" shape;
                # keep the line as converted so far.
                f_out.write(line)
                continue
            gene_string = line_groups.group(2)
            if '_BACTN' in gene_string:
                print(gene_string)

            ## Look for NCBI IDs (like susG) and replace with BT IDs
            potential_ncbis = re.findall(r'[a-zA-Z0-9\_]+', gene_string)
            if '_BACTN' in gene_string:
                print(", ".join(potential_ncbis))
            for potential_ncbi in potential_ncbis:
                if potential_ncbi in ncbi_id_dict:
                    new_id = ncbi_id_dict[potential_ncbi]
                    gene_string = gene_string.replace(potential_ncbi, new_id, 1)
                elif potential_ncbi in uniprot_manual_dict:
                    new_id = uniprot_manual_dict[potential_ncbi]
                    gene_string = gene_string.replace(potential_ncbi, new_id, 1)

            ## Remove duplicates, keeping the first occurrence so that the
            ## output does not depend on set ordering
            gene_list = []
            for gene in gene_string.split(" or "):
                if gene not in gene_list:
                    gene_list.append(gene)

            ## Reconstitute line
            line = line_groups.group(1)
            line += " or ".join(gene_list)
            line += line_groups.group(3)
            f_out.write(line)
def batch_map(accessions, fr='ACC+ID', allow_download=False, cache=False,
              session=None, keep_unreviewed=True, match_taxon_id=9606,
              verbose=False):
    """
    Map a list of accessions using the UniProt batch mapping service.

    Parameters
    ----------
    accessions : list
        List of accessions.
    fr : str, optional
        Database to map from. See :class:`bioservices.UniProt`.
    keep_unreviewed : bool, optional
        If True, keep the unreviewed accession in mapping.
    allow_download : bool, optional
        If True, will download records that are missing for any accession
        in `accessions`.
    cache : bool, optional
        If True, `bioservices` cache will be used by
        :class:`bioservices.UniProt`. Set to `False` to use the most
        up-to-date mappings.
    session : `scoped_session`, optional
        Session instance to save protein instances to if `allow_download`
        is True.
    match_taxon_id : int, optional
        Ignores mappings to or from proteins that do not match this id.
    verbose : bool, optional
        Log info/warning/error messages to the console.

    Returns
    -------
    `dict`
        A dictionary of mappings from UniProt accessions to the most
        up-to-date UniProt accessions. Dictionary values are sorted lists.

    Raises
    ------
    ValueError
        If the service still returns an empty mapping after the retries.
    """
    service = UniProtMapper(cache=cache)
    raw_mapping = service.mapping(fr=fr, to='ACC', query=accessions)

    # Nothing came back: ask again up to four more times, pausing after
    # every failed attempt.
    if raw_mapping == {}:
        for attempt in range(0, 4):
            raw_mapping = service.mapping(fr=fr, to='ACC', query=accessions)
            if raw_mapping:
                break
            if verbose:
                logger.warning(
                    "Could not download map from uniprot server. "
                    "Attempt {}/5. Re-attempt in 3 seconds.".format(
                        attempt + 2))
            time.sleep(3)

    if raw_mapping == {}:
        raise ValueError("Could not download map from uniprot server.")

    result = {}
    for source_accession, candidates in raw_mapping.items():
        # Decide which candidates cannot be used: wrong taxon, or unknown
        # to the database and not downloadable.
        rejected = []
        for accession in candidates:
            entry = Protein.get_by_uniprot_id(accession)
            if entry is not None:
                wrong_taxon = (match_taxon_id is not None) and \
                    entry.taxon_id != match_taxon_id
                if wrong_taxon:
                    rejected.append(accession)
                continue

            if not allow_download:
                rejected.append(accession)
                continue

            if verbose:
                logger.info(
                    "Mapping to {}, but entry not found in database. "
                    "Attempting download.".format(accession))
            record = download_record(accession, verbose=True,
                                     taxon_id=match_taxon_id)
            protein = parse_record_into_protein(record)
            if protein is not None:
                protein.save(session, commit=True)
                continue

            if verbose:
                logger.info(
                    "No valid record for {} was found".format(accession))
            rejected.append(accession)

        kept = [acc for acc in candidates if acc not in rejected]

        # Split what is left by review status
        reviewed = []
        unreviewed = []
        for acc in kept:
            is_reviewed = Protein.get_by_uniprot_id(acc).reviewed
            if is_reviewed is True:
                reviewed.append(acc)
            elif is_reviewed is False:
                unreviewed.append(acc)

        selected = reviewed + unreviewed if keep_unreviewed else reviewed
        selected = list(set(selected))

        if match_taxon_id is not None:
            selected = [
                acc for acc in selected
                if match_taxon_id == Protein.get_by_uniprot_id(acc).taxon_id
            ]

        result[source_accession] = sorted(selected)

    return result
def keggid_to_uniprot(interactions, verbose=False, trembl=False, cache=False):
    """
    Map KEGG_ID accessions into uniprot. Performs a product operation to
    produce multiple new interactions from a single interaction if multiple
    possible mappings are found.

    Parameters
    ----------
    interactions : :class:`pd.DataFrame`
        DataFrame with 'source', 'target', 'label', 'pubmed', and
        'experiment_type' columns.
    trembl : bool, optional, default: False
        If True, during the mapping process, keeps mapped rows containing
        TrEMBL accessions in either `source` or `target`. Otherwise, these
        rows are deleted.
    verbose : bool, optional, default: False
        If True, logs messages regarding mapping warnings and other
        information.
    cache : bool, optional, default: False
        If True, HTTP responses are cached by `bioservices`. This can save
        time but you will eventually miss out on new database releases if
        your cache is old.

    Returns
    -------
    `pd.DataFrame`
        DataFrame with 'source', 'target', 'label', 'pubmed', and
        'experiment_type' columns.
    """
    filtered_map = {}
    sources = [a for a in interactions.source.values]
    targets = [b for b in interactions.target.values]
    unique_ids = list(set(sources) | set(targets))

    mapper = UniProt(cache=cache)
    mapping = mapper.mapping(fr='KEGG_ID', to='ACC', query=unique_ids)

    for kegg_id, uniprot_ls in mapping.items():
        # Check that the accessions are actually in the database.
        # If not, ignore them and warn the user.  The lists returned by the
        # service are only read, never modified.
        reviewed = []
        unreviewed = []
        for accession in uniprot_ls:
            protein = Protein.get_by_uniprot_id(accession)
            if protein is None:
                if verbose:
                    logger.warning(
                        "No protein for '{}' found in the database. Consider "
                        "downloading the latest UniProt dat files and "
                        "updating the database.".format(accession)
                    )
                continue
            # Only process the proteins in the database.
            status = protein.reviewed
            if status is True:
                reviewed.append(accession)
            elif status is False:
                unreviewed.append(accession)

        if len(reviewed) > 0:
            if len(reviewed) > 1 and verbose:
                logger.warning(
                    'More that one reviewed '
                    'acc found for {}: {}'.format(kegg_id, reviewed)
                )
            filtered_map[kegg_id] = reviewed
        else:
            if verbose:
                logger.warning(
                    'No reviewed acc found for {}.'.format(kegg_id)
                )
            if trembl and len(unreviewed) > 0:
                # This used to test len(reviewed), which is always 0 in
                # this branch, so the warning could never be emitted.
                if len(unreviewed) > 1 and verbose:
                    logger.warning(
                        'More that one unreviewed '
                        'acc found for {}: {}'.format(kegg_id, unreviewed)
                    )
                filtered_map[kegg_id] = unreviewed
            else:
                if verbose:
                    logger.warning('Could not map {}.'.format(kegg_id))

    # Kegg ids that did not map to anything are absent from filtered_map,
    # so interactions using them produce no rows below.
    zipped = list(
        zip(
            interactions[SOURCE].values,
            interactions[TARGET].values,
            interactions[LABEL].values,
            interactions[PUBMED].values,
            interactions[EXPERIMENT_TYPE].values
        )
    )
    sources = []
    targets = []
    labels = []
    pmids = []
    psimis = []
    for source, target, label, pmid, psimi in zipped:
        source_acc = filtered_map.get(source, [])
        target_acc = filtered_map.get(target, [])
        # Some Kegg_Ids genuinely map to more than 1 distinct uniprot
        # accession, so we use a list product to account for this.
        for (s, t) in product(source_acc, target_acc):
            sources.append(s)
            targets.append(t)
            labels.append(label)
            pmids.append(pmid)
            psimis.append(psimi)

    interactions = make_interaction_frame(
        sources, targets, labels, pmids, psimis
    )
    return interactions
def get_more_source_dict_ids(source_dict, primary_key, **kwargs):
    """ Script to add more ids to source dict nodes
    to facilitate pairing to a network

    Arguments:
     source_dict: id_key: value. Values that are not dicts are replaced by
      {'value': original_value} so that mapped ids can be stored next to them.
     primary_key: current type of ids used for the nodes.
      Must be a key of available_mapping_source.
     kwargs:
      mapping_types: a list of mapping types to include
       (default: default_mapping_target_list)
      verbose: [True (default), False]
      node_id_type: accepted for backward compatibility; not used.

    Returns:
     source_dict, also modified in place. For each mapping type every entry
     gets a list of mapped ids (empty when nothing was found). If the
     primary key is invalid or bioservices is unavailable, source_dict is
     returned untouched.

    """
    continue_flag = True
    file_key = primary_key
    if primary_key not in available_mapping_source:
        continue_flag = False
        print("Error, you must specify a valid primary_key descriptor to match to in the available database, exiting...")

    mapping_types = kwargs.get('mapping_types', default_mapping_target_list)
    verbose = kwargs.get('verbose', True)

    try:
        from bioservices import UniProt
        u = UniProt(verbose=False)
    except Exception:
        # Deliberately broad (import or set-up failure), but no longer a bare
        # except, so KeyboardInterrupt/SystemExit still propagate.
        print("No bioservices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    # Maximum number of items to query at a time.
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most id's that can be
    # queried are between 100 and 1000.
    max_query_length = 500

    if not continue_flag:
        return source_dict

    # One space-separated query string per chunk of at most max_query_length
    # ids; an empty source_dict produces no query at all.
    query_ids = list(source_dict)
    the_query_string_list = [
        ' '.join(query_ids[start:start + max_query_length])
        for start in range(0, len(query_ids), max_query_length)
    ]

    # Wrap plain values so every entry can hold the mapped id lists.
    for the_key, the_value in list(source_dict.items()):
        if not isinstance(the_value, dict):
            source_dict[the_key] = {'value': the_value}

    for the_target_type in mapping_types:
        the_result = {}
        for the_query_string in the_query_string_list:
            the_result.update(u.mapping(fr=available_mapping_source[file_key],
                                        to=available_mapping_target[the_target_type],
                                        query=the_query_string))
        if verbose:
            print("** Finished mapping for %s to %s. **" % (file_key, the_target_type))
        for the_query_id in source_dict:
            # Missing or empty results are both stored as a fresh empty list.
            source_dict[the_query_id][the_target_type] = the_result.get(the_query_id) or []

    return source_dict
class Mapper(Logging):
    """Accepted code:

        uniprot

    Example::

        m = Mapper()
        # HGNC
        df_hgnc = m.get_all_hgnc_into_df()
        df_hgnc.to_pickle("mapper_hgnc.dat")

        # KEGG
        df_kegg1 = m.get_all_kegg_into_df1()
        df_kegg2 = m.get_all_kegg_into_df2()

    """
    # Cross-reference database names listed for KEGG entries.
    kegg_dblinks = ["IMGT", "Ensembl", "HGNC", "HPRD", "NCBI-GI", "OMIM",
                    "NCBI-GeneID", "UniProt", "Vega"]
    # Cross-reference database names listed for HGNC entries.
    hgnc_dblink = ['EC', 'Ensembl', 'EntrezGene', 'GDB', 'GENATLAS',
                   'GeneCards', 'GeneTests', 'GoPubmed', 'H-InvDB', 'HCDM',
                   'HCOP', 'HGNC', 'HORDE', 'IMGT_GENE_DB', 'INTERFIL',
                   'IUPHAR', 'KZNF', 'MEROPS', 'Nucleotide', 'OMIM', 'PubMed',
                   'RefSeq', 'Rfam', 'Treefam', 'UniProt', 'Vega', 'miRNA',
                   'snoRNABase']

    def __init__(self, verbosity="INFO"):
        """Create one client per web service used by the mapper.

        :param verbosity: logging level forwarded to the Logging base class.
        """
        super(Mapper, self).__init__(level=verbosity)
        self.logging.info("Initialising the services")
        self.logging.info("... uniprots")
        self._uniprot_service = UniProt()

        self.logging.info("... KEGG")
        self._kegg_service = KeggParser(verbose=False)

        self.logging.info("... HGNC")
        self._hgnc_service = HGNC()

        self.logging.info("... UniChem")
        self._unichem_service = UniChem()

        self.logging.info("...BioDBNet")
        self._biodbnet = BioDBNet()

    def _uniprot2refseq(self, name):
        """Map a UniProt accession onto RefSeq through the UniProt service.

        There are 2 refseq alias: REFSEQ_NT_ID and P_REFSEQ_AC.

        Here, we use the first one to agree with wikipedia
        http://en.wikipedia.org/wiki/Protein_Kinase_B

        :param name: accession to look up (previously ignored in favour of
            a hard-coded "P31749").
        :return: whatever the UniProt service ``mapping`` call returns.
        """
        return self._uniprot_service.mapping(fr="ACC", to="REFSEQ_NT_ID",
                                             query=name)

    def _update_uniprot_xref(self, df,
                             xref=("HGNC_ID", "ENSEMBLE_ID", "P_ENTREZGENEID")):
        """Update the dataframe using Uniprot to map indices onto cross
        reference databases

        For each entry of *xref* a ``<xref>__uniprot_mapping`` column is
        added, or refreshed in place if it already exists.

        :param df: dataframe whose index values are used as the "ACC" query.
        :param xref: target database codes passed to ``multi_mapping``.
            NOTE(review): "ENSEMBLE_ID" looks like a typo for "ENSEMBL_ID";
            left unchanged, confirm against the UniProt mapping codes.
        :return: the updated dataframe. Adding a column builds a new frame
            via ``join``, so callers must use the return value.
        """
        for ref in xref:
            print("Processing %s " % ref)
            res = self._uniprot_service.multi_mapping("ACC", ref,
                                                      list(df.index),
                                                      timeout=10, ntrials=5)
            column = "%s__uniprot_mapping" % ref
            if column not in df.columns:
                thisdf = pd.DataFrame({column: list(res.values())},
                                      index=list(res.keys()))
                df = df.join(thisdf)
            else:
                for index in df.index:
                    if index in res:
                        # Single-cell setter; the former chained
                        # df.ix[index][column] assignment could write to a
                        # temporary copy.
                        df.at[index, column] = res[index]
        return df

    def get_data_from_biodbnet(self, df_hgnc):
        """keys are unique Gene names

        input is made of the df based on HGNC data web services

        uniprot accession are duplicated sometimes. If so, this is actually
        the primary accession entry and all secondary ones.

        e.g. ,

        ABHD11 >>>> Q8N723;Q8NFV2;Q8NFV3;Q6PJU0;Q8NFV4;H7BYM8;Q8N722;Q9HBS8
        ABHDB_HUMAN Alpha/beta hydrolase domain-containing protein 11
        correspond actually to the primary one : Q8NFV4

        :param df_hgnc: HGNC dataframe; its index supplies the gene symbols.
        :return: dataframe parsed from the tab-separated db2db answer,
            indexed by "Gene Symbol".
        """
        # NOTE(review): the original read the symbols from an undefined name
        # (`res`). The index of df_hgnc is assumed to hold them, as the
        # docstring suggests -- confirm. Only the first 2000 are queried.
        gene_symbols = list(df_hgnc.index)[0:2000]
        res2 = self._biodbnet.db2db(
            "Gene Symbol",
            ["HGNC ID", "UniProt Accession", "UniProt Entry Name",
             "UniProt Protein Name", "KEGG Gene ID", "Ensembl Gene ID"],
            gene_symbols)

        import pandas as pd
        try:
            from StringIO import StringIO  # Python 2
        except ImportError:
            from io import StringIO  # Python 3
        c = pd.read_csv(StringIO(res2), delimiter="\t",
                        index_col="Gene Symbol")
        return c
def get_more_source_dict_ids(source_dict, primary_key_type, **kwargs):
    """ Script to add more ids to source dict nodes
    to facilitate pairing to a network

    Arguments:
     source_dict: id_key: value. Values that are not dicts are replaced by
      {'value': original_value} so that mapped ids can be stored next to them.
     primary_key_type: current type of ids used for the top level dict key.
      Must be a key of available_mapping_source.
     kwargs:
      mapping_types: a list of mapping types to include.
       See core.parameters for the full list. Entries that are not valid
       targets are ignored. Note 'Symbol' is a special case for querying
       that depends on Entrez ID availability.
      verbose: [False (default), True]
      email: optional, for NCBI if querying for 'Symbol'

    Returns:
     source_dict, also modified in place. If the arguments are invalid or
     bioservices cannot be imported, source_dict is returned untouched.

    """
    entrez_type = 'Entrez Gene (GeneID)'
    continue_flag = True
    verbose = test_kwarg('verbose', kwargs, [False, True])
    # list(...) keeps this working where dict.keys() is a view, not a list.
    valid_mapping_targets = list(available_mapping_target.keys()) + ['Symbol']

    if primary_key_type not in available_mapping_source:
        if primary_key_type == 'Symbol':
            print("'Symbol' is a special case, not yet able to query with this as a primary key.")
        print("Error, you must specify a valid primary_key_type descriptor to match to in the available database, exiting...")
        continue_flag = False

    if 'mapping_types' in kwargs:
        mapping_types = [x for x in kwargs['mapping_types'] if x in valid_mapping_targets]
        if len(mapping_types) == 0:
            print('No valid mapping_types selected, exiting...')
            continue_flag = False
        elif 'Symbol' in mapping_types:
            if (entrez_type not in mapping_types) and (primary_key_type != entrez_type):
                print("'Symbol' mapping type needs 'Entrez Gene (GeneID)', exiting...")
                continue_flag = False
    else:
        mapping_types = default_mapping_target_list

    email = kwargs.get('email', '')

    try:
        from bioservices import UniProt
        # Don't want verbosity at this low of a level
        u = UniProt(verbose=False)
    except ImportError:
        print("No BioServices module installed or cannot connect, exiting...")
        print("e.g. if you are using pip, did you 'pip install bioservices'?")
        continue_flag = False

    # Maximum number of items to query at a time.
    # Note there is a length limit in bioservices 1.2.1
    # for the web-based query string.
    # Trial-and-error suggests the most id's that can be
    # queried are between 100 and 1000.
    max_query_length = 500

    if not continue_flag:
        return source_dict

    # One space-separated query string per chunk of at most max_query_length
    # ids; an empty source_dict produces no query at all.
    query_ids = list(source_dict)
    the_query_string_list = [
        ' '.join(query_ids[start:start + max_query_length])
        for start in range(0, len(query_ids), max_query_length)
    ]

    # Wrap plain values so every entry can hold the mapped id lists.
    for the_key, the_value in list(source_dict.items()):
        if not isinstance(the_value, dict):
            source_dict[the_key] = {'value': the_value}

    for the_target_type in mapping_types:
        if the_target_type == 'Symbol':
            # Handled separately below through the Entrez ids.
            continue
        the_result = {}
        for the_query_string in the_query_string_list:
            the_result.update(u.mapping(fr=available_mapping_source[primary_key_type],
                                        to=available_mapping_target[the_target_type],
                                        query=the_query_string))
        if verbose:
            print("** Finished mapping for %s to %s. **" % (primary_key_type, the_target_type))
        for the_query_id in source_dict:
            # Missing or empty results are both stored as a fresh empty list.
            source_dict[the_query_id][the_target_type] = the_result.get(the_query_id) or []

    # To avoid a loss of information, we should also make
    # sure queried IDs are returned in the appropriate
    # field in case they weren't available in the database.
    # 'Symbol' as a primary key is not yet supported, but is
    # checked to avoid breaking this.
    if primary_key_type in mapping_types and primary_key_type != 'Symbol':
        for the_source_dict_id in source_dict:
            known_ids = source_dict[the_source_dict_id][primary_key_type]
            if the_source_dict_id not in known_ids:
                known_ids.append(the_source_dict_id)

    if "Symbol" in mapping_types:
        entrez_is_primary = primary_key_type == entrez_type
        entrez_was_mapped = entrez_type in mapping_types
        if entrez_is_primary or entrez_was_mapped:
            # Entrez ids to look up per entry, built separately in case
            # "Entrez Gene (GeneID)" was the primary_key_type but not
            # in mapping_types.
            entrez_by_source_id = {}
            the_entrez_to_query = []
            for the_source_dict_id in source_dict:
                entrez_ids = []
                if entrez_is_primary:
                    entrez_ids.append(the_source_dict_id)
                if entrez_was_mapped:
                    for the_entrez_id in source_dict[the_source_dict_id][entrez_type]:
                        if the_entrez_id not in entrez_ids:
                            entrez_ids.append(the_entrez_id)
                entrez_by_source_id[the_source_dict_id] = entrez_ids
                the_entrez_to_query += entrez_ids
            the_entrez_to_query = list(set(the_entrez_to_query))

            the_symbol_dict = get_entrez_annotation(the_entrez_to_query, email=email, verbose=verbose)
            for the_source_dict_id in source_dict:
                symbols = []
                for the_entrez_id in entrez_by_source_id[the_source_dict_id]:
                    the_symbol_id = the_symbol_dict[the_entrez_id]['NomenclatureSymbol']
                    if len(the_symbol_id) > 0:
                        symbols.append(the_symbol_id)
                source_dict[the_source_dict_id]["Symbol"] = symbols
            print("**Finished mapping for %s to %s.**" % (primary_key_type, "Symbol"))

    return source_dict
def _id_map_(from_annotation, to_annotation, psm_protein_id, psm_hash, species,
             decoy_annotation, database_v):
    '''
    Convert protein IDs from the supplied annotation to the target one.

    Only to_annotation == "ENSEMBL" is handled; any other target returns an
    empty dictionary.

    :param from_annotation: supplied annotation; one of "REFSEQ", "UNIPROT",
        "UNIPROT_ENTRY" or "ENSEMBL"
    :param to_annotation: target annotation (i.e. ENSEMBL)
    :param psm_protein_id: list of protein IDS (duplicates are ignored)
    :param psm_hash: dictionary of protein IDs mapped onto ENSEMBL (not used
        by this function)
    :param species: species name
    :param decoy_annotation: list of decoy annotations (not used by this
        function)
    :param database_v: database version
    :return: dictionary of protein ID conversion. For REFSEQ and UNIPROT the
        key is row[2] of a biomart result row and the value is the first
        such row; for UNIPROT_ENTRY the key is the entry name and the value
        the list returned by the UniProt ENSEMBL_TRS_ID mapping; for ENSEMBL
        each id maps to [id].
    '''
    import sys

    psm_protein_id = list(set(psm_protein_id))
    print("Commencing ID conversion from " + str(from_annotation) +
          " to " + str(to_annotation))
    id_map = {}

    if to_annotation != "ENSEMBL":
        return id_map

    # Convert RefSeq to ENSEMBL ID's
    if from_annotation == 'REFSEQ':
        # (substring marker, biomart filter, progress message), in query
        # order. An id containing several markers is sent to each matching
        # query.
        refseq_classes = (
            ("NM_", "refseq_mrna",
             "Identified refseq mRNA ID's, converting:"),
            ("XM_", "refseq_mrna_predicted",
             "Identified predicted refseq mRNA ID's, converting:"),
            ("NR_", "refseq_ncrna",
             "Identified refseq ncRNA ID's, converting:"),
            ("XR_", "refseq_ncrna_predicted",
             "Identified refseq predicted ncRNA ID's, converting:"),
            ("NP_", "refseq_peptide",
             "Identified refseq protein ID's, converting:"),
            ("XP_", "refseq_peptide_predicted",
             "Identified refseq predicted protein ID's, converting:"),
        )
        mapped_rows = []
        for marker, biomart_filter, message in refseq_classes:
            matching = [pid for pid in psm_protein_id if marker in pid]
            if matching:
                print(message)
                # extend, not append: the rows are indexed individually
                # below, exactly as in the UNIPROT branch.
                # NOTE(review): assumes id_map_ensembl returns an iterable
                # of rows -- confirm.
                mapped_rows.extend(
                    proBAM_biomart.id_map_ensembl(biomart_filter, database_v,
                                                  species, matching))
        _index_first_rows(mapped_rows, {}, id_map)

    # Convert UNIPROT accession ID's to ENSEMBL
    elif from_annotation == "UNIPROT":
        rows_by_id = {}
        # map uniprot/swissprot
        mapped_rows = proBAM_biomart.id_map_ensembl("uniprot_swissprot",
                                                    database_v, species,
                                                    psm_protein_id)
        # Results with a single row are skipped, as before; presumably they
        # carry no data -- TODO confirm.
        if len(mapped_rows) > 1:
            _index_first_rows(mapped_rows, rows_by_id, id_map)

        # map remaining on uniprot/trembl
        unmapped_id_for_trmbl = [pid for pid in psm_protein_id
                                 if pid not in id_map]
        if unmapped_id_for_trmbl != []:
            mapped_rows = proBAM_biomart.id_map_ensembl(
                "uniprot_sptrembl", database_v, species,
                unmapped_id_for_trmbl)
            if len(mapped_rows) > 1:
                _index_first_rows(mapped_rows, rows_by_id, id_map)

    # Convert UNIPROT ENTRIES to ENSEMBL ID's
    elif from_annotation == "UNIPROT_ENTRY":
        u = UniProt()
        to_translate = []
        for protein_id in psm_protein_id:
            matches = re.findall(
                "[A-Z0-9]{1,10}" + "_" + _get_uniprot_postfix_(species),
                protein_id)
            if matches != []:
                to_translate.append(matches[0])
        to_translate = list(set(to_translate))

        # map uniprot_entries to up-to-date accession
        accession_update_hash = {}
        for chunk in _split_for_uniprot(to_translate):
            accession_update_hash.update(u.mapping('ACC+ID', 'ACC', chunk))

        tot_count = sum(1 for entry in to_translate
                        if entry not in accession_update_hash)
        print("\tFound " + str(tot_count) + " depreciated UniProt Entries (UniProt Entries are unstable)\n"
              "\tAttempting to map depreciated entries unto new entry ID's...")

        count = 0
        found = 0
        tracker = 9.99
        # Progress is written on one line, one percentage per 10% step.
        # sys.stdout.write replaces the trailing-comma print statement, so
        # the exact spacing of this line may differ slightly.
        sys.stdout.write("\t")
        for entry in to_translate:
            if entry in accession_update_hash:
                continue
            count += 1
            if (float(count * 100) / float(tot_count)) > tracker:
                sys.stdout.write(str(int(tracker + 0.01)) + "% ")
                tracker = tracker + 10
            try:
                accession_update_hash[entry] = [get_updated_entry_name(entry)]
                found += 1
            except urllib2.HTTPError:
                # Entry could not be resolved; it stays unmapped.
                pass
        print(' ')
        print("\tRetrieved " + str(found) + " of the " + str(tot_count) +
              " depreciated UniProt Entries")

        # map accession to Ensembl transcript
        to_translate = []
        for value in accession_update_hash.values():
            to_translate += value
        to_translate = list(set(to_translate))

        transcripts_by_accession = {}
        for chunk in _split_for_uniprot(to_translate):
            transcripts_by_accession.update(
                u.mapping('ACC', 'ENSEMBL_TRS_ID', chunk))

        # remap Ensembl transcript to uniprot entries
        for entry, accessions in accession_update_hash.items():
            for accession in accessions:
                if accession in transcripts_by_accession:
                    # Extending a fresh list keeps the lists in
                    # transcripts_by_accession from being modified when an
                    # entry has several accessions.
                    id_map.setdefault(entry, []).extend(
                        transcripts_by_accession[accession])

    # Get ENSEMBL ID's in correct form
    elif from_annotation == "ENSEMBL":
        for protein_id in psm_protein_id:
            id_map[protein_id] = [protein_id]

    return id_map


def _index_first_rows(rows, rows_by_id, id_map):
    '''Group rows with a non-empty row[0] by row[2]; store each group's first row in id_map.'''
    for row in rows:
        if row[0] != "":
            rows_by_id.setdefault(row[2], []).append(row)
    for key in rows_by_id:
        id_map[key] = rows_by_id[key][0]


def _split_for_uniprot(ids):
    '''Return ids as a list of chunks: one chunk up to 1000 ids, else chunkIt(ids, len(ids) // 1000).'''
    if len(ids) > 1000:
        # Floor division keeps the chunk count an integer on every Python.
        return chunkIt(ids, len(ids) // 1000)
    return [ids]
# Last modified: 17 December 2013
#
# Program that fetches the UniRef entry matching a GenBank accession number
from bioservices import UniProt
import sys
import os
from BeautifulSoup import BeautifulSoup

# Directory that receives one XML result file per contig.
UNIREF_PATH = "../uniref/"

u = UniProt()

# Each input line is split on "-|-": field 0 is the contig name and field 2
# the accession. One line per contig is written to uniref_mapping.txt.
with open("uniref_mapping.txt", "w") as r:
    with open("resultatNBCI.txt", "r") as f:
        for line in f:
            temp = line.split("-|-")
            print("Traitement du contig " + temp[0])
            accession = temp[2].strip(" \t\n\r")
            # NOTE(review): the result of this mapping call is never used;
            # kept in case the request itself matters -- confirm or remove.
            u.mapping(fr='EMBL_ID', to='NF100', query=accession)
            res = u.search(accession, format='xml', limit=10)
            # Emptiness test instead of `res is ''`, which compared object
            # identity with a string literal.
            if not res:
                r.write(temp[0] + "\tNone\n")
                print("aucun résultat pour ce contig")
            else:
                contig = temp[0].strip(" \t\n\r")
                with open(UNIREF_PATH+"result"+contig+".xml", "w") as xml:
                    xml.write(res)
                r.write(contig + "\t Result\n")