class PDBe():
    """Interface to part of the `PDBe <http://www.ebi.ac.uk/pdbe>`_ service

    .. doctest::

        >>> from bioservices import PDBe
        >>> s = PDBe()
        >>> res = s.get_summary("1cbs")

    Unless stated otherwise, the ``query`` argument of a method may be a
    single 4-character PDB id, a string of ids separated by commas, or a
    list of ids. A single id is fetched with a GET request; several ids are
    sent together in one POST request. When the underlying call returns
    404, an empty dictionary is returned instead.
    """

    def __init__(self, verbose=False, cache=False):
        """.. rubric:: Constructor

        :param bool verbose: prints informative messages (default is off)
        :param bool cache: forwarded to the underlying REST service
        """
        url = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/"
        self.services = REST(name="PDBe", url=url, verbose=verbose,
                             cache=cache)

    def _check_id(self, pdbid):
        """Validate *pdbid* and return it as a comma-separated string.

        :param pdbid: a 4-character PDB id, a comma-separated string of
            ids, or a list of ids
        :return: the ids joined with commas
        :raises AssertionError: if one of the ids is not 4 characters long
        :raises TypeError: if *pdbid* is neither a string nor a list
        """
        if isinstance(pdbid, list):
            pdbid = ",".join(pdbid)

        if isinstance(pdbid, str):
            for item in pdbid.split(","):
                assert len(item) == 4, "a 4-character PDB id code is required"
        else:
            raise TypeError(
                "pdb id must be either a 4-character pdb id, a list of valid PDB ids, or a string made of pdb ids, separated by commas"
            )
        return pdbid

    def _check_single_id(self, pdbid):
        """Validate that *pdbid* is exactly one 4-character PDB id string.

        Used by the calls that are only issued for a single entry.

        :raises AssertionError: if *pdbid* is not 4 items long
        :raises ValueError: if *pdbid* is not a string or contains a comma
        """
        assert len(pdbid) == 4, "a 4-character PDB id code is required"
        if not isinstance(pdbid, str) or "," in pdbid:
            raise ValueError(
                "a single 4-character PDB id code (string) is required")
        return pdbid

    def _return(self, res):
        """Map a 404 answer to an empty dictionary; pass anything else on."""
        if res == 404:
            return {}
        return res

    def _fetch(self, endpoint, query, post_endpoint=None):
        """Shared GET (one id) / POST (several ids) logic.

        :param str endpoint: path used for the single-id GET request; the
            id is appended after a slash
        :param query: id(s), validated with :meth:`_check_id`
        :param str post_endpoint: path used for the multi-id POST request;
            defaults to *endpoint*
        """
        query = self._check_id(query)
        if "," not in query:
            res = self.services.http_get("{}/{}".format(endpoint, query))
        else:
            if post_endpoint is None:
                post_endpoint = endpoint
            res = self.services.http_post(post_endpoint, data=query,
                                          frmt="json")
        return self._return(res)

    def get_summary(self, query):
        """Returns summary of a PDB entry

        This can be title of the entry, list of depositors, date of
        deposition, date of release, date of latest revision, experimental
        method, list of related entries in case split entries, etc.

        :param query: a 4-character PDB id code (or several, see class doc)

        ::

            p.get_summary('1cbs')
            p.get_summary('1cbs,2kv8')
            p.get_summary(['1cbs', '2kv8'])

        """
        return self._fetch("summary", query)

    def get_molecules(self, query):
        """Return details of molecules (or entities in mmcif-speak) modelled
        in the entry

        This can be entity id, description, type, polymer-type (if
        applicable), number of copies in the entry, sample preparation
        method, source organism(s) (if applicable), etc.

        :param query: a 4-character PDB id code

        ::

            p.get_molecules('1cbs')

        """
        return self._fetch("molecules", query)

    def get_related_publications(self, query):
        """Return publications obtained from both EuroPMC and UniProt.

        These are articles which cite the primary citation of the entry, or
        open-access articles which mention the entry id without explicitly
        citing the primary citation of an entry.

        :param query: a 4-character PDB id code

        ::

            p.get_related_publications('1cbs')

        """
        return self._fetch("related_publications", query,
                           post_endpoint="related_publications/")

    def get_experiment(self, query):
        """Provides details of experiment(s) carried out in determining the
        structure of the entry.

        Each experiment is described in a separate dictionary. For X-ray
        diffraction, the description consists of resolution, spacegroup,
        cell dimensions, R and Rfree, refinement program, etc. For NMR,
        details of spectrometer, sample, spectra, refinement, etc. are
        included. For EM, details of specimen, imaging, acquisition,
        reconstruction, fitting etc. are included.

        :param query: a 4-character PDB id code

        ::

            p.get_experiment('1cbs')

        """
        return self._fetch("experiment", query)

    def get_nmr_resources(self, query):
        """This call provides URLs of available additional resources for NMR
        entries. E.g., mapping between structure (PDB) and chemical shift
        (BMRB) entries.

        :param query: a 4-character PDB id code

        ::

            p.get_nmr_resources('1cbs')

        """
        return self._fetch("nmr_resources", query,
                           post_endpoint="nmr_resources/")

    def get_ligand_monomers(self, query):
        """Provides a list of modelled instances of ligands, i.e. 'bound'
        molecules that are not waters.

        :param query: a 4-character PDB id code

        ::

            p.get_ligand_monomers('1cbs')

        """
        return self._fetch("ligand_monomers", query)

    def get_modified_residues(self, query):
        """Provides a list of modelled instances of modified amino acids or
        nucleotides in protein, DNA or RNA chains.

        :param query: a 4-character PDB id code

        ::

            p.get_modified_residues('4v5j')

        """
        return self._fetch("modified_AA_or_NA", query)

    def get_mutated_residues(self, query):
        """Provides a list of modelled instances of mutated amino acids or
        nucleotides in protein, DNA or RNA chains.

        :param query: a 4-character PDB id code

        ::

            p.get_mutated_residues('1bgj')

        """
        return self._fetch("mutated_AA_or_NA", query)

    def get_release_status(self, query):
        """Provides status of a PDB entry (released, obsoleted, on-hold etc)
        along with some other information such as authors, title,
        experimental method, etc.

        :param query: a 4-character PDB id code

        ::

            p.get_release_status('1cbs')

        """
        return self._fetch("status", query)

    def get_observed_ranges(self, query):
        """Provides observed ranges, i.e., segments of structural coverage
        of polymeric molecules that are modelled fully or partly

        :param query: a 4-character PDB id code

        ::

            p.get_observed_ranges('1cbs')

        """
        return self._fetch("polymer_coverage", query)

    def get_observed_ranges_in_pdb_chain(self, query, chain_id):
        """Provides observed ranges, i.e., segments of structural coverage
        of polymeric molecules in a particular chain

        :param query: a 4-character PDB id code
        :param chain_id: a PDB chain ID

        ::

            p.get_observed_ranges_in_pdb_chain('1cbs', "A")

        """
        assert len(query) == 4, "a 4-character PDB id code is required"
        res = self.services.http_get("polymer_coverage/{}/chain/{}".format(
            query, chain_id))
        return self._return(res)

    def get_secondary_structure(self, query):
        """Provides residue ranges of regular secondary structure (alpha
        helices and beta strands) found in protein chains of the entry.

        For strands, sheet id can be used to identify a beta sheet.

        :param query: a 4-character PDB id code

        ::

            p.get_secondary_structure('1cbs')

        """
        return self._fetch("secondary_structure", query,
                           post_endpoint="secondary_structure/")

    def get_residue_listing(self, query):
        """Provides lists all residues (modelled or otherwise) in the entry.

        Except waters, along with details of the fraction of expected atoms
        modelled for the residue and any alternate conformers.

        :param query: a single 4-character PDB id code (string)
        :raises ValueError: if *query* is not one id given as a string

        ::

            p.get_residue_listing('1cbs')

        """
        query = self._check_single_id(query)
        res = self.services.http_get("residue_listing/{}".format(query))
        return self._return(res)

    def get_residue_listing_in_pdb_chain(self, query, chain_id):
        """Provides all residues (modelled or otherwise) of one chain

        Except waters, along with details of the fraction of expected atoms
        modelled for the residue and any alternate conformers.

        :param query: a single 4-character PDB id code (string)
        :param chain_id: a PDB chain ID
        :raises ValueError: if *query* is not one id given as a string

        ::

            p.get_residue_listing_in_pdb_chain('1cbs', 'A')

        """
        query = self._check_single_id(query)
        # The chain id is part of the path; it was previously dropped.
        res = self.services.http_get("residue_listing/{}/chain/{}".format(
            query, chain_id))
        return self._return(res)

    def get_binding_sites(self, query):
        """Provides details on binding sites in the entry

        STRUCT_SITE records in PDB files (or mmcif equivalent thereof), such
        as ligand, residues in the site, description of the site, etc.

        :param query: a 4-character PDB id code

        ::

            p.get_binding_sites('1cbs')

        """
        return self._fetch("binding_sites", query)

    def get_files(self, query):
        """Provides URLs and brief descriptions (labels) for PDB entry

        Also, for mmcif files, biological assembly files, FASTA file for
        sequences, SIFTS cross reference XML files, validation XML files,
        X-ray structure factor file, NMR experimental constraints files, etc.

        :param query: a 4-character PDB id code

        ::

            p.get_files('1cbs')

        """
        return self._fetch("files", query)

    def get_observed_residues_ratio(self, query):
        """Provides the ratio of observed residues for each chain in each
        molecule

        The list of chains within an entity is sorted by observed_ratio
        (descending order), partial_ratio (ascending order), and
        number_residues (descending order).

        :param query: a 4-character PDB id code

        ::

            p.get_observed_residues_ratio('1cbs')

        """
        return self._fetch("observed_residues_ratio", query)

    def get_assembly(self, query):
        """Provides information for each assembly of a given PDB ID.

        This information is broken down at the entity level for each
        assembly. The information given includes the molecule name, type
        and class, the chains where the molecule occur, and the number of
        copies of each entity in the assembly.

        :param query: a 4-character PDB id code

        ::

            p.get_assembly('1cbs')

        """
        return self._fetch("assembly", query)

    def get_electron_density_statistics(self, query):
        """This call details the statistics for electron density.

        :param query: a 4-character PDB id code

        ::

            p.get_electron_density_statistics('1cbs')

        """
        return self._fetch("electron_density_statistics", query)

    def get_functional_annotation(self, query):
        """Provides functional annotation of all ligands, i.e. 'bound'

        :param query: a 4-character PDB id code

        ::

            p.get_functional_annotation('1cbs')

        """
        return self._fetch("cofactor", query)

    def get_drugbank_annotation(self, query):
        """This call provides DrugBank annotation of all ligands, i.e.
        'bound'

        :param query: a 4-character PDB id code

        ::

            p.get_drugbank_annotation('5hht')

        """
        return self._fetch("drugbank", query)

    def get_related_dataset(self, query):
        """Provides DOIs for related raw experimental datasets

        Includes diffraction image data, small-angle scattering data and
        electron micrographs.

        :param query: a 4-character PDB id code

        ::

            p.get_related_dataset('5o8b')

        """
        return self._fetch("related_experiment_data", query)
class Panther():
    """Interface to `Panther <http://www.pantherdb.org/services/oai/pantherdb>`_ pages

    ::

        >>> from bioservices import Panther
        >>> p = Panther()
        >>> p.get_supported_genomes()
        >>> p.get_ortholog("zap70", 9606)

        >>> from bioservices import Panther
        >>> p = Panther()
        >>> taxon = [x['taxon_id'] for x in p.get_supported_genomes()
        ...          if "coli" in x['name'].lower()]
        >>> # you may also use our method called get_taxon_id
        >>> taxon = p.get_taxon_id(pattern="coli")
        >>> res = p.get_mapping("abrB,ackA,acuI", taxon)

    The get_mapping returns for each gene ID the GO terms corresponding to
    each ID. Those go terms may belong to different categories (see
    :meth:`get_annotation_datasets`):

    - MF for molecular function
    - BP for biological process
    - PC for Protein class
    - CC Cellular location
    - Pathway

    Note that results from the website application http://pantherdb.org/
    do not agree with the output of the get_mapping service... Try out the
    dgt gene from ecoli for example
    """
    _url = "http://www.pantherdb.org/services/oai/pantherdb"

    def __init__(self, verbose=True, cache=False):
        """**Constructor**

        :param verbose: set to False to prevent informative messages
        :param cache: forwarded to the underlying REST service
        """
        self.services = REST(name="Panther", url=Panther._url,
                             verbose=verbose, cache=cache)
        self._allPathwaysURL = "http://www.pantherdb.org/pathway/pathwayList.jsp"

    def get_pathways(self):
        """Returns all pathways from pantherdb"""
        return self.services.http_get("supportedpantherpathways")

    def get_supported_genomes(self, type=None):
        """Returns list of supported organisms.

        :param type: can be chrLoc to restrict the search
        :return: the list found under search/output/genomes/genome in the
            service answer
        """
        params = {} if type is None else {'type': type}
        res = self.services.http_get("supportedgenomes", params=params)
        return list(res["search"]["output"]["genomes"]['genome'])

    def get_taxon_id(self, pattern=None):
        """Return the taxon ids supported by the service

        If pattern is provided, only the organisms whose name contains the
        pattern (case insensitive) are kept. If exactly one organism
        matches, its taxon id is returned directly; otherwise a list of
        candidate taxon ids is returned.
        """
        genomes = self.get_supported_genomes()
        if not pattern:
            return [x["taxon_id"] for x in genomes]

        needle = pattern.lower()
        taxon = [x['taxon_id'] for x in genomes if needle in x['name'].lower()]
        if len(taxon) == 1:
            return taxon[0]
        return taxon

    def get_mapping(self, gene_list, taxon):
        """Map identifiers

        Each identifier to be delimited by comma i.e. ','. Maximum of 1000
        Identifiers can be any of the following: Ensemble gene identifier,
        Ensemble protein identifier, Ensemble transcript identifier, Entrez
        gene id, gene symbol, NCBI GI, HGNC Id, International protein index
        id, NCBI UniGene id, UniProt accession and UniProt id

        :param gene_list: see above
        :param taxon: one taxon ID. See supported
            :meth:`~bioservices.panther.Panther.get_supported_genomes`
        :return: a dictionary with two keys, "mapped" and "unmapped", each
            holding a list

        .. warning:: found and not found identifiers are dispatched into
            unmapped and mapped genes. If there are not found identifiers,
            the input gene list and the mapped genes list do not have the
            same length. The input names are not stored in the output.
            Developers should be aware of that feature.
        """
        params = {"geneInputList": gene_list, "organism": taxon}
        res = self.services.http_post("geneinfo", params=params, frmt='json')
        search = res['search']

        if "mapped_genes" in search:
            mapped_genes = search['mapped_genes']['gene']
            # One identifier comes back as a dictionary, several as a list
            # of dictionaries; always hand a list back to the caller.
            if "accession" in mapped_genes:
                mapped_genes = [mapped_genes]
        else:
            mapped_genes = [{}]

        if "unmapped_list" in search:
            unmapped_genes = search['unmapped_list']["unmapped"]
            if not isinstance(unmapped_genes, list):
                unmapped_genes = [unmapped_genes]
        else:
            unmapped_genes = []

        # Only complain when something was really left unmapped.
        if unmapped_genes:
            logger.warning("Some identifiers were not found")

        return {"unmapped": unmapped_genes, "mapped": mapped_genes}

    def get_enrichment(self, gene_list, organism, annotation,
                       enrichment_test="Fisher", correction="FDR",
                       ref_gene_list=None):
        """Returns over represented genes

        Compares a test gene list to a reference gene list, and determines
        whether a particular class (e.g. molecular function, biological
        process, cellular component, PANTHER protein class, the PANTHER
        pathway or Reactome pathway) of genes is overrepresented or
        underrepresented.

        :param gene_list: comma-delimited identifiers of the test list
        :param organism: a valid taxon ID
        :param enrichment_test: either **Fisher** or **Binomial** test
        :param correction: correction for multiple testing. Either **FDR**,
            **Bonferroni**, or **None**.
        :param annotation: one of the supported PANTHER annotation data
            types. See
            :meth:`~bioservices.panther.Panther.get_annotation_datasets`
            to retrieve a list of supported annotation data types
        :param ref_gene_list: if not specified, the system will use all the
            genes for the specified organism. Otherwise, a list delimited
            by comma. Maximum of 100000 Identifiers can be any of the
            following: Ensemble gene identifier, Ensemble protein
            identifier, Ensemble transcript identifier, Entrez gene id,
            gene symbol, NCBI GI, HGNC Id, International protein index id,
            NCBI UniGene id, UniProt accession and UniProt id.

        :return: the content of the 'results' key of the service answer
            when present, otherwise the raw answer (e.g. an error code).
        :raises AssertionError: on an unknown test, correction or
            annotation

        ::

            >>> from bioservices import Panther
            >>> p = Panther()
            >>> res = p.get_enrichment('zap70,mek1,erk', 9606, "GO:0008150")
            >>> # For molecular function, use:
            >>> res = p.get_enrichment('zap70,mek1,erk', 9606,
            ...     "ANNOT_TYPE_ID_PANTHER_GO_SLIM_MF")

        """
        assert enrichment_test.lower() in ['fisher', 'binomial']
        if correction is None:
            correction = 'none'

        assert correction.lower() in ['fdr', 'bonferroni', 'none']

        # This is a bug in panther DB where they used bonferonni. should be
        # bonferroni...
        if correction.lower() == "bonferroni":
            correction = "bonferonni"
        assert annotation in [x['id'] for x in self.get_annotation_datasets()]

        params = {'enrichmentTestType': enrichment_test.upper()}
        params['organism'] = organism
        if gene_list:
            params['geneInputList'] = gene_list
        if ref_gene_list:
            params['refInputList'] = ref_gene_list
        params['annotDataSet'] = annotation
        params['correction'] = correction.upper()

        # A failing request now propagates its own exception instead of
        # tripping over an unbound local in a catch-all handler.
        res = self.services.http_post("enrich/overrep", params=params,
                                      frmt="json")
        try:
            return res['results']
        except (KeyError, TypeError, IndexError):
            # No 'results' entry (e.g. an error code): return the raw answer
            return res

    def get_annotation_datasets(self):
        """Retrieve the list of supported annotation data sets"""
        res = self.services.http_get("supportedannotdatasets")
        return res["search"]["annotation_data_sets"]["annotation_data_type"]

    def get_ortholog(self, gene_list, organism, target_organism=None,
                     ortholog_type="all"):
        """search for matching orthologs in target organisms.

        Searches for matching orthologs in the gene family that contains
        the search gene associated with the search terms. Returns ortholog
        genes in target organisms given a search organism, the search terms
        and a list of target organisms.

        :param gene_list: comma-delimited identifiers to search for
        :param organism: a valid taxon ID
        :param target_organism: zero or more taxon IDs separated by ','. See
            :meth:`~bioservices.panther.Panther.get_supported_genomes`
        :param ortholog_type: optional parameter to specify ortholog type
            of target organism ('LDO' or 'all')
        :return: a dictionary with "mapped" and "unmapped" keys. "unmapped"
            is always a list (empty when the answer reports none).
        """
        assert ortholog_type in ['LDO', 'all']
        params = {
            "geneInputList": gene_list,
            "organism": organism,
            "orthologType": ortholog_type
        }
        if target_organism is not None:
            params["targetOrganism"] = target_organism

        res = self.services.http_get("ortholog/matchortho", frmt='json',
                                     params=params)
        mapping = res['search']['mapping']
        mapped = mapping['mapped']

        try:
            unmapped = mapping['unmapped_ids']['unmapped']
        except (KeyError, TypeError):
            unmapped = []
        else:
            # make sure we always have a list
            if isinstance(unmapped, dict):
                unmapped = [unmapped]
        return {"unmapped": unmapped, "mapped": mapped}

    def get_homolog_position(self, gene, organism, position,
                             ortholog_type="all"):
        """Query the ortholog/homologpos service for one gene and position

        :param gene: Can be any of the following: Ensemble gene identifier,
            Ensemble protein identifier, Ensemble transcript identifier,
            Entrez gene id, gene symbol, NCBI GI, HGNC Id, International
            protein index id, NCBI UniGene id, UniProt accession and
            UniProt id
        :param organism: a valid taxon ID
        :param position: a position, at least 1
        :param ortholog_type: optional parameter to specify ortholog type
            of target organism ('LDO' or 'all')
        :return: the 'mapped' entry of the answer if present, else the
            'unmapped_ids' entry if present, else None
        """
        if "," in gene:
            logger.warning(
                "did not expect a comma. Please provide only one gene name")

        assert ortholog_type in ['LDO', 'all']
        assert position >= 1

        params = {
            "gene": gene,
            "organism": organism,
            "pos": position,
            "orthologType": ortholog_type
        }
        res = self.services.http_get("ortholog/homologpos", params=params,
                                     frmt="json")
        mapping = res['search']['mapping']
        if "mapped" in mapping:
            return mapping['mapped']
        if "unmapped_ids" in mapping:
            logger.warning("did not find any match for {}".format(gene))
            return mapping["unmapped_ids"]
        return None

    def get_supported_families(self, N=1000, progress=True):
        """Returns the list of supported PANTHER family IDs

        This services returns only 1000 items per request. This is defined
        by the index. For instance index set to 1 returns the first 1000
        families. Index set to 1001 returns the next 1000 and so on. As of
        20 Feb 2020, there was about 15,000 families.

        This function simplifies your life by calling the service as many
        times as required. Therefore it returns all families in one go.

        :param int N: number of items the service returns per request
        :param bool progress: show a progress bar (requires easydev)
        :raises ValueError: if the first page does not hold N items
        """
        params = {'startIndex': 1}
        res = self.services.http_get("supportedpantherfamilies", params=params)
        results = res['search']['panther_family_subfam_list']['family']
        if len(results) != N:
            msg = "looks like the services changed. Call this function with N={}"
            msg = msg.format(len(results))
            raise ValueError(msg)

        number_of_families = int(res['search']['number_of_families'])
        # Ceiling division: an exact multiple of N must not trigger a
        # request for a page that starts past the last family.
        n_pages = -(-number_of_families // N)

        pb = None
        if progress:
            # Imported lazily so the dependency is only needed for the bar.
            from easydev import Progress
            pb = Progress(n_pages)
            pb.animate(1)

        for i in range(1, n_pages):
            params = {'startIndex': i * N + 1}
            res = self.services.http_get("supportedpantherfamilies",
                                         params=params)
            data = res['search']['panther_family_subfam_list']['family']
            results.extend(data)
            if pb is not None:
                pb.animate(i + 1)
        return results

    def get_family_ortholog(self, family, taxon_list=None):
        """Search for matching orthologs in target organisms

        Also return the corresponding position in the target organism
        sequence. The system searches for matching orthologs in the gene
        family that contains the search gene associated with the search
        term.

        :param family: Family ID
        :param taxon_list: Zero or more taxon IDs separated by ','.
        """
        params = {"family": family}
        if taxon_list:
            params['taxonFltr'] = taxon_list
        res = self.services.http_get("familyortholog", params=params,
                                     frmt="json")
        return res['search']['ortholog_list']['ortholog']

    def get_family_msa(self, family, taxon_list=None):
        """Returns MSA information for the specified family.

        :param family: family ID
        :param taxon_list: Zero or more taxon IDs separated by ','.
        """
        params = {"family": family}
        if taxon_list:
            params['taxonFltr'] = taxon_list
        res = self.services.http_get("familymsa", params=params, frmt="json")
        return res['search']['MSA_list']['sequence_info']

    def get_tree_info(self, family, taxon_list=None):
        """Returns tree topology information and node attributes for the
        specified family.

        :param family: Family ID
        :param taxon_list: Zero or more taxon IDs separated by ','.
        :return: the whole 'search' entry of the service answer
        """
        params = {"family": family}
        if taxon_list:
            params['taxonFltr'] = taxon_list
        res = self.services.http_get("treeinfo", params=params, frmt="json")
        return res['search']
class MyGeneInfo():
    """Interface to `mygene.info <http://mygene.info>`_ service

    .. doctest::

        >>> from bioservices import MyGeneInfo
        >>> s = MyGeneInfo()

    """

    def __init__(self, verbose=False, cache=False):
        """.. rubric:: Constructor

        :param bool verbose: prints informative messages (default is off)
        :param bool cache: forwarded to the underlying REST service
        """
        url = "https://mygene.info/v3"
        # The service used to be registered under the name "PDBe"
        # (copy-paste from another class).
        self.services = REST(name="MyGeneInfo", url=url, verbose=verbose,
                             cache=cache)

    def _post_headers(self):
        """Headers sent with the form-encoded POST requests."""
        return {
            "User-Agent": self.services.getUserAgent(),
            "accept": "application/json",
            "Content-Type": "application/x-www-form-urlencoded"
        }

    def get_genes(self, ids,
                  fields="symbol,name,taxid,entrezgene,ensemblgene",
                  species=None, dotfield=True, email=None):
        """Get matching gene objects for a list of gene ids

        :param ids: list of geneinfo IDs
        :param str fields: a comma-separated fields to limit the fields
            returned from the matching gene hits. The supported field names
            can be found from any gene object (e.g.
            http://mygene.info/v3/gene/1017). Note that it supports dot
            notation as well, e.g., you can pass "refseq.rna". If
            "fields=all", all available fields will be returned. Default:
            "symbol,name,taxid,entrezgene,ensemblgene".
        :param str species: can be used to limit the gene hits from given
            species. You can use "common names" for nine common species
            (human, mouse, rat, fruitfly, nematode, zebrafish, thale-cress,
            frog and pig). All other species, you can provide their
            taxonomy ids. Multiple species can be passed using comma as a
            separator. Not sent to the service when left to None.
        :param dotfield: control the format of the returned fields when
            passed "fields" parameter contains dot notation, e.g.
            "fields=refseq.rna". If True the returned data object contains
            a single "refseq.rna" field, otherwise (False), a single
            "refseq" field with a sub-field of "rna". Default: True.
        :param str email: If you are regular users of this services, the
            mygeneinfo maintainers/authors encourage you to provide an
            email, so that they can better track the usage or follow up
            with you.

        ::

            mgi = MyGeneInfo()
            mgi.get_genes(("301345,22637"))
            # first one is rat, second is mouse. This will return a
            # 'notfound' entry and the second entry as expected.
            mgi.get_genes("301345,22637", species="mouse")

        """
        params = {"ids": ids, "fields": fields}
        if email:  # pragma: no cover
            params["email"] = email

        assert dotfield in [True, False]
        params["dotfield"] = dotfield

        if species:
            params["species"] = species

        # The parameters travel in the request body (form-encoded).
        res = self.services.http_post("gene", data=params, frmt="json",
                                      headers=self._post_headers())
        return res

    def get_one_gene(self, geneid,
                     fields="symbol,name,taxid,entrezgene,ensemblgene",
                     dotfield=True, email=None):
        """Get matching gene objects for one gene id

        :param geneid: a valid gene ID
        :param str fields: a comma-separated fields to limit the fields
            returned from the matching gene hits. The supported field names
            can be found from any gene object (e.g.
            http://mygene.info/v3/gene/1017). Note that it supports dot
            notation as well, e.g., you can pass "refseq.rna". If
            "fields=all", all available fields will be returned. Default:
            "symbol,name,taxid,entrezgene,ensemblgene".
        :param dotfield: control the format of the returned fields when
            passed "fields" parameter contains dot notation, e.g.
            "fields=refseq.rna". If True the returned data object contains
            a single "refseq.rna" field, otherwise (False), a single
            "refseq" field with a sub-field of "rna". Default: True.
        :param str email: If you are regular users of this services, the
            mygeneinfo maintainers/authors encourage you to provide an
            email, so that they can better track the usage or follow up
            with you.

        ::

            mgi = MyGeneInfo()
            mgi.get_one_gene("301345")

        """
        params = {"ids": geneid, "fields": fields}
        if email:  # pragma: no cover
            params["email"] = email

        assert dotfield in [True, False]
        params["dotfield"] = dotfield

        res = self.services.http_get(f"gene/{geneid}", params=params,
                                     frmt="json")
        return res

    def get_one_query(self, query, email=None, dotfield=True,
                      fields="symbol,name,taxid,entrezgene,ensemblgene",
                      species="human,mouse,rat", size=10, _from=0, sort=None,
                      facets=None, entrezonly=False, ensemblonly=False):
        """Make gene query and return matching gene list. Support JSONP and
        CORS as well.

        :param str query: Query string. Examples "CDK2", "NM_052827",
            "204639_at", "chr1:151,073,054-151,383,976",
            "hg19.chr1:151073054-151383976". The detailed query syntax can
            be found from the mygene.info docs.
        :param str fields: a comma-separated fields to limit the fields
            returned from the matching gene hits. The supported field names
            can be found from any gene object (e.g.
            http://mygene.info/v3/gene/1017). Note that it supports dot
            notation as well, e.g., you can pass "refseq.rna". If
            "fields=all", all available fields will be returned. Default:
            "symbol,name,taxid,entrezgene,ensemblgene".
        :param str species: can be used to limit the gene hits from given
            species. You can use "common names" for nine common species
            (human, mouse, rat, fruitfly, nematode, zebrafish, thale-cress,
            frog and pig). All other species, you can provide their
            taxonomy ids. Multiple species can be passed using comma as a
            separator. Default: human,mouse,rat. Pass None (or an empty
            string) to leave the choice to the service.
        :param int size: the maximum number of matching gene hits to return
            (with a cap of 1000 at the moment). Default: 10.
        :param int _from: the number of matching gene hits to skip, starting
            from 0. Combining with "size" parameter, this can be useful for
            paging. Default: 0.
        :param sort: the comma-separated fields to sort on. Prefix with "-"
            for descending order, otherwise in ascending order. Default:
            sort by matching scores in descending order.
        :param str facets: a single field or comma-separated fields to
            return facets, for example, "facets=taxid",
            "facets=taxid,type_of_gene".
        :param bool entrezonly: when passed as True, the query returns only
            the hits with valid Entrez gene ids. Default: False.
        :param bool ensemblonly: when passed as True, the query returns only
            the hits with valid Ensembl gene ids. Default: False.
        :param dotfield: control the format of the returned fields when
            passed "fields" parameter contains dot notation, e.g.
            "fields=refseq.rna". If True the returned data object contains
            a single "refseq.rna" field, otherwise (False), a single
            "refseq" field with a sub-field of "rna". Default: True.
        :param str email: If you are regular users of this services, the
            mygeneinfo maintainers/authors encourage you to provide an
            email, so that they can better track the usage or follow up
            with you.

        """
        # The query goes through ``params`` so that it gets URL-encoded
        # together with the other arguments.
        params = {"q": query, "fields": fields, "size": size, "from": _from}
        if email:  # pragma: no cover
            params["email"] = email

        assert dotfield in [True, False]
        params["dotfield"] = dotfield

        if species:
            params["species"] = species
        if sort:
            params["sort"] = sort
        if facets:  # pragma: no cover
            params["facets"] = facets

        assert entrezonly in [True, False]
        params["entrezonly"] = entrezonly
        assert ensemblonly in [True, False]
        params["ensemblonly"] = ensemblonly

        res = self.services.http_get("query", params=params, frmt="json")
        return res

    def get_queries(
        self,
        query,
        email=None,
        dotfield=True,
        scopes="all",
        species="human,mouse,rat",
        fields="symbol,name,taxid,entrezgene,ensemblgene",
    ):
        """Make gene query and return matching gene list. Support JSONP and
        CORS as well.

        :param str query: Query string. Examples "CDK2", "NM_052827",
            "204639_at", "chr1:151,073,054-151,383,976",
            "hg19.chr1:151073054-151383976". The detailed query syntax can
            be found from the mygene.info docs.
        :param str fields: a comma-separated fields to limit the fields
            returned from the matching gene hits. The supported field names
            can be found from any gene object (e.g.
            http://mygene.info/v3/gene/1017). Note that it supports dot
            notation as well, e.g., you can pass "refseq.rna". If
            "fields=all", all available fields will be returned. Default:
            "symbol,name,taxid,entrezgene,ensemblgene".
        :param str species: can be used to limit the gene hits from given
            species. You can use "common names" for nine common species
            (human, mouse, rat, fruitfly, nematode, zebrafish, thale-cress,
            frog and pig). All other species, you can provide their
            taxonomy ids. Multiple species can be passed using comma as a
            separator. Default: human,mouse,rat. Pass None (or an empty
            string) to leave the choice to the service.
        :param dotfield: control the format of the returned fields when
            passed "fields" parameter contains dot notation, e.g.
            "fields=refseq.rna". If True the returned data object contains
            a single "refseq.rna" field, otherwise (False), a single
            "refseq" field with a sub-field of "rna". Default: True.
        :param str email: If you are regular users of this services, the
            mygeneinfo maintainers/authors encourage you to provide an
            email, so that they can better track the usage or follow up
            with you.
        :param str scopes: not documented. Set to 'all'
        """
        params = {"q": query, "fields": fields, "scopes": scopes}
        if email:  # pragma: no cover
            params["email"] = email
        assert dotfield in [True, False]
        params["dotfield"] = dotfield
        if species:
            params["species"] = species

        res = self.services.http_post("query", params=params, frmt="json",
                                      headers=self._post_headers())
        return res

    def get_metadata(self):
        """Return the answer of the ``metadata`` endpoint."""
        res = self.services.http_get("metadata", frmt="json")
        return res

    def get_taxonomy(self):
        """Return the 'taxonomy' entry of the ``metadata`` endpoint."""
        res = self.services.http_get("metadata", frmt="json")
        return res['taxonomy']
class Seqret():
    """Interface to the `Seqret <http://www.ebi.ac.uk/readseq>`_ service

    ::

        >>> from bioservices import *
        >>> s = Seqret()

    The ReadSeq service was replaced by the Seqret service (2015).

    .. versionchanged:: 0.15
    """

    # Optional tool parameters whose value is validated against the list of
    # accepted values published by the service before a job is submitted.
    _checked_options = (
        'stype', 'inputformat', 'outputformat', "feature", "firstonly",
        "reverse", 'outputcase', 'seqrange',
    )

    def __init__(self, verbose=True):
        """.. rubric:: Constructor

        :param bool verbose: passed to the underlying REST service object.
        """
        url = "https://www.ebi.ac.uk/Tools/services/rest/emboss_seqret"
        self.services = REST(name="seqret", url=url, verbose=verbose)
        self._parameters = None
        # Identifier of the last job submitted through :meth:`run`.
        self._jobid = None

    def get_parameters(self):
        """Get a list of the parameter names.

        :returns: a list of strings giving the names of the parameters.
        """
        parameters = self.services.http_get("parameters", frmt="json")
        return parameters['parameters']

    def _get_parameters(self):
        # Fetch once; a failed fetch leaves the cache empty so that the next
        # access tries again.
        if not self._parameters:
            self._parameters = self.get_parameters()
        return self._parameters

    parameters = property(_get_parameters, doc="Get list of parameter names")

    def get_parameter_details(self, parameterId):
        """Get details of a specific parameter.

        :param str parameterId: identifier/name of the parameter to fetch
            details of.
        :return: a data structure describing the parameter and its values.
        :raises ValueError: if *parameterId* is not one of :attr:`parameters`.

        ::

            s = Seqret()
            print(s.get_parameter_details("stype"))

        """
        if parameterId not in self.parameters:
            raise ValueError(
                "Invalid parameterId provided(%s). See parameters attribute" %
                parameterId)
        return self.services.http_get("parameterdetails/" + parameterId, frmt="json")

    def run(self, email, title, **kargs):
        """Submit a job to the service.

        :param str email: user e-mail address.
        :param str title: job title.
        :param kargs: parameters for the tool as returned by
            :meth:`get_parameter_details`. ``sequence`` is compulsory.
        :return: string containing the job identifier (jobId).
        :raises ValueError: if ``sequence`` is missing.

        Deprecated (old readseq service)::

            Format Name         Value
            Auto-detected       0
            EMBL                4
            GenBank             2
            Fasta(Pearson)      8
            Clustal/ALN         22
            ACEDB               25
            BLAST               20
            DNAStrider          6
            FlatFeat/FFF        23
            GCG                 5
            GFF                 24
            IG/Stanford         1
            MSF                 15
            NBRF                3
            PAUP/NEXUS          17
            Phylip(Phylip4)     12
            Phylip3.2           11
            PIR/CODATA          14
            Plain/Raw           13
            SCF                 21
            XML                 19

        As output, you also have Pretty 18

        ::

            s = Seqret()
            jobid = s.run("*****@*****.**", "test", sequence=fasta,
                          inputformat=8, outputformat=2)
            genbank = s.get_result(jobid)

        """
        # Fail before any request is made; an ``assert`` would be stripped
        # when Python runs with -O.
        if "sequence" not in kargs:
            raise ValueError("a 'sequence' keyword argument is required")

        check = self.services.devtools.check_param_in_list
        for name in kargs:
            check(name, self.parameters)

        params = {"email": email, "title": title}
        for name in self._checked_options:
            if name not in kargs:
                continue
            value = kargs[name]
            details = self.get_parameter_details(name)
            valid_values = [x['value'] for x in details['values']['values']]
            # The accepted values are compared with the string form of the
            # user value.
            check(str(value), valid_values)
            params[name] = value

        params['sequence'] = kargs['sequence']
        jobid = self.services.http_post("run", frmt="txt", data=params)
        self._jobid = jobid
        return jobid

    def get_status(self, jobid=None):
        """Get the status of a submitted job.

        :param str jobid: job identifier. When omitted, the identifier of the
            last job submitted with :meth:`run` is used.
        :return: string containing the status.
        :raises ValueError: if no identifier is given and no job was submitted.

        The values for the status are:

        - RUNNING: the job is currently being processed.
        - FINISHED: job has finished, and the results can then be retrieved.
        - ERROR: an error occurred attempting to get the job status.
        - FAILURE: the job failed.
        - NOT_FOUND: the job cannot be found.

        """
        if jobid is None:
            # getattr: tolerate instances on which run() was never called.
            jobid = getattr(self, "_jobid", None)
        if jobid is None:
            raise ValueError(
                "no jobid provided and no job has been submitted with run()")
        return self.services.http_get("status/{}".format(jobid), frmt="txt")

    def get_result_types(self, jobid):
        """Get the available result types for a finished job.

        :param str jobid: job identifier.
        :return: a list with the identifier of each available result type.
        """
        res = self.services.http_get("resulttypes/{}".format(jobid), frmt="json")
        return [x['identifier'] for x in res["types"]]

    def get_result(self, jobid, result_type="out"):
        """Get the result of a job of the specified type.

        :param str jobid: job identifier.
        :param str result_type: one of the identifiers returned by
            :meth:`get_result_types` (default ``"out"``).
        :return: the result as text, or None (after logging a warning) when
            the job is not finished yet.
        """
        if self.get_status(jobid) != 'FINISHED':
            self.services.logging.warning(
                "Your job is not finished yet. Try again later.")
            return
        return self.services.http_get(
            "result/{}/{}".format(jobid, result_type), frmt="txt")
class PDB():
    """Interface to part of the `PDB <http://www.rcsb.org/pdb>`_ service

    :Status: in progress not for production. You can get all ID and retrieve
        uncompressed file in PDB/FASTA formats for now. New features will be
        added on request.

    .. doctest::

        >>> from bioservices import PDB
        >>> s = PDB()
        >>> res = s.get_file("1FBV", "pdb")

    """

    def __init__(self, verbose=False, cache=False):
        """.. rubric:: Constructor

        :param bool verbose: prints informative messages (default is off)
        :param bool cache: passed to the underlying REST service object.
        """
        url = "http://www.rcsb.org/pdb/rest"
        self.services = REST(name="PDB", url=url, verbose=verbose, cache=cache)

    @staticmethod
    def _content_or_raw(parsed):
        """Return ``parsed.content`` when available, else *parsed* itself."""
        # Best effort, as before, but without swallowing KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` did.
        try:
            return parsed.content
        except Exception:
            return parsed

    def search(self, query):
        """Post an XML query to the ``search`` endpoint.

        :param str query: an XML document such as::

            <?xml version="1.0" encoding="UTF-8"?>
            <orgPdbQuery>
            <version>B0907</version>
            <queryType>org.pdb.query.simple.ExpTypeQuery</queryType>
            <description>Experimental Method Search : Experimental Method=SOLID-STATE NMR</description>
            <mvStructure.expMethod.value>SOLID-STATE NMR</mvStructure.expMethod.value>
            </orgPdbQuery>

        :return: the response of the service layer (requested as XML).
        """
        # http_post lives on the REST service object, not on this class.
        return self.services.http_post("search", frmt="xml", data=query)

    def get_current_ids(self):
        """Get a list of all current PDB IDs."""
        res = self.services.http_get("getCurrent", frmt="xml")
        res = self.services.easyXML(res)
        return [x.attrib['structureId'] for x in res.getchildren()]

    def get_file(self, identifier, frmt, compression=False, headerOnly=False):
        """Download a file in a specified format

        :param str identifier: a valid Identifier. See :meth:`get_current_ids`.
        :param str frmt: a valid format in "pdb", "cif", "xml"
        :param bool compression: when True, the gzipped (``.gz``) file is
            requested and returned without XML parsing.
        :param bool headerOnly: sent to the service as ``headerOnly=YES``
            (True) or ``headerOnly=NO`` (False).
        :return: the file content as text; for uncompressed "xml" the content
            wrapped by the service's ``easyXML`` helper.

        .. doctest::

            >>> from bioservices import PDB
            >>> s = PDB()
            >>> res = s.get_file("1FBV", "pdb")
            >>> import tempfile
            >>> fh = tempfile.NamedTemporaryFile()
            >>> fh.write(res)
            >>> # manipulate the PDB file with your favorite tool
            >>> # close the file ONLY when finished (this is temporary file)
            >>> # fh.close()

        reference: http://www.rcsb.org/pdb/static.do?p=download/http/index.html
        """
        check = self.services.devtools.check_param_in_list
        check(frmt, ["pdb", "cif", "xml"])
        check(headerOnly, [True, False])

        params = {'headerOnly': "YES" if headerOnly is True else "NO"}

        query = "files/" + identifier + "." + frmt
        if compression is True:
            query += ".gz"

        if frmt != "xml":
            return self.services.http_get(query, frmt="txt", params=params)

        res = self.services.http_get(query, frmt=frmt, params=params)
        if compression is False:
            # easyXML is provided by the REST service object.
            res = self.services.easyXML(res)
        return res

    def get_ligands(self, identifier):
        """List the ligands that can be found in a PDB entry

        :param identifier: a valid PDB identifier (e.g., 4HHB)
        :return: xml document

            >>> from bioservices import PDB
            >>> s = PDB()
            >>> s.get_ligands("4HHB")

        Then, ::

            x = s.get_ligands("4HHB")
            from pyquery import PyQuery as pq
            d = pq(x)

        """
        # The base URL already ends with "/rest"; every other call in this
        # class uses a path relative to it (see get_ligand_info).
        return self.services.http_get("ligandInfo", frmt='xml',
                                      params={'structureId': identifier})

    def get_xml_query(self, query):
        """Send an XML query

        ::

            query = '<?xml version="1.0" encoding="UTF-8"?>
            <orgPdbQuery>
            <version>B0907</version>
            <queryType>org.pdb.query.simple.ExpTypeQuery</queryType>
            <description>Experimental Method Search : Experimental Method=SOLID-STATE NMR</description>
            <mvStructure.expMethod.value>SOLID-STATE NMR</mvStructure.expMethod.value>
            </orgPdbQuery>
            '

        :param str query: the XML document to post to ``query/post``.
        """
        return self.services.http_post(
            "query/post",
            data=query,
            headers=self.services.get_headers(content='default'))

    def get_go_terms(self, query):
        """Fetch the ``goTerms`` document of a structure.

        :param str query: sent as the ``structureId`` parameter.
        :return: the ``content`` of the parsed XML when available, otherwise
            the parsed object itself.
        """
        res = self.services.http_get("goTerms", params={"structureId": query},
                                     frmt="xml")
        return self._content_or_raw(self.services.easyXML(res))

    def get_ligand_info(self, query):
        """Fetch the ``ligandInfo`` document of a structure.

        :param str query: sent as the ``structureId`` parameter.
        :return: the ``content`` of the parsed XML when available, otherwise
            the parsed object itself.
        """
        res = self.services.http_get("ligandInfo", params={"structureId": query},
                                     frmt="xml")
        return self._content_or_raw(self.services.easyXML(res))
class NCBIblast():
    """Interface to the `NCBIblast <http://blast.ncbi.nlm.nih.gov/>`_ service.

    ::

        >>> from bioservices import *
        >>> s = NCBIblast(verbose=False)
        >>> jobid = s.run(program="blastp", sequence=s._sequence_example,
                stype="protein", database="uniprotkb", email="name@provider")
        >>> s.get_result(jobid, "out")

    .. warning:: It is very important to provide a real e-mail address as your
        job otherwise very likely will be killed and your IP, Organisation or
        entire domain black-listed.

    When running a blast request, a program is required. You can obtain the
    list using::

        >>> s.get_parameter_details("program")
        [u'blastp', u'blastx', u'blastn', u'tblastx', u'tblastn']

    * blastn: Search a nucleotide database using a nucleotide query
    * blastp: Search protein database using a protein query
    * blastx: Search protein database using a translated nucleotide query
    * tblastn Search translated nucleotide database using a protein query
    * tblastx Search translated nucleotide database using a translated
      nucleotide query

    """
    _sequence_example = "MDSTNVRSGMKSRKKKPKTTVIDDDDDCMTCSACQSKLVKISDITKVSLDYINTMRGNTLACAACGSSLKLLNDFAS"

    # Result types that :meth:`get_result` knows how to fetch, mapped to the
    # format requested from the service layer.
    _result_formats = {
        "out": "txt",
        "error": "txt",
        "sequence": "txt",
        "ids": "txt",
        "xml": "xml",
    }

    def __init__(self, verbose=False):
        """.. rubric:: NCBIblast constructor

        :param bool verbose: prints informative messages

        """
        url = "http://www.ebi.ac.uk/Tools/services/rest/ncbiblast"
        self.services = REST(name="NCBIblast", url=url, verbose=verbose)
        self._parameters = None
        self._parametersDetails = {}
        # Seconds between two status requests in :meth:`wait`.
        self.checkInterval = 2

    def _request_headers(self, mime, key="accept"):
        """Build the headers of a request accepting the *mime* type."""
        return {"User-Agent": self.services.getUserAgent(), key: mime}

    def get_parameters(self):
        """List parameter names.

        :returns: the ``parameters`` entry of the JSON document returned by
            the service.

        ::

            >>> from bioservices import ncbiblast
            >>> n = ncbiblast.NCBIblast()
            >>> res = n.get_parameters()

        .. seealso:: :attr:`parameters` for a cached version of this list.
        """
        res = self.services.http_get(
            "parameters", frmt="json",
            headers=self._request_headers("application/json", key="Accept"))
        return res['parameters']

    def _get_parameters(self):
        # Assign only after a successful fetch so that a failure leaves the
        # cache empty.
        if not self._parameters:
            res = self.get_parameters()
            self._parameters = res
        return self._parameters

    parameters = property(_get_parameters)

    def get_parameter_details(self, parameterId):
        """Get detailed information about a parameter.

        :param str parameterId: one of the names found in :attr:`parameters`.
        :returns: the list of values the parameter can take when the response
            has the expected ``values/values`` layout, otherwise the response
            itself. The outcome is cached per parameter.
        :raises ValueError: if *parameterId* is not a known parameter.

        For example::

            >>> s.get_parameter_details("matrix")
            [u'BLOSUM45', u'BLOSUM50', u'BLOSUM62', u'BLOSUM80', u'BLOSUM90',
             u'PAM30', u'PAM70', u'PAM250']

        """
        if parameterId not in self.parameters:
            raise ValueError(
                "Invalid parameterId provided(%s). See parameters attribute" %
                parameterId)

        if parameterId not in self._parametersDetails:
            res = self.services.http_get(
                "parameterdetails/" + parameterId, frmt="json",
                headers=self._request_headers("application/json", key="Accept"))
            try:
                data = [x['value'] for x in res["values"]["values"]]
            except (KeyError, TypeError, IndexError):
                # Unexpected layout (or an error code): hand back the
                # response untouched, as before.
                data = res
            self._parametersDetails[parameterId] = data
        return self._parametersDetails[parameterId]

    def run(self, program=None, database=None, sequence=None,
            stype="protein", email=None, **kargs):
        """Submit a job with the specified parameters.

        .. python ncbiblast_urllib2.py -D ENSEMBL --email "*****@*****.**" --sequence
        .. MDSTNVRSGMKSRKKKPKTTVIDDDDDCMTCSACQSKLVKISDITKVSLDYINTMRGNTLACAACGSSLKLLNDFAS
        .. --program blastp --database uniprotkb

        .. rubric:: Compulsary arguments

        :param str program: BLAST program to use to perform the search
            (e.g., blastp)
        :param str sequence: query sequence. The use of fasta formatted
            sequence is recommended.
        :param list database: list of database names for search or possible a
            single string (for one database). There are some mismatch between
            the output of get_parameter_details("database") and the accepted
            values. For instance UniProt Knowledgebase should be given as
            "uniprotkb".
        :param str email: a valid email address. Will be checked by the
            service itself.

        .. rubric:: Optional arguments. If not provided, a default value will
            be used

        :param str stype: query sequence type in 'dna', 'rna' or 'protein'
            (default is protein).
        :param str matrix: scoring matrix to be used in the search
            (e.g., BLOSUM45).
        :param bool gapalign: perform gapped alignments.
        :param int alignments: maximum number of alignments displayed in the
            output.
        :param exp: E-value threshold.
        :param bool filter: low complexity sequence filter to process the
            query sequence before performing the search.
        :param int scores: maximum number of scores displayed in the output.
        :param int dropoff: amount score must drop before extension of hits is
            halted.
        :param match_scores: match/miss-match scores to generate a scoring
            matrix for nucleotide searches.
        :param int gapopen: penalty for the initiation of a gap.
        :param int gapext: penalty for each base/residue in a gap.
        :param seqrange: region of the query sequence to use for the search.
            Default: whole sequence.
        :return: A jobid that can be analysed with :meth:`get_result`,
            :meth:`get_status`, ...
        :raises ValueError: if a compulsory argument is missing.
        :raises TypeError: if *database* is neither a string nor a list.

        The up to data values accepted for each of these parameters can be
        retrieved from the :meth:`get_parameter_details`.

        For instance,::

            from bioservices import NCBIblast
            n = NCBIblast()
            n.get_parameter_details("program")

        Example::

            jobid = n.run(program="blastp",
                 sequence=n._sequence_example,
                 stype="protein",
                 database="uniprotkb",
                 email="*****@*****.**")

        Database can be a list of databases::

            database=["uniprotkb", "uniprotkb_swissprot"]

        The returned object is a jobid, which status can be checked. It must
        be finished before analysing/geeting the results.

        .. seealso:: :meth:`get_result`

        .. warning:: Cases are not important. Spaces in the database case
            should be replaced by underscore.

        .. note:: database returned by the server have meaningless names since
            they do not map to the expected names. An example is "ENA Sequence
            Release" that should be provided as em_rel

        http://www.ebi.ac.uk/Tools/sss/ncbiblast/help/index-nucleotide.html
        """
        # There are compulsary arguments:
        if program is None or sequence is None or database is None or email is None:
            raise ValueError(
                "program, sequence, email and database must be provided")

        checkParam = self.services.devtools.check_param_in_list

        # The service validates everything itself; values are checked here
        # only to fail early.
        checkParam(program, self.get_parameter_details("program"))
        checkParam(stype, ["protein", "dna", "rna"])

        params = {
            'program': program,
            'sequence': sequence,
            'email': email,
            'stype': stype,
        }

        # Optional arguments are checked against the values published by the
        # service. stype is not handled here because the published values are
        # not exactly what the service expects.
        for k, v in kargs.items():
            checkParam(v, self.get_parameter_details(k))
            params[k] = v

        # The database is handled by hand: several can be given and the
        # published names do not match the accepted ones.
        if isinstance(database, list):
            databases = database[:]
        elif isinstance(database, str):
            databases = [database]
        else:
            raise TypeError("database must be a string or a list of strings")
        params['database'] = databases

        # IMPORTANT: use data parameter, not params !!!
        return self.services.http_post(
            "run", frmt=None, data=params,
            headers=self._request_headers("text/plain"))

    def get_status(self, jobid):
        """Get status of a submitted job

        :param str jobid: a job identifier returned by :meth:`run`.
        :return: A string giving the jobid status (e.g. FINISHED).

        The values for the status are:

        * RUNNING: the job is currently being processed.
        * FINISHED: job has finished, and the results can then be retrieved.
        * ERROR: an error occurred attempting to get the job status.
        * FAILURE: the job failed.
        * NOT_FOUND: the job cannot be found.

        """
        return self.services.http_get(
            "status/{}".format(jobid), frmt="txt",
            headers=self._request_headers("text/plain"))

    def get_result_types(self, jobid):
        """Get available result types for a finished job.

        Waits (see :meth:`wait`) when the job is not finished yet.

        :param str jobid: a job identifier returned by :meth:`run`.
        :return: the list of the identifiers of the available result types.
        """
        if self.get_status(jobid) != 'FINISHED':
            self.services.logging.warning(
                "waiting for the job to be finished. May take a while")
            # wait() takes the job identifier only.
            self.wait(jobid)
        res = self.services.http_get(
            'resulttypes/' + jobid, frmt="json",
            headers=self._request_headers("application/json"))
        return [x["identifier"] for x in res['types']]

    def get_result(self, jobid, result_type):
        """Get the job result of the specified type.

        :param str jobid: a job identifier returned by :meth:`run`.
        :param str result_type: type of result to retrieve; one of 'out',
            'error', 'sequence', 'ids' (fetched as text) or 'xml'. See
            :meth:`get_result_types`.
        :return: the result in the requested form.
        :raises ValueError: if *result_type* is not supported, or if the job
            is still not finished after waiting.

        The output from the tool itself. Use the 'format' parameter to
        retireve the output in different formats, the 'compressed' parameter
        to retrieve the xml output in compressed form. Format options::

            0 = pairwise,
            1 = query-anchored showing identities,
            2 = query-anchored no identities,
            3 = flat query-anchored showing identities,
            4 = flat query-anchored no identities,
            5 = XML Blast output,
            6 = tabular,
            7 = tabular with comment lines,
            8 = Text ASN.1,
            9 = Binary ASN.1,
            10 = Comma-separated values,
            11 = BLAST archive format (ASN.1).

        See NCBI Blast documentation for details. Use the 'compressed'
        parameter to return the XML output in compressed form. e.g.
        '?format=5&compressed=true'.

        """
        # Reject unsupported types before waiting for the job; the result
        # used to be left unbound for them.
        if result_type not in self._result_formats:
            raise ValueError(
                "unsupported result_type %r; expected one of %s"
                % (result_type, sorted(self._result_formats)))

        if self.get_status(jobid) != 'FINISHED':
            self.services.logging.warning(
                "waiting for the job to be finished. May take a while")
            self.wait(jobid)
        if self.get_status(jobid) != "FINISHED":
            raise ValueError("job is not finished")

        return self.services.http_get(
            'result/' + jobid + '/' + result_type,
            frmt=self._result_formats[result_type],
            headers=self._request_headers("text/plain"))

    def wait(self, jobId):
        """This function checks the status of a jobid while it is running

        The status is requested every :attr:`checkInterval` seconds for as
        long as it is RUNNING or PENDING.

        :param str jobId: a job identifier returned by :meth:`run`.
        :return: the first status that is neither RUNNING nor PENDING.
        :raises ValueError: if :attr:`checkInterval` is below one second.
        """
        if self.checkInterval < 1:
            raise ValueError("checkInterval must be at least one second")
        busy = ('RUNNING', 'PENDING')
        result = 'PENDING'
        while result in busy:
            result = self.get_status(jobId)
            if result in busy:
                time.sleep(self.checkInterval)
        return result

    def _get_database(self):
        return self.get_parameter_details("database")

    databases = property(_get_database, doc=r"""Returns accepted databases.""")
class MUSCLE():
    """Interface to the `MUSCLE <http://www.ebi.ac.uk/Tools/webservices/services/msa/muscle_rest>`_ service.

    ::

        >>> from bioservices import *
        >>> m = MUSCLE(verbose=False)
        >>> sequencesFasta = open('filename','r')
        >>> jobid = m.run(frmt="fasta", sequence=sequencesFasta.read(),
                        email="name@provider")
        >>> m.get_result(jobid, "out")

    .. warning:: It is very important to provide a real e-mail address as your
        job otherwise very likely will be killed and your IP, Organisation or
        entire domain black-listed.

    Here is another similar example but we use
    :class:`~bioservices.uniprot.UniProt` class provided in bioservices to
    fetch the FASTA sequences::

        >>> from bioservices import UniProt, MUSCLE
        >>> u = UniProt(verbose=False)
        >>> f1 = u.get_fasta("P18413")
        >>> f2 = u.get_fasta("P18412")
        >>> m = MUSCLE(verbose=False)
        >>> jobid = m.run(frmt="fasta", sequence=f1+f2, email="name@provider")
        >>> m.get_result(jobid, "out")

    """

    # Result types that :meth:`get_result` fetches as text.
    _text_result_types = ('out', 'sequence', "aln-fasta", "pim", "phylotree")

    def __init__(self, verbose=False):
        """.. rubric:: Constructor

        :param bool verbose: passed to the underlying REST service object.
        """
        url = "http://www.ebi.ac.uk/Tools/services/rest/muscle"
        self.services = REST(name='MUSCLE', url=url, verbose=verbose)
        self._parameters = None
        self._parametersDetails = {}
        self._headers = {
            "User-Agent": self.services.getUserAgent(),
            "accept": "application/json"
        }

    def _text_headers(self):
        """Build the headers of a request accepting plain text."""
        return {
            "User-Agent": self.services.getUserAgent(),
            "accept": "text/plain"
        }

    def get_parameters(self):
        """List parameter names.

        :returns: the ``parameters`` entry of the JSON document returned by
            the service.

        ::

            >>> from bioservices import MUSCLE
            >>> m = MUSCLE()
            >>> res = m.get_parameters()

        .. seealso:: :attr:`parameters` for a cached version of this list.
        """
        res = self.services.http_get("parameters", frmt="json",
                                     headers=self._headers)
        return res['parameters']

    def _get_parameters(self):
        # Assign only after a successful fetch so that a failure leaves the
        # cache empty.
        if not self._parameters:
            res = self.get_parameters()
            self._parameters = res
        return self._parameters

    parameters = property(_get_parameters)

    def get_parameter_details(self, parameterId):
        """Get detailed information about a parameter.

        :param str parameterId: one of the names found in :attr:`parameters`.
        :returns: the decoded JSON description of the parameter. The answer is
            cached, so the service is asked only once per parameter.
        :raises ValueError: if *parameterId* is not a known parameter.

        For example::

            >>> m.get_parameter_details("format")

        """
        if parameterId not in self.parameters:
            raise ValueError(
                "Invalid parameterId provided(%s). See parameters attribute" %
                parameterId)

        if parameterId not in self._parametersDetails:
            self._parametersDetails[parameterId] = self.services.http_get(
                "parameterdetails/" + parameterId, frmt="json",
                headers=self._headers)
        # Always answer from the cache: the local response variable does not
        # exist on a cache hit.
        return self._parametersDetails[parameterId]

    def run(self, frmt=None, sequence=None, tree="none", email=None):
        """Submit a job with the specified parameters.

        .. rubric:: Compulsary arguments

        :param str frmt: output format, one of 'fasta', 'clw', 'clwstrict',
            'html', 'msf', 'phyi', 'phys'. Sent to the service as ``format``.
        :param str sequence: query sequence. The use of fasta formatted
            sequence is recommended.
        :param str tree: tree type ('none','tree1','tree2')
        :param str email: a valid email address. Will be checked by the
            service itself.

        :return: A jobid that can be analysed with :meth:`get_result`,
            :meth:`get_status`, ...
        :raises ValueError: if a compulsory argument is missing.

        The up to data values accepted for each of these parameters can be
        retrieved from the :meth:`get_parameter_details`.

        For instance,::

            from bioservices import MUSCLE
            m = MUSCLE()
            m.get_parameter_details("tree")

        Example::

            jobid = m.run(frmt="fasta",
                 sequence=sequence_example,
                 email="*****@*****.**")

        The returned object is a jobid, which status can be checked. It must
        be finished before analysing/geeting the results.

        .. seealso:: :meth:`get_result`

        """
        # There are compulsary arguments:
        if frmt is None or sequence is None or email is None:
            raise ValueError("frmt, sequence and email must be provided")

        # The service validates the values itself; they are checked here only
        # to fail early.
        # FIXME: return parameters from server are not valid
        check = self.services.devtools.check_param_in_list
        check(frmt, ['fasta', 'clw', 'clwstrict', 'html', 'msf', 'phyi', 'phys'])
        check(tree, ['none', 'tree1', 'tree2'])

        # ``tree`` used to be validated and then dropped; it is now sent.
        params = {
            'format': frmt,
            'sequence': sequence,
            'tree': tree,
            'email': email,
        }

        # IMPORTANT: use data parameter, not params !!!
        return self.services.http_post("run", data=params,
                                       headers=self._text_headers())

    def get_status(self, jobid):
        """Get status of a submitted job

        :param str jobid: a job identifier returned by :meth:`run`.
        :return: A string giving the jobid status (e.g. FINISHED).

        The values for the status are:

        * RUNNING: the job is currently being processed.
        * FINISHED: job has finished, and the results can then be retrieved.
        * ERROR: an error occurred attempting to get the job status.
        * FAILURE: the job failed.
        * NOT_FOUND: the job cannot be found.

        """
        return self.services.http_get("status/{}".format(jobid), frmt="txt",
                                      headers=self._text_headers())

    def get_result_types(self, jobid):
        """Get available result types for a finished job.

        Waits (see :meth:`wait`) when the job is not finished yet.

        :param str jobid: a job identifier returned by :meth:`run`.
        :return: the list of the identifiers of the available result types.
        """
        if self.get_status(jobid) != 'FINISHED':
            # The logger belongs to the REST service object.
            self.services.logging.warning(
                "waiting for the job to be finished. May take a while")
            self.wait(jobid, verbose=False)
        res = self.services.http_get(
            'resulttypes/' + jobid, frmt="json",
            headers={
                "User-Agent": self.services.getUserAgent(),
                "accept": "application/json"
            })
        return [x["identifier"] for x in res['types']]

    def get_result(self, jobid, result_type):
        """Get the job result of the specified type.

        :param str jobid: a job identifier returned by :meth:`run`.
        :param str result_type: type of result to retrieve. See
            :meth:`get_result_types`. Only 'out', 'sequence', 'aln-fasta',
            'pim' and 'phylotree' can be fetched.
        :return: the result as text.
        :raises ValueError: if the job is still not finished after waiting, or
            if *result_type* is not available for the job or not supported.
        """
        if self.get_status(jobid) != 'FINISHED':  # pragma: no cover
            self.services.logging.warning(
                "waiting for the job to be finished. May take a while")
            self.wait(jobid, verbose=False)
        if self.get_status(jobid) != "FINISHED":  # pragma: no cover
            raise ValueError("job is not finished")

        available = self.get_result_types(jobid)
        if result_type not in available:
            raise ValueError(
                "result_type %r is not available for this job; expected one of %s"
                % (result_type, available))
        # The fetch format used to be left unbound for any other type.
        if result_type not in self._text_result_types:
            raise ValueError(
                "unsupported result_type %r; expected one of %s"
                % (result_type, list(self._text_result_types)))

        # Relative path, like every other request of this class.
        url = 'result/' + jobid + '/' + result_type
        return self.services.http_get(
            url, frmt="txt",
            headers={
                "User-Agent": self.services.getUserAgent(),
                "accept": "application/json"
            })

    def wait(self, jobId, checkInterval=5, verbose=True):
        """This function checks the status of a jobid while it is running

        :param str jobId: a job identifier returned by :meth:`run`.
        :param int checkInterval: interval between requests in seconds.
        :param bool verbose: print each status on the standard error stream.
        :return: the first status that is neither RUNNING nor PENDING.
        :raises ValueError: if *checkInterval* is below one second.
        """
        if checkInterval < 1:  # pragma: no cover
            raise ValueError("checkInterval must be at least one second")
        busy = ('RUNNING', 'PENDING')
        result = 'PENDING'
        while result in busy:
            result = self.get_status(jobId)
            if verbose:
                print("WARNING: ", jobId, " is ", result, file=sys.stderr)
            if result in busy:
                time.sleep(checkInterval)
        return result
class PDB():
    """Interface to `PDB <http://search.rcsb.org/>`_ service (new API Jan 2021)

    With the new API, one method called :meth:`~bioservices.pdb.PDB.search`
    is provided by PDB. To perform a search you need to define a query.
    Here is an example

    .. doctest::

        >>> from bioservices import PDB
        >>> s = PDB()
        >>> query = {"query":
        ...             {"type": "terminal",
        ...              "service": "text",
        ...              "parameters": {
        ...                  "value": "thymidine kinase"
        ...              }
        ...             },
        ...          "return_type": "entry"}
        >>> res = s.search(query)

    .. note:: as of December 2020, a new API has been set up by PDB. Some
        previous functionalities such as returning the list of ligands are
        not supported anymore (Jan 2021). However, many more powerful
        searches are available. I encourage everyone to look at the PDB page
        for complex examples: http://search.rcsb.org/#examples

    As mentioned above, the PDB service provides one method called search,
    available in :meth:`~bioservices.pdb.PDB.search`. We will not cover all
    the power and capability of this search function. Users should refer to
    the official PDB help for that. Yet, the examples given by PDB should
    all work with this method.

    When possible, we will add convenient alias functions in this class.
    For now we have for example :meth:`~bioservices.pdb.PDB.get_current_ids`
    and :meth:`~bioservices.pdb.PDB.get_similarity_sequence` that users may
    find useful.

    The main idea behind the PDB API is to create queries that can access
    different types of services. A query needs at least two keys:

    - **query**
    - **return_type**

    Consider this basic example that searches for the text
    *thymidine kinase*::

        {
          "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
              "value": "thymidine kinase"
            }
          },
          "return_type": "entry"
        }

    Here the request is defined by a **query** and a **return_type** indeed.
    The return type is a simple value such as **entry**. The query itself is
    composed of 3 pairs of key/value. Here we have the type, service and
    parameters as defined below.

    The query can have several fields:

    - **type**: the clause type can be either **terminal** or **group**

      - **terminal**: performs an atomic search operation, e.g. searches
        for a particular value in a particular field.
      - **group**: wraps other terminal or group nodes and is used to
        combine multiple queries in a logical fashion.

    - **service**:

      - **text**: linguistic searches against textual annotations.
      - **sequence**: uses MMSeq2 to perform sequence matching searches
        (blast-like). The following targets are available:

        - pdb_protein_sequence,
        - pdb_dna_sequence,
        - pdb_na_sequence

      - **seqmotif**: performs short motif searches against nucleotide or
        protein sequences using 3 different inputs:

        - simple (e.g., CXCXXL)
        - prosite (e.g., C-X-C-X(2)-[LIVMYFWC])
        - regex (e.g., CXCX{2}[LIVMYFWC])

      - **structure**: searches matching a global 3D shape of assemblies or
        chains of a given entry (identified by PDB ID), in either strict
        (strict_shape_match) or relaxed (relaxed_shape_match) modes
      - **strucmotif**: performs structural motif searches on all available
        PDB structures.
      - **chemical**: queries of small-molecule constituents of PDB
        structures, based on chemical formula and chemical structure.
        Queries for matching and similar chemical structures can be
        performed using SMILES and InChI descriptors as search targets.

        - graph-strict: atom type, formal charge, bond order, atom and bond
          chirality, aromatic assignment are used as matching criteria for
          this search type.
        - graph-relaxed: atom type, formal charge and bond order are used
          as matching criteria for this search type.
        - graph-relaxed-stereo: atom type, formal charge, bond order, atom
          and bond chirality are used as matching criteria for this search
          type.
        - fingerprint-similarity: Tanimoto similarity is used as the
          matching criteria

    Concerning the **return_type** key, it can be one of:

    - entry: a list of PDB IDs.
    - assembly: list of PDB IDs appended with assembly IDs in the format of
      a [pdb_id]-[assembly_id], corresponding to biological assemblies.
    - polymer_entity: list of PDB IDs appended with entity IDs in the format
      of a [pdb_id]_[entity_id], corresponding to polymeric molecular
      entities.
    - non_polymer_entity: list of PDB IDs appended with entity IDs in the
      format of a [pdb_id]_[entity_id], corresponding to non-polymeric
      entities (or ligands).
    - polymer_instance: list of PDB IDs appended with asym IDs in the format
      of a [pdb_id].[asym_id], corresponding to instances of certain
      polymeric molecular entities, also known as chains.

    **Optional arguments**

    There are many optional arguments. Let us see a couple of them.
    Pagination can be set (default is 10 entries) using the
    **request_options** (optional) key. Consider this query example::

        {
          "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
              "attribute": "rcsb_polymer_entity.formula_weight",
              "operator": "greater",
              "value": 500
            }
          },
          "request_options": {
            "pager": {
              "start": 0,
              "rows": 100
            }
          },
          "return_type": "polymer_entity"
        }

    Here, the query searches for the polymer_entity that have a formula
    weight above 500. With the request_options pager set to 100, we will
    get the first 100 hits.

    To return all hits, set this field in the request_options::

        "return_all_hits": true

    Coming back to the first basic example, we can reuse it to illustrate
    how to refine the search using attribute and operators::

        {
          "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
              "value": "thymidine kinase",
              "attribute": "exptl.method",
              "operator": "exact_match",
            }
          },
          "return_type": "entry"
        }

    All valid combinations of operators and attributes can be found here:
    http://search.rcsb.org/search-attributes.html

    For instance, in the example above only in, exact_match and exists can
    be used with the exptl.method attribute. This is not checked in
    bioservices.

    Sorting is determined by the sort object in the request_options context.
    It allows you to add one or more sorting conditions to control the order
    of the search result hits. The sort operation is defined on a per field
    level, with a special field name for score to sort by score (the
    default).

    By default sorting is done in descending order ("desc"). The sort can be
    reversed by setting the direction property to "asc". This example
    demonstrates how to sort the search results by release date::

        {
          "query": {
            "type": "terminal",
            "service": "text",
            "parameters": {
              "attribute": "struct.title",
              "operator": "contains_phrase",
              "value": "\"hiv protease\""
            }
          },
          "request_options": {
            "sort": [
              {
                "sort_by": "rcsb_accession_info.initial_release_date",
                "direction": "desc"
              }
            ]
          },
          "return_type": "entry"
        }

    Again, many more complex examples can be found on the PDB page.

    """
    # Base URL of the search service; "query" is appended by search().
    # NOTE(review): RCSB has since published a v2 Search API -- confirm that
    # the v1 endpoint is still served before relying on it.
    _url = "http://search.rcsb.org/rcsbsearch/v1/"

    def __init__(self, verbose=False, cache=False):
        """.. rubric:: Constructor

        :param bool verbose: prints informative messages (default is off)
        :param bool cache: forwarded as-is to the underlying REST helper
        """
        self.services = REST(name="PDB",
                             verbose=verbose,
                             cache=cache,
                             url_defined_later=True)
        self.services.url = PDB._url

    def search(self, query, request_options=None, request_info=None,
               return_type=None):
        """Search request represented as a JSON object.

        This is the only function in the PDB API. You should be able to
        perform any valid PDB search here (see the
        :class:`bioservices.pdb.PDB` documentation for details). Note,
        however, that we have alias methods in BioServices that will be
        added on demand for common searches.

        :param dict query: the search expression. Either a complete request
            (a dictionary holding a **query** key and, usually, a
            **return_type** key) or only the inner query clause, in which
            case it is wrapped as ``{"query": query}``.
        :param dict request_options: (optional) controls various aspects of
            the search request including pagination, sorting, scoring and
            faceting.
        :param dict request_info: (optional) additional information about
            the query, e.g. query_id.
        :param str return_type: type of results to return. Overrides the
            **return_type** key of *query* when both are given.
        :return: json results
        :raises ValueError: if no return type is found, neither in *query*
            nor in the *return_type* argument.

        The dictionary given by the caller is never modified: the request
        is assembled in a copy.

        You must define a query as defined in the PDB web page. For example
        the following query searches for macromolecular PDB entities that
        share 90% sequence identity with GTPase HRas protein from Gallus
        gallus (Chicken)::

            query = {
              "query": {
                "type": "terminal",
                "service": "sequence",
                "parameters": {
                  "evalue_cutoff": 1,
                  "identity_cutoff": 0.9,
                  "target": "pdb_protein_sequence",
                  "value": "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLPARTVETRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMNCKCVIS"
                }
              },
              "request_options": {
                "scoring_strategy": "sequence"
              },
              "return_type": "polymer_entity"
            }

        What is important is that the dictionary called **query** contains
        2 compulsory keys namely **query** and **return_type**. The two
        other optional keys are **request_options** and **request_info**.

        You would then call the PDB search as follows::

            from bioservices import PDB
            p = PDB()
            results = p.search(query)

        or even simpler for the Pythonic lovers (this only works with the
        complete dictionary shown above, whose keys match the arguments of
        this method)::

            results = p.search(**query)

        Now, in BioServices, you can also decompose the query as follows::

            query = {
                "type": "terminal",
                "service": "sequence",
                "parameters": {
                  "evalue_cutoff": 1,
                  "identity_cutoff": 0.9,
                  "target": "pdb_protein_sequence",
                  "value": "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLPARTVETRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMNCKCVIS"
                }}
            request_options = {"scoring_strategy": "sequence"}
            return_type = "polymer_entity"

        and then use PDB search again::

            from bioservices import PDB
            p = PDB()
            results = p.search(query, request_options=request_options,
                               return_type=return_type)

        """
        # Build the request in a fresh dictionary so that the caller's
        # object is left untouched (a complete request used to be modified
        # in place).
        if "query" in query:
            payload = dict(query)
        else:
            payload = {"query": query}

        # Explicit arguments take precedence over keys already in the query.
        if request_options:
            payload['request_options'] = request_options
        if request_info:
            payload['request_info'] = request_info
        if return_type:
            payload['return_type'] = return_type

        if 'return_type' not in payload:  #pragma: no cover
            raise ValueError("Your query must have a return_type key")

        res = self.services.http_post("query", frmt="json", json=payload)
        return res

    def get_current_ids(self):
        """Get a list of all current PDB IDs.

        :return: the ``identifier`` field of every item found under the
            ``result_set`` key of the search response.
        """
        # A search returns 10 entries by default; ask for all hits instead.
        request_options = {"return_all_hits": True}
        res = self.search(query={
            'type': 'terminal',
            'service': 'text'
        },
                          request_options=request_options,
                          return_type="entry")
        identifiers = [x['identifier'] for x in res['result_set']]
        return identifiers

    def get_similarity_sequence(self, seq):
        """Search for polymer entities similar to a protein sequence.

        :param str seq: the protein sequence used as the search value
            against the ``pdb_protein_sequence`` target.
        :return: the result of :meth:`search` with a ``polymer_entity``
            return type.

        ::

            seq = "VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTAVAHVDDMPNAL"
            results = p.get_similarity_sequence(seq)

        """
        res = self.search({
            "query": {
                "type": "terminal",
                "service": "sequence",
                "parameters": {
                    "target": "pdb_protein_sequence",
                    "value": seq
                }
            },
            "return_type": "polymer_entity"
        })
        return res