Exemplo n.º 1
0
    def _read_mapping_uniprot_list(self, source, target, ac_list):
        """
        Fetches an ID translation table from the UniProt
        `upload lists` service for the accessions in `ac_list`.

        Returns the raw result object from the curl wrapper, or
        `None` if the download failed even after retries.
        """

        post_fields = {
            'from': source,
            'to': target,
            'format': 'tab',
            'uploadQuery': ' '.join(ac_list)
        }
        url = urls.urls['uniprot_basic']['lists']

        c = curl.Curl(url, post=post_fields, large=True, silent=False)

        # on failure retry up to 3 times, bypassing the cache
        attempts = 0

        while c.result is None and attempts < 3:

            c = curl.Curl(
                url,
                post=post_fields,
                large=True,
                silent=False,
                cache=False,
            )
            attempts += 1

        if c.result is None:
            sys.stdout.write('\t:: Error at downloading from UniProt.\n')

        return c.result
Exemplo n.º 2
0
    def query(self, api, param, silent=False, large=False):
        """
        Retrieves data from the API.

        @api : str
            Should be one of the 10 API sections available.
        @param : tuple
            Tuple of the parameters according to the API.
        @large : bool
            Passed to the curl wrapper function. If True,
            the file will be written to disk, and a file
            object open for reading is returned; if False,
            the raw data will be returned, in case of JSON,
            converted to python object, in case of XML, as
            a string.
        """
        url = self.urls[api] % param
        # long timeout is given, because huge files (hundreds MB)
        # take time to load
        c = curl.Curl(
            url,
            req_headers=self.auth,
            silent=silent,
            timeout=1200,
            large=large)
        # keep the curl object around for later inspection/debugging
        self.tmp = c

        if self.output_format == 'json' and not large:
            self.result = self.get_json(c.result)
        else:
            # large downloads (or non-JSON formats) are exposed as a
            # file-like object instead of parsed data
            self.result = c.fileobj
Exemplo n.º 3
0
def get_mirbase_aliases(organism=9606):
    """
    Downloads and processes mapping tables from miRBase.

    Returns two dicts of ``miRBase accession -> set of alias names``:
    one for mature miRNAs (MIMAT accessions), one for precursors.
    """

    if type(organism) in common.charTypes:
        mborganism = organism
    elif organism not in common.mirbase_taxids:
        raise ValueError('Organism not known: %u. Try to pass miRBase '
                         'taxon prefix as string, e.g. `hsa`.' % organism)
    else:
        mborganism = common.mirbase_taxids[organism]

    mature = {}
    precursor = {}

    c = curl.Curl(urls.urls['mirbase']['aliases'], silent=False, large=True)

    for raw in c.result:

        fields = raw.decode('utf-8').strip().strip(';').split('\t')

        # keep only records whose alias prefix matches the organism
        if fields[1][:3] != mborganism:
            continue

        # MIMAT accessions denote mature miRNAs, MI ones precursors
        table = mature if fields[0][:5] == 'MIMAT' else precursor

        table.setdefault(fields[0], set()).update(fields[1].split(';'))

    return mature, precursor
Exemplo n.º 4
0
def get_uniprot_sec(organism=9606):
    """
    Downloads and processes the mapping between secondary and
    primary UniProt IDs.

    Yields pairs of secondary and primary UniProt IDs.

    :param int organism:
        NCBI Taxonomy ID of the organism.
    """

    if organism is not None:
        proteome = set(uniprot_input.all_uniprots(organism=organism))

    c = curl.Curl(urls.urls['uniprot_sec']['url'], silent=False, large=True)

    for row_num, raw in enumerate(c.result):

        # the first 30 lines are a header block
        if row_num < 30:
            continue

        fields = raw.decode('utf-8').split()

        if len(fields) != 2:
            continue

        if organism is None or fields[1] in proteome:
            yield fields
Exemplo n.º 5
0
 def translate(self, source, target, lst):
     """
     Translates compound identifiers between two ID types.
     Results are collected in `self.result`: compound -> list of
     translated identifiers. InChIKey and SMILES sources are
     delegated to their dedicated methods.
     """
     if source == 'inchikey':
         self.inchikey2anything(target, lst)
         return None

     if source == 'smiles':
         self.smiles2chembl(lst)
         return None

     self.result = {}
     src_id = str(source) if type(source) is int else self.name_dict[source]
     tgt_id = str(target) if type(target) is int else self.name_dict[target]

     progress_bar = progress.Progress(
         total=len(lst),
         name='Translating compound identifiers',
         interval=1)

     for compound in lst:

         query_url = '/'.join([self.url_stem, compound, src_id, tgt_id])
         c = curl.Curl(query_url, large = False)
         self.result[compound] = []

         # a None result means the download failed; leave empty list
         if c.result is not None:
             for entry in json.loads(c.result):
                 self.result[compound].append(entry['src_compound_id'])

         progress_bar.step()

     progress_bar.terminate()
Exemplo n.º 6
0
 def smiles2chembl(self, smiles):
     """
     Translates SMILES strings to ChEMBL IDs.
     Results are collected in `self.result`: SMILES -> list of
     ChEMBL IDs (empty list if the query failed or had no hits).
     """
     self.result = {}
     prg = progress.Progress(total=len(smiles),
                             name='Translating SMILEs',
                             interval=1)
     for sml in smiles:
         url = self.chembl_url.format(sml)
         c = curl.Curl(url, large=False)
         result = c.result
         self.result[sml] = []
         if result is not None:
             try:
                 # the service normally answers with JSON
                 data = json.loads(result)
                 for d in data['compounds']:
                     this_smile = d['smiles']
                     this_chembl = d['chemblId']
                     # if this_smile == sml:
                     self.result[sml].append(this_chembl)
             except ValueError:
                 # not valid JSON: fall back to parsing the
                 # payload as XML with BeautifulSoup
                 soup = bs4.BeautifulSoup(result)
                 compounds = soup.find_all('compound')
                 if compounds is not None:
                     for compound in compounds:
                         this_smile = compound.find('smiles').text
                         this_chembl = compound.find('chemblid').text
                         # if this_smile == sml:
                         self.result[sml].append(this_chembl)
         prg.step()
     prg.terminate()
Exemplo n.º 7
0
    def load_uniprot_mappings(self, ac_types=None, bi=False, ncbi_tax_id=None):
        """
        Loads mapping tables between UniProt ACs and other ID types,
        either from the pickle cache or from the UniProt ID mapping
        file.

        :param ac_types: Iterable of ID type names; defaults to all
            keys of `self.name_types`.
        :param bool bi: Build the reverse (`from`) mapping as well.
        :param int ncbi_tax_id: NCBI Taxonomy ID; resolved via
            `self.get_tax_id`.
        """

        ncbi_tax_id = self.get_tax_id(ncbi_tax_id)
        tables = self.tables[ncbi_tax_id]
        # make a real list: `dict.keys()` is a view in Python 3,
        # and we remove elements from this collection below
        ac_types = (
            list(ac_types)
            if ac_types is not None else
            list(self.name_types.keys())
        )
        # creating empty MappingTable objects:
        for ac_typ in ac_types:
            tables[(ac_typ, 'uniprot')] = MappingTable(ac_typ,
                                                       'uniprot',
                                                       'protein',
                                                       ac_typ,
                                                       None,
                                                       ncbi_tax_id,
                                                       None,
                                                       log=self.ownlog)
        # attempting to load them from Pickle;
        # iterate over a copy because loaded types are removed
        # (removing while iterating the same list skips elements)
        for ac_typ in list(ac_types):
            md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
            cachefile = os.path.join('cache', md5ac)
            if self.cache and os.path.isfile(cachefile):
                with open(cachefile, 'rb') as fp:
                    tables[(ac_typ, 'uniprot')].mapping = pickle.load(fp)
                ac_types.remove(ac_typ)
                tables[(ac_typ, 'uniprot')].mid = md5ac
        # loading the remaining from the big UniProt mapping file:
        if ac_types:
            url = urls.urls['uniprot_idmap_ftp']['url']
            c = curl.Curl(url, silent=False, large=True)

            prg = progress.Progress(c.size, "Processing ID conversion list",
                                    99)
            for l in c.result:
                prg.step(len(l))
                l = l.decode('ascii').strip().split('\t')
                for ac_typ in ac_types:
                    if len(l) > 2 and self.name_types[ac_typ] == l[1]:
                        # strip a trailing version number from the ID
                        other = l[2].split('.')[0]
                        mapping = tables[(ac_typ, 'uniprot')].mapping
                        # membership must be tested with the stripped
                        # ID (`other`), the same value used as key;
                        # testing `l[2]` re-created the list and lost
                        # previously collected UniProts
                        if other not in mapping['to']:
                            mapping['to'][other] = []
                        mapping['to'][other].append(l[0].split('-')[0])
                        if bi:
                            # reverse direction: UniProt -> other ID
                            uniprot = l[0].split('-')[0]
                            if uniprot not in mapping['from']:
                                mapping['from'][uniprot] = []
                            mapping['from'][uniprot].append(other)
            prg.terminate()
            if self.cache:
                for ac_typ in ac_types:
                    # use the same cache key as at loading above,
                    # otherwise the saved pickle could never be found
                    md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
                    cachefile = os.path.join('cache', md5ac)
                    with open(cachefile, 'wb') as fp:
                        pickle.dump(tables[(ac_typ, 'uniprot')].mapping, fp)
Exemplo n.º 8
0
    def read_mapping_uniprot(self, param, ncbi_tax_id=None):
        """
        Downloads ID mappings directly from UniProt.
        See the names of possible identifiers here:
        http://www.uniprot.org/help/programmatic_access

        :param UniprotMapping param: UniprotMapping instance
        :param int ncbi_tax_id: Organism NCBI Taxonomy ID.
        """

        ncbi_tax_id = self.get_tax_id(ncbi_tax_id)
        # splits a field into individual IDs on whitespace or semicolon
        resep = re.compile(r'[\s;]')
        if param.__class__.__name__ != "UniprotMapping":
            self.ownlog.msg(2, "Invalid parameter for read_mapping_uniprot()",
                            'ERROR')
            return {}
        mapping_o = {}  # other ID -> list of UniProt ACs
        mapping_i = {}  # UniProt AC -> list of other IDs (if param.bi)
        scolend = re.compile(r'$;')
        # restrict to reviewed/unreviewed entries if requested
        rev = '' if param.swissprot is None \
            else ' AND reviewed:%s' % param.swissprot
        query = 'organism:%u%s' % (int(ncbi_tax_id), rev)
        self.url = urls.urls['uniprot_basic']['url']
        self.post = {
            'query':
            query,
            'format':
            'tab',
            'columns':
            'id,%s%s' %
            (param.field,
             '' if param.subfield is None else '(%s)' % param.subfield)
        }
        # NOTE(review): `urllib.urlencode` is Python 2 only; Python 3
        # needs `urllib.parse.urlencode` — confirm how `urllib` is
        # imported/aliased at the top of this module
        self.url = '%s?%s' % (self.url, urllib.urlencode(self.post))
        c = curl.Curl(self.url, silent=False)
        data = c.result
        self.data = data
        # parse the tab separated table into a nested list:
        # rows -> non-empty columns -> individual IDs per column;
        # `protein names` columns are kept as one single string
        data = [[[xx] if param.field == 'protein names' else [
            xxx for xxx in resep.split(scolend.sub('', xx.strip()))
            if len(xxx) > 0
        ] for xx in x.split('\t') if len(xx.strip()) > 0]
                for x in data.split('\n') if len(x.strip()) > 0]
        if len(data) > 0:
            # first row is the column header
            del data[0]
            for l in data:
                if len(l) > 1:
                    l[1] = self.process_protein_name(l[1][0]) \
                        if param.field == 'protein names' else l[1]
                    for other in l[1]:
                        if other not in mapping_o:
                            mapping_o[other] = []
                        mapping_o[other].append(l[0][0])
                        if param.bi:
                            # build the reverse mapping as well
                            if l[0][0] not in mapping_i:
                                mapping_i[l[0][0]] = []
                            mapping_i[l[0][0]].append(other)
        self.mapping['to'] = mapping_o
        if param.bi:
            self.mapping['from'] = mapping_i
Exemplo n.º 9
0
def get_uniprot_sec(organism=9606):
    """
    Retrieves the mapping between secondary and primary UniProt IDs.

    Returns an iterable of ``[secondary, primary]`` accession pairs,
    optionally restricted to the proteome of one organism.
    """
    if organism is not None:
        proteome = set(uniprot_input.all_uniprots(organism=organism))

    c = curl.Curl(urls.urls['uniprot_sec']['url'], silent=False, large=True)

    # skip the 30 header lines, then decode and split each row
    numbered = filter(lambda i: i[0] >= 30, enumerate(c.result))
    pairs = map(lambda i: i[1].decode('utf-8').split(), numbered)

    return filter(
        lambda pair: len(pair) == 2 and
        (organism is None or pair[1] in proteome),
        pairs)
Exemplo n.º 10
0
def all_uniprots(organism=9606, swissprot=None):
    """
    Queries UniProt for all accession numbers of one organism.
    Returns a list of UniProt IDs.
    """
    reviewed = '' if swissprot is None else ' AND reviewed:%s' % swissprot
    url = urls.urls['uniprot_basic']['url']
    post = {
        'query': 'organism:%s%s' % (str(organism), reviewed),
        'format': 'tab',
        'columns': 'id'
    }
    c = curl.Curl(url, post=post, silent=False)

    # first line is the header; blank lines are dropped
    return [
        line.strip()
        for line in c.result.split('\n')[1:]
        if line.strip()
    ]
Exemplo n.º 11
0
def _all_uniprots(organism=9606, swissprot=None):
    """
    Queries UniProt for all accession numbers of one organism.

    :param organism: NCBI Taxonomy ID (inserted into the query).
    :param swissprot: True or 'yes' restricts to reviewed
        (SwissProt) records, 'no' to unreviewed; None means all.
    :return: List of UniProt accessions.
    """

    swissprot = 'yes' if swissprot == True else swissprot
    # no space after the colon: UniProt's field query syntax is
    # `reviewed:yes`; this now matches the sibling `all_uniprots`
    rev = '' if not swissprot else ' AND reviewed:%s' % swissprot
    url = urls.urls['uniprot_basic']['url']
    get = {
        'query': 'organism:%s%s' % (str(organism), rev),
        'format': 'tab',
        'columns': 'id',
    }
    c = curl.Curl(url, get=get, silent=False)
    data = c.result

    # drop the header line and any empty lines
    return [l.strip() for l in data.split('\n')[1:] if l.strip()]
Exemplo n.º 12
0
def get_pmid(idList):
    """
    For a list of DOI or PMC IDs
    fetches the corresponding PMIDs.

    Returns the parsed JSON response of the PubMed eutils ID
    converter as a dict; an empty dict if the download failed or
    the response was not valid JSON.
    """
    if type(idList) in common.simpleTypes:
        idList = [idList]
    url = urls.urls['pubmed-eutils']['conv'] % ','.join(str(i) for i in idList)
    c = curl.Curl(url, silent=True)
    data = c.result
    try:
        # ValueError: malformed JSON; TypeError: data is None
        # (a bare `except:` here used to hide all other errors)
        js = json.loads(data)
    except (ValueError, TypeError):
        js = {}
    return js
Exemplo n.º 13
0
 def inchikey2anything(self, target, lst):
     """
     Translates InChIKeys to identifiers of the `target` ID type.
     Results are collected in `self.result`: InChIKey -> list of
     translated identifiers.
     """
     self.result = {}
     target_id = str(target) if type(target) is int else self.name_dict[target]

     progress_bar = progress.Progress(
         total=len(lst), name='Translating InChi-Keys', interval=1)

     for key in lst:

         c = curl.Curl(self.inchi_stem % key, large = False)

         # a None result means the download failed; no entry is added
         if c.result is not None:
             entries = json.loads(c.result)
             self.result[key] = [
                 entry['src_compound_id']
                 for entry in entries
                 if entry['src_id'] == target_id
             ]

         progress_bar.step()

     progress_bar.terminate()
Exemplo n.º 14
0
def get_isoforms(organism='Homo sapiens'):
    """
    Downloads UniProt isoform sequences for one organism.

    :param str organism: Latin binomial name as it appears in the
        fasta headers' ``OS=`` field, e.g. ``Homo sapiens``.
        (The previous default contained garbled characters that
        could never match the regex below, so the function always
        returned an empty dict with default arguments.)
    :return: Dict of dicts: UniProt AC -> isoform number -> sequence.
    """

    # captures the binomial species name from the `OS=` field
    reorg = re.compile(r'OS=([A-Z][a-z]+\s[a-z]+)')
    result = {}
    url = urls.urls['unip_iso']['url']
    c = curl.Curl(url, silent=False)
    data = read_fasta(c.result)

    for header, seq in iteritems(data):
        org = reorg.findall(header)
        if len(org) > 0 and org[0] == organism:
            # header format: >db|UNIPROT-ISOFORM|...
            prot = header.split('|')[1].split('-')
            unip = prot[0]
            isof = int(prot[1])
            if unip not in result:
                result[unip] = {}
            result[unip][isof] = seq

    return result
Exemplo n.º 15
0
    def setup_resource(self):
        """
        Resolves `self.settings.inFile` into `self.resource`.

        The input may be: a callable (called with `inputArgs`); the
        name of a method in the `dataio` module; an existing file
        path or URL (downloaded via curl); or any other iterable
        used directly. Anything else yields an empty list.
        """
        self.input = self.settings.inFile

        if callable(self.input):
            # a function producing the resource
            self.resource = self.input(**self.settings.inputArgs)

        elif isinstance(self.input, common.basestring):
            if hasattr(dataio, self.input):
                # name of an input method in the dataio module
                self.resource = getattr(dataio,
                                        self.input)(**self.settings.inputArgs)
            elif (os.path.exists(self.input) or curl.is_url(self.input)):
                # local file path or downloadable URL
                # (a string matching neither case leaves
                # self.resource unset)
                c = curl.Curl(self.input, **self.settings.curlArgs)
                self.resource = c.result

        elif hasattr(self.input, '__iter__'):
            # already an iterable of records
            self.resource = self.input

        else:
            self.resource = []
Exemplo n.º 16
0
 def connectivity_search(self,
                         id_list,
                         id_type,
                         parameters=None):
     """
     Performs a UniChem connectivity search.
     Results are collected in `self.result`: query ID -> list of
     matching compound IDs.

     :param id_list: Iterable of compound identifiers.
     :param id_type: ID type name, numeric source ID or 'inchikey';
         'smiles' is not supported (result is set to None).
     :param parameters: List of the search parameters A-G as
         described at https://www.ebi.ac.uk/unichem/info/widesearchInfo;
         defaults to [1, 0, 0, 0, 0, 1, 0].
     """
     if parameters is None:
         parameters = [1, 0, 0, 0, 0, 1, 0]
     # copy before appending: the old version used a mutable default
     # argument and appended to it, so the shared default list grew
     # by one element on every call
     parameters = list(parameters)
     parameters.append(1)  # H parameter must be 1 to process the result
     parameters = [str(i) for i in parameters]
     self.result = {}
     if id_type == 'inchikey':
         id_type = ''
         method = 'key_search'
     elif id_type == 'smiles':
         self.result = None
         return None
     else:
         id_type = str(
             id_type) if type(id_type) is int else self.name_dict[id_type]
         id_type = '%s/' % id_type
         method = 'cpd_search'
     prg = progress.Progress(total=len(id_list),
                             name='Connectivity search',
                             interval=1)
     for i in id_list:
         prg.step()
         url = self.cpd_search.format(method, i, id_type,
                                      '/'.join(parameters))
         c = curl.Curl(url, large=False)
         result = c.result
         self.result[i] = []
         if result is not None:
             data = json.loads(result)
             for k, v in iteritems(data):
                 # first element of each group is the query itself
                 for j in range(1, len(v)):
                     self.result[i].append(v[j][0])
         # deduplicate the collected hits
         self.result[i] = list(set(self.result[i]))
     prg.terminate()
Exemplo n.º 17
0
Arquivo: seq.py Projeto: kkaris/pypath
def swissprot_seq(organism=9606, isoforms=False):
    """
    Loads all sequences for an organism, optionally
    for all isoforms, by default only first isoform.
    """

    url = urls.urls['uniprot_basic']['url']
    post = {
        'query': 'organism:%s AND reviewed:yes' % str(organism),
        'format': 'tab',
        'columns': 'id,sequence'
    }
    c = curl.Curl(url, post=post, silent=False, timeout=900)

    result = {}

    # first line of the tab separated result is the header
    for row in c.result.split('\n')[1:]:

        fields = row.strip().split('\t')

        if len(fields) == 2:
            result[fields[0]] = Seq(fields[0], fields[1])

    if isoforms:

        isoform_data = get_isoforms(organism=organism)

        for uniprot, isoform_seqs in iteritems(isoform_data):

            if uniprot in result:

                for isoform_num, sequence in iteritems(isoform_seqs):

                    result[uniprot].add_seq(sequence, isoform_num)

    return result
Exemplo n.º 18
0
def swissprot_seq(organism=9606, isoforms=False):
    """
    Loads all SwissProt (reviewed) sequences for an organism.

    :param organism: NCBI Taxonomy ID, inserted into the UniProt
        query string.
    :param bool isoforms: Also load alternative isoform sequences.
    :return: Dict of UniProt AC -> `se.Seq` objects.
    """

    # (removed an unused local `taxids` dict left over here)
    result = {}
    url = urls.urls['uniprot_basic']['url']
    post = {
        'query': 'organism:%s AND reviewed:yes' % str(organism),
        'format': 'tab',
        'columns': 'id,sequence'
    }
    c = curl.Curl(url, post=post, silent=False)
    data = c.result
    data = data.split('\n')
    # drop the header line
    del data[0]
    for l in data:
        l = l.strip().split('\t')
        if len(l) == 2:
            result[l[0]] = se.Seq(l[0], l[1])
    if isoforms:
        # NOTE(review): isoforms are loaded with `get_isoforms`'s
        # default organism, ignoring this function's `organism`
        # argument — confirm whether this is intended
        data = get_isoforms()
        for unip, isoforms in iteritems(data):
            for isof, seq in iteritems(isoforms):
                if unip in result:
                    result[unip].add_seq(seq, isof)
    return result
Exemplo n.º 19
0
Arquivo: seq.py Projeto: kkaris/pypath
def get_isoforms(organism=9606):
    """
    Loads UniProt sequences for all isoforms.
    """

    # translate an NCBI Taxonomy ID to the Latin name if known
    if organism in common.phosphoelm_taxids:
        organism = common.phosphoelm_taxids[organism]

    # captures the binomial species name from the `OS=` field
    species_pattern = re.compile(r'OS=([A-Z][a-z]+\s[a-z]+)')
    isoforms = {}

    c = curl.Curl(urls.urls['unip_iso']['url'], silent=False)
    sequences = read_fasta(c.result)

    for header, seq in iteritems(sequences):

        species = species_pattern.findall(header)

        if species and species[0] == organism:

            # header format: >db|UNIPROT-ISOFORM|...
            parts = header.split('|')[1].split('-')

            isoforms.setdefault(parts[0], {})[int(parts[1])] = seq

    return isoforms
Exemplo n.º 20
0
    def ptm_orthology(self):
        """
        Creates an orthology translation dict of phosphosites
        based on phosphorylation sites table from PhosphoSitePlus.
        In the result all PTMs represented by a tuple of the following
        6 elements: UniProt ID, isoform (int), residue one letter code,
        residue number (int), NCBI Taxonomy ID (int), modification type.

        """

        self.ptmhomo = {}

        # strips everything that is not a digit (residue number)
        nondigit = re.compile(r'[^\d]+')

        unknown_taxa = set([])

        for typ in common.psite_mod_types:

            # PhosphoSitePlus group ID -> set of equivalent sites
            groups = {}

            url = urls.urls['psite_%s' % typ[0]]['url']
            c = curl.Curl(url, silent=False, large=True)

            data = c.result

            # the first 4 lines are a header block
            for _ in xrange(4):
                null = next(data)

            for r in data:

                r = r.decode('utf-8').split('\t')

                if len(r) < 10:

                    continue

                uniprot = r[2]
                # isoform number follows the dash in the UniProt AC;
                # plain ACs denote the first isoform
                isoform = 1 if '-' not in uniprot else int(
                    uniprot.split('-')[1])
                uniprot = uniprot.split('-')[0]
                # residue letter and number, e.g. `S15`
                aa = r[4][0]
                num = int(nondigit.sub('', r[4]))
                if r[6] not in common.taxa:
                    unknown_taxa.add(r[6])
                    continue

                tax = common.taxa[r[6]]
                group = int(r[5])

                this_site = (uniprot, isoform, aa, num, tax, typ[1])

                if group not in groups:
                    groups[group] = set([])

                groups[group].add(this_site)

            # build the translation dict: for every pair of sites in
            # the same group but different organisms, map
            # site1 -> {taxon of site2: {site2, ...}}
            for group, sites in iteritems(groups):

                for site1 in sites:

                    for site2 in sites:

                        # skip pairs within the same organism
                        if site1[4] == site2[4]:

                            continue

                        if site1 not in self.ptmhomo:

                            self.ptmhomo[site1] = {}

                        if site2[4] not in self.ptmhomo[site1]:

                            self.ptmhomo[site1][site2[4]] = set([])

                        self.ptmhomo[site1][site2[4]].add(site2)

        if len(unknown_taxa):
            self._log('Unknown taxa encountered: %s' %
                      (', '.join(sorted(unknown_taxa))))