Example No. 1
    def load_uniprot_mappings(self, ac_types=None, bi=False, ncbi_tax_id=None):

        ncbi_tax_id = self.get_tax_id(ncbi_tax_id)
        tables = self.tables[ncbi_tax_id]
        ac_types = list(ac_types) if ac_types is not None else list(self.name_types.keys())
        # creating empty MappingTable objects:
        for ac_typ in ac_types:
            tables[(ac_typ, 'uniprot')] = MappingTable(ac_typ,
                                                       'uniprot',
                                                       'protein',
                                                       ac_typ,
                                                       None,
                                                       ncbi_tax_id,
                                                       None,
                                                       log=self.ownlog)
        # attempting to load them from pickle cache files;
        # iterate over a copy because loaded types are removed below
        for ac_typ in list(ac_types):
            md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
            cachefile = os.path.join('cache', md5ac)
            if self.cache and os.path.isfile(cachefile):
                tables[(ac_typ, 'uniprot')].mapping = \
                    pickle.load(open(cachefile, 'rb'))
                ac_types.remove(ac_typ)
                tables[(ac_typ, 'uniprot')].mid = md5ac
        # loading the remaining from the big UniProt mapping file:
        if len(ac_types) > 0:
            url = urls.urls['uniprot_idmap_ftp']['url']
            c = curl.Curl(url, silent=False, large=True)

            prg = progress.Progress(c.size, "Processing ID conversion list",
                                    99)
            for l in c.result:
                prg.step(len(l))
                l = l.decode('ascii').strip().split('\t')
                for ac_typ in ac_types:
                    if len(l) > 2 and self.name_types[ac_typ] == l[1]:
                        # strip the version suffix from the foreign ID
                        other = l[2].split('.')[0]
                        if other not in tables[(ac_typ,
                                                'uniprot')].mapping['to']:
                            tables[(ac_typ,
                                    'uniprot')].mapping['to'][other] = []
                        # strip the isoform suffix from the UniProt AC
                        tables[(ac_typ, 'uniprot')].mapping['to'][other].\
                            append(l[0].split('-')[0])
                        if bi:
                            uniprot = l[0].split('-')[0]
                            if uniprot not in tables[(ac_typ, 'uniprot')].\
                                    mapping['from']:
                                tables[(ac_typ, 'uniprot')].\
                                    mapping['from'][uniprot] = []
                            tables[(ac_typ, 'uniprot')].mapping['from'][uniprot].\
                                append(other)
            prg.terminate()
            if self.cache:
                for ac_typ in ac_types:
                    # use the same cache key as when loading above
                    md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
                    cachefile = os.path.join('cache', md5ac)
                    pickle.dump(tables[(ac_typ, 'uniprot')].mapping,
                                open(cachefile, 'wb'))
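
The core of this example is the streaming parse of UniProt's idmapping file: each tab-separated line carries a UniProt AC, an ID type label and a foreign ID, and the isoform suffix (e.g. -1) and the version suffix (e.g. .1) are stripped before the IDs are stored in the to and, when bi is set, from dictionaries. A minimal, self-contained sketch of that parsing step follows; the function name parse_idmapping and the sample lines are illustrative only and are not part of pypath.

from collections import defaultdict

def parse_idmapping(lines, id_type, bidirectional=False):
    # foreign ID -> list of UniProt ACs, and optionally the reverse
    to_uniprot = defaultdict(list)
    from_uniprot = defaultdict(list)
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) > 2 and fields[1] == id_type:
            uniprot = fields[0].split('-')[0]  # drop the isoform suffix
            other = fields[2].split('.')[0]    # drop the version suffix
            to_uniprot[other].append(uniprot)
            if bidirectional:
                from_uniprot[uniprot].append(other)
    return dict(to_uniprot), dict(from_uniprot)

# e.g. parse_idmapping(["P31946\tRefSeq\tNP_003395.1"], 'RefSeq', True)
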
Example No. 2
    def load(self, key):

        # the cache file name is the md5 hash of the JSON-serialised key
        cachefile = common.md5(json.dumps(key))
        cachefile = os.path.join(self.cachedir, cachefile)

        if os.path.exists(cachefile):

            self.lists[key] = pickle.load(open(cachefile, 'rb'))

            self._log('Reference list for ID type `%s` for organism `%u` '
                      'has been loaded from `%s`.' % (key + (cachefile, )))

        else:

            self.lists[key] = self._load(key)
            pickle.dump(self.lists[key], open(cachefile, 'wb'))
            self._log('Reference list for ID type `%s` for organism `%u` '
                      'has been saved to `%s`.' % (key + (cachefile, )))
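
This load method illustrates a generic load-or-build caching idiom: hash the serialised key, load the pickle if the file exists, otherwise build the list and save it. The sketch below restates the idiom in standalone form, using hashlib directly instead of pypath's common.md5 helper; the load_or_build function and the build_reference_list callback in the usage line are hypothetical names, not pypath API.

import hashlib, json, os, pickle

def load_or_build(key, cachedir, build):
    # cache file name = md5 of the JSON-serialised key
    fname = hashlib.md5(json.dumps(key).encode()).hexdigest()
    path = os.path.join(cachedir, fname)
    if os.path.exists(path):
        with open(path, 'rb') as fp:
            return pickle.load(fp)
    value = build(key)            # the expensive step, e.g. download and parse
    os.makedirs(cachedir, exist_ok=True)
    with open(path, 'wb') as fp:
        pickle.dump(value, fp)
    return value

# e.g. refs = load_or_build(('uniprot', 9606), 'cache', build_reference_list)
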
Example No. 3
    def __init__(self,
                 one,
                 two,
                 typ,
                 source,
                 param,
                 ncbi_tax_id,
                 mysql=None,
                 log=None,
                 cache=False,
                 cachedir='cache',
                 uniprots=None):
        '''
        When initializing ID conversion tables for the first time,
        data is downloaded from UniProt and read into dictionaries.
        This takes a couple of seconds. The data is saved to pickle
        dumps so that the tables load much faster next time.
        '''

        self.param = param
        self.one = one
        self.two = two
        self.typ = typ
        self.maxlOne = None
        self.maxlTwo = None
        self.mysql = mysql
        self.cache = cache
        self.cachedir = cachedir
        self.mapping = {"to": {}, "from": {}}

        if log.__class__.__name__ != 'logw':
            self.session = common.gen_session_id()
            self.ownlog = logn.logw(self.session, 'INFO')
        else:
            self.ownlog = log

        if param is not None:

            self.mid = common.md5((one, two, self.param.bi, ncbi_tax_id))
            md5param = common.md5(json.dumps(self.param.__dict__))
            self.cachefile = os.path.join(self.cachedir, md5param)

            if self.cache and os.path.isfile(self.cachefile):
                self.mapping = pickle.load(open(self.cachefile, 'rb'))

            elif len(self.mapping['to']) == 0 or (param.bi and len(
                    self.mapping['from']) == 0):

                # no usable cache: remove any stale file and rebuild
                if os.path.exists(self.cachefile):
                    os.remove(self.cachefile)
                if source == "mysql":
                    self.read_mapping_mysql(param, ncbi_tax_id)
                elif source == "file":
                    self.read_mapping_file(param, ncbi_tax_id)
                elif source == "pickle":
                    self.read_mapping_pickle(param, ncbi_tax_id)
                elif source == "uniprot":
                    self.read_mapping_uniprot(param, ncbi_tax_id)
                elif source == "uniprotlist":
                    self.read_mapping_uniprot_list(param,
                                                   uniprots=uniprots,
                                                   ncbi_tax_id=ncbi_tax_id)

                if len(self.mapping['to']) and (not param.bi
                                                or len(self.mapping['from'])):
                    pickle.dump(self.mapping, open(self.cachefile, 'wb'))
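
In this constructor the cache file name is derived from the mapping definition itself (the md5 of the JSON-serialised param.__dict__), so changing any parameter automatically points to a different cache entry. Below is a self-contained sketch of that pattern under the assumption of a simple stand-in parameter class; Param and its fields are placeholders, not pypath's actual parameter classes.

import hashlib, json, os

class Param:
    # stand-in for a mapping definition object with a __dict__ of plain values
    def __init__(self, input_file, col_one, col_two, bi=False):
        self.input = input_file
        self.col_one = col_one
        self.col_two = col_two
        self.bi = bi

param = Param('hgnc_symbols.tsv', 0, 1, bi=True)
md5param = hashlib.md5(
    json.dumps(param.__dict__).encode()).hexdigest()
cachefile = os.path.join('cache', md5param)
print(cachefile)  # same parameters -> same file name on every run
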