Exemplo n.º 1
0
def load_x(idx, fieldname, cvt_fn=None):
    '''Build a gene-id keyed mapping for one column of idmapping_selected.tab.gz.

    idx is the column number handed to listitems() for the target value,
    fieldname is the key every value is wrapped under, and cvt_fn (optional)
    is applied to each non-empty target value before it is stored.

    Rows carrying a GeneID are keyed by it. Rows with only an Ensembl gene id
    are keyed by the GeneID(s) that Ensembl id is paired with elsewhere in
    the file, or by the Ensembl id itself when there is no such pairing.

    Returns the dict produced by value_convert(); each value has the form
    {fieldname: value}, with list values sorted.
    '''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        ld = listitems(ld, *(2, 19, idx))    # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld,
                                       dup_sep='; '):
            xli.append(value)

    # Ensembl gene id -> list of GeneIDs, from rows that carry both ids.
    ensembl2geneid = list2dict(
        list_nondup([(x[1], x[0]) for x in xli if x[0] != '' and x[1] != '']),
        0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))

    gene2x = list2dict(list_nondup(xli2), 0)

    def _wrap(value):
        # isinstance() replaces the Python-2-only `types.ListType` check,
        # which no longer exists on Python 3.
        return {fieldname: sorted(value) if isinstance(value, list) else value}

    gene2x = value_convert(gene2x, _wrap, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))

    return gene2x
Exemplo n.º 2
0
def load_broadinstitute_exac():
    '''Load ExAC data, merge the subset datasets in, and re-key by gene id.

    Starts from load_broadinstitute_exac_all(), copies the "nontcga" and
    "nonpsych" sub-documents from their loaders into the matching entries,
    then replaces every transcript-id key found in the Ensembl translation
    table by the mapped Entrez id(s), or by the Ensembl gene id when no
    Entrez mapping exists.
    '''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()

    subset_loaders = (
        ("nontcga", load_broadinstitute_exac_nontcga),
        ("nonpsych", load_broadinstitute_exac_nonpsych),
    )
    for subset, loader in subset_loaders:
        for trans_id, doc in loader().items():
            try:
                exacs[trans_id]["exac"][subset] = doc["exac"][subset]
            except KeyError:
                # any missing key along the way: take the subset document whole
                exacs[trans_id] = doc

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    parser = ensembl_base.EnsemblParser()
    parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(parser.ensembl2entrez_li, 0, alwayslist=True)
    translation_file = os.path.join(get_data_folder("ensembl"),
                                    "gene_ensembl__translation__main.txt")
    for _, ensid, transid, _ in tabfile_feeder(translation_file):
        if transid not in exacs:
            continue
        # pop so no-match means no data in the end
        data = exacs.pop(transid)
        for entrezid in ensembl2entrez.get(ensid, [ensid]):
            exacs[entrezid] = data

    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))

    return exacs
Exemplo n.º 3
0
def load_broadinstitute_exac():
    '''Return ExAC documents keyed by Entrez gene id (or Ensembl gene id).

    The "all" dataset is the base; the "nontcga" and "nonpsych" datasets
    are folded into it. Transcript-id keys listed in the Ensembl
    translation table are then swapped for gene ids.
    '''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    t0 = time.time()
    exacs = load_broadinstitute_exac_all()

    def _fold_in(subset_docs, subset):
        # Copy one subset's sub-document into the base entries; when a key is
        # missing anywhere on the path, store the subset document as-is.
        for key, doc in subset_docs.items():
            try:
                exacs[key]["exac"][subset] = doc["exac"][subset]
            except KeyError:
                exacs[key] = doc

    _fold_in(load_broadinstitute_exac_nontcga(), "nontcga")
    _fold_in(load_broadinstitute_exac_nonpsych(), "nonpsych")

    logging.info("Convert transcript ID to EntrezID")
    import dataload.sources.ensembl.ensembl_base as ensembl_base
    ensembl_parser = ensembl_base.EnsemblParser()
    ensembl_parser._load_ensembl2entrez_li()
    ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
    ensembl_dir = get_data_folder("ensembl")
    translation_path = os.path.join(ensembl_dir, "gene_ensembl__translation__main.txt")
    for row in tabfile_feeder(translation_path):
        _, ensid, transid, _ = row
        if transid in exacs:
            # pop so no-match means no data in the end
            doc = exacs.pop(transid)
            gene_ids = ensembl2entrez.get(ensid, [ensid])
            for gene_id in gene_ids:
                exacs[gene_id] = doc

    load_done('[%d, %s]' % (len(exacs), timesofar(t0)))

    return exacs
Exemplo n.º 4
0
 def _cvt(pli):
     '''Group pathway tuples by source and wrap them under the 'pathway' key.

     pli is grouped with list2dict() on index 2, each grouped value is
     converted with _inner_cvt, and per-source lists are sorted in place.
     '''
     _d = list2dict(pli, 2)
     _d = value_convert(_d, _inner_cvt)
     for p_source in _d:
         if isinstance(_d[p_source], list):
             # A bare .sort() compares the elements themselves, which raises
             # TypeError on Python 3 when they are dicts. Order by pathway id
             # instead ('' for entries that carry no id).
             # NOTE(review): assumes _inner_cvt returns dicts -- confirm.
             _d[p_source].sort(key=lambda e: e.get("id", ""))
     return {'pathway': _d}
Exemplo n.º 5
0
 def _cvt(pli):
     '''Group pathway tuples by source and wrap them under the 'pathway' key.

     pli is grouped with list2dict() on index 2, each grouped value is
     converted with _inner_cvt, and per-source lists are sorted in place by
     pathway id.
     '''
     _d = list2dict(pli, 2)
     _d = value_convert(_d, _inner_cvt)
     for p_source in _d:
         if isinstance(_d[p_source], list):
             # e["id"] raises KeyError for an entry without an 'id' key;
             # fall back to '' so such entries sort first instead of crashing.
             # NOTE(review): assumes _inner_cvt may omit 'id' -- confirm.
             _d[p_source].sort(key=lambda e: e.get("id", ""))
     return {'pathway': _d}
Exemplo n.º 6
0
def load_uniprot():
    '''Load UniProt accessions from idmapping_selected.tab.gz, keyed by gene id.

    Rows carrying a GeneID are keyed by it. Rows with only an Ensembl gene
    id are keyed by the GeneID(s) that Ensembl id is paired with elsewhere
    in the file, or by the Ensembl id itself when no pairing exists. Rows
    with neither id are dropped.

    Returns the output of value_convert(..., _dict_convert, ...).
    '''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()

    rows = []
    for ld in tabfile_feeder(DATAFILE, header=1, assert_column_no=VALID_COLUMN_NO):
        # UniProtKB-AC UniProtKB-ID GeneID Ensembl(Gene)
        ld = listitems(ld, 0, 1, 2, 18)
        # GeneID and EnsemblID columns may have duplicates
        for value in dupline_seperator(dupline=ld, dup_idx=[2, 3], dup_sep='; '):
            fields = list(value)
            fields[1] = get_uniprot_section(fields[1])
            rows.append(tuple(fields))

    # Ensembl gene id -> GeneIDs, taken from rows that carry both ids
    paired = [(row[3], row[2]) for row in rows if row[2] != '' and row[3] != '']
    ensembl2geneid = list2dict(paired, 0, alwayslist=True)

    triples = []
    for uniprot_acc, section, entrez_id, ensembl_id in rows:
        if entrez_id:
            triples.append((uniprot_acc, section, entrez_id))
            continue
        if not ensembl_id:
            continue
        mapped_ids = ensembl2geneid.get(ensembl_id, None)
        if mapped_ids:
            # the ensembl_id can be mapped to entrez ids
            triples.extend((uniprot_acc, section, _eid) for _eid in mapped_ids)
        else:
            # otherwise, just use the ensembl_id
            triples.append((uniprot_acc, section, ensembl_id))

    gene2uniprot = list2dict(list_nondup(triples), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2uniprot), timesofar(t0)))

    return gene2uniprot
Exemplo n.º 7
0
 def _load_ensembl2entrez_li(self):
     '''Load the stored ensembl-to-entrez pairs and cache them as a dict.

     Pairs whose entrez id is not a key of self._entrez_geneid_d are dropped
     (the original comment calls these deprecated ids); kept pairs use the
     value self._entrez_geneid_d maps that id to. The resulting dict is
     stored in self._idmapping_d_cache['ensembl_gene'].
     '''
     raw_pairs = loadobj(("ensembl_gene__2entrezgene_list.pyobj", self.src), mode='gridfs')
     logging.info(len(raw_pairs))
     kept_pairs = []
     for ensembl_id, entrez_id in raw_pairs:
         if int(entrez_id) in self._entrez_geneid_d:
             kept_pairs.append((ensembl_id, self._entrez_geneid_d[int(entrez_id)]))
     logging.info(len(kept_pairs))
     self._idmapping_d_cache['ensembl_gene'] = list2dict(kept_pairs, 0)
Exemplo n.º 8
0
    def convert2entrez(self, ensembl2x):
        '''Re-key a dict indexed by ensembl gene ids so it is indexed by the
        matching entrezgene ids; ensembl ids with no match stay as keys.'''
        if not self.ensembl2entrez_li:
            self._load_ensembl2entrez_li()

        if not self.ensembl_main:
            self.ensembl_main = self.load_ensembl_main()

        ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
        entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

        # Report how the two id sets overlap
        source_ids = set(ensembl2x)
        mapped_ids = set(ensembl2entrez)
        unmatched_ids = source_ids - mapped_ids
        print('# of ensembl IDs in total: %d' % len(source_ids | mapped_ids))
        print('# of ensembl IDs match entrez Gene IDs: %d' % len(source_ids & mapped_ids))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' % len(unmatched_ids))

        def _fn(eid, taxid=None):
            # A copy is required here: otherwise several entrezgene ids that
            # match the same ensembl gene would share one value, for example
            #      ENSMUSG00000027104 --> (11909, 100047997)
            return copy.copy(ensembl2x.get(eid, {}))

        # all genes with a matched entrez id
        data = value_convert(entrez2ensembl, _fn)

        # genes without a matched entrez id keep their ensembl id as the key
        for eid in unmatched_ids:
            data[eid] = ensembl2x[eid]

        for key in data:
            entry = data[key]
            if isinstance(entry, dict):
                cleaned = dict_nodup(entry, sort=True)
            else:
                # one entrez gene matches multiple ensembl genes
                cleaned = dict_attrmerge(entry, removedup=True, sort=True)
            data[key] = cleaned

        return data
Exemplo n.º 9
0
def load_x(idx, fieldname, cvt_fn=None):
    '''Load one column of idmapping_selected.tab.gz keyed by gene id.

    idx is the 0-based column number of the target value, fieldname is the
    key each value is wrapped under, and cvt_fn (optional) converts every
    non-empty target value.
    '''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()

    rows = []
    for ld in tabfile_feeder(DATAFILE, header=1, assert_column_no=VALID_COLUMN_NO):
        # GeneID Ensembl(Gene) target_value
        # NOTE(review): column 19 is assumed to hold Ensembl(Gene) -- confirm
        # against the current idmapping_selected.tab layout.
        ld = listitems(ld, 2, 19, idx)
        rows.extend(dupline_seperator(dupline=ld, dup_sep='; '))

    # Ensembl gene id -> GeneIDs, taken from rows that carry both ids
    id_pairs = [(row[1], row[0]) for row in rows if row[0] != '' and row[1] != '']
    ensembl2geneid = list2dict(list_nondup(id_pairs), 0, alwayslist=True)

    pairs = []
    for entrez_id, ensembl_id, x_value in rows:
        if not x_value:
            continue
        if cvt_fn:
            x_value = cvt_fn(x_value)
        if entrez_id:
            pairs.append((entrez_id, x_value))
        elif ensembl_id:
            mapped_ids = ensembl2geneid.get(ensembl_id, None)
            if mapped_ids:
                pairs.extend((_eid, x_value) for _eid in mapped_ids)
            else:
                pairs.append((ensembl_id, x_value))

    def _wrap(value):
        # list values are sorted; anything else is stored unchanged
        if isinstance(value, list):
            return {fieldname: sorted(value)}
        return {fieldname: value}

    gene2x = value_convert(list2dict(list_nondup(pairs), 0), _wrap, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))

    return gene2x
Exemplo n.º 10
0
    def convert2entrez(self, ensembl2x):
        '''Convert a dict keyed by ensembl gene ids into one keyed by the
        matching entrezgene ids.

        ensembl2x maps ensembl gene ids to documents. Ensembl ids without a
        matching entrez id stay as keys in the returned dict. Every value is
        passed through dict_nodup() or dict_attrmerge() before being returned.
        '''
        if not self.ensembl2entrez_li:
            self._load_ensembl2entrez_li()

        if not self.ensembl_main:
            self.ensembl_main = self.load_ensembl_main()

        ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
        entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

        # Now make a dictionary indexed by entrez gene id.
        # print() with one pre-formatted argument emits the same text on
        # Python 2 and Python 3 (the print statement is a syntax error on 3).
        print('# of ensembl IDs in total: %d' % len(set(ensembl2x) | set(ensembl2entrez)))
        print('# of ensembl IDs match entrez Gene IDs: %d' % len(set(ensembl2x) & set(ensembl2entrez)))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' % len(set(ensembl2x) - set(ensembl2entrez)))

        # all genes with matched entrez
        def _fn(eid, taxid=None):
            # Need to make a copy of the value here, otherwise it will cause
            # issues when multiple entrezgene ids match the same ensembl
            # gene, for example,
            #      ENSMUSG00000027104 --> (11909, 100047997)
            return copy.copy(ensembl2x.get(eid, {}))

        data = value_convert(entrez2ensembl, _fn)

        # add those with no matched entrez geneid, using ensembl id as the key
        for eid in (set(ensembl2x) - set(ensembl2entrez)):
            data[eid] = ensembl2x[eid]

        for gene_id in data:
            # isinstance() replaces the Python-2-only `types.DictType` check
            if isinstance(data[gene_id], dict):
                _doc = dict_nodup(data[gene_id], sort=True)
            else:
                # if one entrez gene matches multiple ensembl genes
                _doc = dict_attrmerge(data[gene_id], removedup=True, sort=True)
            data[gene_id] = _doc

        return data
Exemplo n.º 11
0
def _dict_convert(uniprot_li):
    '''
    Group (accession, section) pairs by section and wrap them under 'uniprot'.

    Per the original example, a list such as
        [(u'E7ESI2', 'TrEMBL'), (u'P24941', 'Swiss-Prot'),
         (u'G3V5T9', 'TrEMBL'), (u'G3V317', 'TrEMBL')]
    becomes
        {'uniprot': {'Swiss-Prot': u'P24941',
                     'TrEMBL': [u'E7ESI2', u'G3V317', u'G3V5T9']}}
    with every list value sorted.
    '''
    by_section = list2dict(uniprot_li, 1)
    sorted_lists = {section: sorted(accs)
                    for section, accs in by_section.items()
                    if isinstance(accs, list)}
    by_section.update(sorted_lists)
    return {'uniprot': by_section}
Exemplo n.º 12
0
 def _load_ensembl2entrez_li(self):
     '''Populate self._idmapping_d_cache['ensembl_gene'] from the stored
     ensembl-to-entrez pair list.

     Only pairs whose entrez id is a key of self._entrez_geneid_d are kept
     (the original comment calls the others deprecated); the entrez id is
     replaced by the value that dict maps it to.
     '''
     ensembl2entrez_li = loadobj(
         ("ensembl_gene__2entrezgene_list.pyobj", self.src), mode='gridfs')
     logging.info(len(ensembl2entrez_li))

     def _current_pairs(pairs):
         # yield only the pairs whose entrez id is still known
         for ensembl_id, entrez_id in pairs:
             if int(entrez_id) in self._entrez_geneid_d:
                 yield (ensembl_id, self._entrez_geneid_d[int(entrez_id)])

     ensembl2entrez_li = list(_current_pairs(ensembl2entrez_li))
     logging.info(len(ensembl2entrez_li))
     ensembl2entrez = list2dict(ensembl2entrez_li, 0)
     self._idmapping_d_cache['ensembl_gene'] = ensembl2entrez
Exemplo n.º 13
0
def _dict_convert(uniprot_li):
    '''
    Group (accession, section) pairs by section and wrap them under 'uniprot'.

    Per the original example, a list such as
        [(u'E7ESI2', 'TrEMBL'), (u'P24941', 'Swiss-Prot'),
         (u'G3V5T9', 'TrEMBL'), (u'G3V317', 'TrEMBL')]
    becomes
        {'uniprot': {'Swiss-Prot': u'P24941',
                     'TrEMBL': [u'E7ESI2', u'G3V317', u'G3V5T9']}}
    with every list value sorted.
    '''
    _dict = list2dict(uniprot_li, 1)
    for k, v in _dict.items():
        # isinstance() replaces the Python-2-only `types.ListType` check,
        # which no longer exists on Python 3.
        if isinstance(v, list):
            _dict[k] = sorted(v)
    return {'uniprot': _dict}
Exemplo n.º 14
0
def load_uniprot():
    '''Load UniProt accessions from idmapping_selected.tab.gz, keyed by gene id.

    A row is keyed by its GeneID when it has one; otherwise by the GeneID(s)
    its Ensembl gene id is paired with elsewhere in the file, or by the
    Ensembl id itself when no pairing exists. Rows with neither id are
    skipped.
    '''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()

    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        # UniProtKB-AC UniProtKB-ID GeneID Ensembl(Gene)
        ld = listitems(ld, 0, 1, 2, 19)
        # GeneID and EnsemblID columns may have duplicates
        for value in dupline_seperator(dupline=ld, dup_idx=[2, 3], dup_sep='; '):
            acc, uniprot_id = value[0], value[1]
            xli.append((acc, get_uniprot_section(uniprot_id)) + tuple(value[2:]))

    both_ids = [(x[3], x[2]) for x in xli if x[2] != '' and x[3] != '']
    ensembl2geneid = list2dict(both_ids, 0, alwayslist=True)

    def _gene_keys(entrez_id, ensembl_id):
        # keys a row is filed under; empty when the row has neither id
        if entrez_id:
            return [entrez_id]
        if ensembl_id:
            # mapped entrez ids when available, otherwise the ensembl_id itself
            return ensembl2geneid.get(ensembl_id, None) or [ensembl_id]
        return []

    xli2 = [(uniprot_acc, section, gene_key)
            for uniprot_acc, section, entrez_id, ensembl_id in xli
            for gene_key in _gene_keys(entrez_id, ensembl_id)]

    gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2uniprot), timesofar(t0)))

    return gene2uniprot
Exemplo n.º 15
0
def load_cpdb(__metadata__):
    '''Load CPDB pathway membership for mouse, yeast and human, keyed by gene.

    __metadata__['pathway_sources_included'] lists the (lower-cased) pathway
    sources to import; rows from any other source are ignored.

    Returns the output of dict_convert(), whose values have the form
    {'pathway': {source: entry-or-list-of-entries}}, where each entry is
    {'name': ...} plus an 'id' when the file provides one.
    '''
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'))

    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]    # drop the 'path:' prefix
            if p_source in PATHWAY_SOURCES_INCLUDED:
                # the last column is a comma-separated gene list
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                # _inner_cvt leaves 'id' out when p_id is 'None', so e["id"]
                # could raise KeyError; such entries sort first under ''.
                _d[p_source].sort(key=lambda e: e.get("id", ""))
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))

    return _out
Exemplo n.º 16
0
def load_cpdb(__metadata__):
    '''Load CPDB pathway membership for mouse, yeast and human, keyed by gene.

    __metadata__['pathway_sources_included'] lists the (lower-cased) pathway
    sources to import; rows from any other source are ignored.

    Returns the output of dict_convert(), whose values have the form
    {'pathway': {source: entry-or-list-of-entries}}, where each entry is
    {'name': ...} plus an 'id' when the file provides one.
    '''
    # only import pathways from these sources
    PATHWAY_SOURCES_INCLUDED = __metadata__['pathway_sources_included']
    VALID_COLUMN_NO = 4

    t0 = time.time()
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATA_FILES = []
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_mouse.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_yeast.tab'))
    DATA_FILES.append(os.path.join(DATA_FOLDER, 'CPDB_pathways_genes_human.tab'))

    _out = []
    for DATA_FILE in DATA_FILES:
        load_start(DATA_FILE)
        for ld in tabfile_feeder(DATA_FILE, header=1, assert_column_no=VALID_COLUMN_NO):
            p_name, p_id, p_source = ld[:3]
            p_source = p_source.lower()
            if p_source == 'kegg' and p_id.startswith('path:'):
                p_id = p_id[5:]    # drop the 'path:' prefix
            if p_source in PATHWAY_SOURCES_INCLUDED:
                # the last column is a comma-separated gene list
                genes = ld[-1].split(",")
                for gene in genes:
                    _out.append((gene, p_name, p_id, p_source))
        load_done()
    _out = list2dict(_out, 0, alwayslist=True)

    def _inner_cvt(p):
        p_name, p_id = p
        _d = {'name': p_name}
        if p_id != 'None':
            _d['id'] = p_id
        return _d

    def _cvt(pli):
        _d = list2dict(pli, 2)
        _d = value_convert(_d, _inner_cvt)
        for p_source in _d:
            if isinstance(_d[p_source], list):
                # The entries are dicts, which Python 3 cannot order with a
                # bare .sort() (TypeError). Sort by pathway id; entries
                # without an id sort first under ''.
                _d[p_source].sort(key=lambda e: e.get("id", ""))
        return {'pathway': _d}

    _out = dict_convert(_out, valuefn=_cvt)
    load_done('[%d, %s]' % (len(_out), timesofar(t0)))

    return _out
Exemplo n.º 17
0
    def _db_upload(self, doc_li, step=10000, verbose=True):
        '''Send doc_li to self.target_db.update() in batches of `step` docs
        and print a summary of OK/error counts plus the ten most frequent
        error messages.'''
        import time
        from biothings.utils.common import timesofar
        from utils.dataload import list2dict, list_itemcnt, listsort

        output = []
        total = len(doc_li)
        t0 = time.time()
        for start in range(0, total, step):
            stop = start + step
            output.extend(self.target_db.update(doc_li[start:stop]))
            if verbose:
                print('\t%d-%d Done [%s]...' % (start + 1, min(stop, total), timesofar(t0)))

        # count results by their status flag (first item of each output row)
        status_cnt = list2dict(list_itemcnt([row[0] for row in output]), 0)
        print("Done![%s, %d OK, %d Error]" % (timesofar(t0), status_cnt.get(True, 0), status_cnt.get(False, 0)))

        # error messages of the failed rows, most frequent first
        failed_msgs = [row[2].args[0] for row in output if row[0] is False]
        error_cnt = listsort(list_itemcnt(failed_msgs), 1, reverse=True)
        print('\n'.join(['\t%s\t%d' % item for item in error_cnt[:10]]))
        if len(error_cnt) > 10:
            print("\t%d lines omitted..." % (len(error_cnt) - 10))
Exemplo n.º 18
0
    def _db_upload(self, doc_li, step=10000, verbose=True):
        '''Upload doc_li through self.target_db.update() in slices of `step`
        documents, then print how many succeeded or failed and the ten most
        common error messages.'''
        import time
        from utils.common import timesofar
        from utils.dataload import list2dict, list_itemcnt, listsort

        t0 = time.time()
        output = []
        doc_cnt = len(doc_li)
        batch_starts = range(0, doc_cnt, step)
        for first in batch_starts:
            last = first + step
            batch_result = self.target_db.update(doc_li[first:last])
            output.extend(batch_result)
            if verbose:
                print('\t%d-%d Done [%s]...' % (first + 1, min(last, doc_cnt), timesofar(t0)))

        # tally the status flag carried in position 0 of each result
        flags = [x[0] for x in output]
        tally = list2dict(list_itemcnt(flags), 0)
        print("Done![%s, %d OK, %d Error]" % (timesofar(t0), tally.get(True, 0), tally.get(False, 0)))

        # rank the error messages of failed results by frequency
        errors = list_itemcnt([x[2].args[0] for x in output if x[0] is False])
        ranked = listsort(errors, 1, reverse=True)
        top_lines = ['\t%s\t%d' % x for x in ranked[:10]]
        print('\n'.join(top_lines))
        if len(ranked) > 10:
            print("\t%d lines omitted..." % (len(ranked) - 10))
Exemplo n.º 19
0
def load_exons_for_species(species, exons_key='exons'):
    '''Load exon structures for one species from its refFlat/refLink files.

    species is the sub-folder of DATA_FOLDER holding 'database/refFlat.txt.gz'
    and 'database/refLink.txt.gz'; exons_key is the key the per-gene
    transcript dict is stored under.

    Returns {geneid: {exons_key: {refseq: transcript_info}}}, restricted to
    transcripts whose gene id from refLink is non-empty and not '0'.

    Raises AssertionError when a row's exon count column disagrees with the
    number of exon (start, end) pairs parsed from it.
    '''
    refflat_file = os.path.join(DATA_FOLDER, species, 'database/refFlat.txt.gz')
    reflink_file = os.path.join(DATA_FOLDER, species, 'database/refLink.txt.gz')

    load_start(refflat_file)
    t0 = time.time()

    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    ref2exons = []
    for ld in tabfile_feeder(refflat_file, header=0):
        refseq = ld[1]
        chrom = ld[2]    # renamed from `chr` to stop shadowing the builtin
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        # zip() is lazy on Python 3: materialize it so len() works below and
        # the stored value is a real list of (start, end) pairs.
        exons = list(zip([int(x) for x in ld[9].split(',') if x],
                         [int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.append((refseq, {
            'chr': chrom,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'exons': exons
        }))
    ref2exons = list2dict(ref2exons, 0)

    gene2exons = {}
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: {refseq: ref2exons[refseq]}}
            else:
                gene2exons[geneid][exons_key][refseq] = ref2exons[refseq]

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
Exemplo n.º 20
0
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''Clean up archive collections in src db, keeping only the last
       <keep_last> archives of each collection.

       keep_last: number of most recent archives (by sorted name) to keep
                  per collection; 0 removes all of them.
       src:       source database, defaults to get_src_db().
       verbose:   also list every collection that will be removed.
       noconfirm: drop without asking for confirmation.

       Archives are only considered when the current (non-archive)
       collection exists and is non-empty.

       Raises ValueError when keep_last is negative.
    '''
    from utils.dataload import list2dict
    from biothings.utils.common import ask

    if keep_last < 0:
        raise ValueError("keep_last must be >= 0, got %r" % (keep_last,))

    src = src or get_src_db()

    archive_li = sorted([(coll.split('_archive_')[0], coll)
                         for coll in src.collection_names()
                         if coll.find('archive') != -1])
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    for k, v in archive_d.items():
        print(k, end='')
        #check current collection exists
        if src[k].count() > 0:
            ordered = sorted(v)
            # An explicit stop index is needed: ordered[:-keep_last] is
            # ordered[:0] when keep_last == 0, i.e. it would remove nothing
            # instead of everything.
            marked = ordered[:max(len(ordered) - keep_last, 0)]
            coll_to_remove.extend(marked)
            print("\t\t%s archived collections marked to remove." % len(marked))
        else:
            print('skipped. Missing current "%s" collection!' % k)
    if len(coll_to_remove) > 0:
        print("%d archived collections will be removed." % len(coll_to_remove))
        if verbose:
            for coll in coll_to_remove:
                print('\t', coll)
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print("Done.[%s collections removed]" % len(coll_to_remove))
        else:
            print("Aborted.")
    else:
        print("Nothing needs to be removed.")
Exemplo n.º 21
0
def src_clean_archives(keep_last=1, src=None, verbose=True, noconfirm=False):
    '''Clean up archive collections in src db, keeping only the last
       <keep_last> archives of each collection.

       keep_last: number of most recent archives (by sorted name) to keep
                  per collection; 0 removes all of them.
       src:       source database, defaults to get_src_db().
       verbose:   also list every collection that will be removed.
       noconfirm: drop without asking for confirmation.

       Archives are only considered when the current (non-archive)
       collection exists and is non-empty.

       Raises ValueError when keep_last is negative.
    '''
    from utils.dataload import list2dict
    from utils.common import ask

    if keep_last < 0:
        raise ValueError("keep_last must be >= 0, got %r" % (keep_last,))

    src = src or get_src_db()

    archive_li = sorted([(coll.split('_archive_')[0], coll) for coll in src.collection_names()
                         if coll.find('archive') != -1])
    archive_d = list2dict(archive_li, 0, alwayslist=1)
    coll_to_remove = []
    # Every print() below takes one pre-formatted string, so the function
    # parses and prints the same text on Python 2 and Python 3 (the original
    # print statements are a syntax error on Python 3).
    for k, v in archive_d.items():
        #check current collection exists
        if src[k].count() > 0:
            ordered = sorted(v)
            # An explicit stop index is needed: ordered[:-keep_last] is
            # ordered[:0] when keep_last == 0, i.e. it would remove nothing
            # instead of everything.
            marked = ordered[:max(len(ordered) - keep_last, 0)]
            coll_to_remove.extend(marked)
            # the collection name and its status share one output line
            print("%s \t\t%s archived collections marked to remove." % (k, len(marked)))
        else:
            print('%s skipped. Missing current "%s" collection!' % (k, k))
    if len(coll_to_remove) > 0:
        print("%d archived collections will be removed." % len(coll_to_remove))
        if verbose:
            for coll in coll_to_remove:
                print('\t%s' % coll)
        if noconfirm or ask("Continue?") == 'Y':
            for coll in coll_to_remove:
                src[coll].drop()
            print("Done.[%s collections removed]" % len(coll_to_remove))
        else:
            print("Aborted.")
    else:
        print("Nothing needs to be removed.")