Example #1
def diff_collections2(b1, b2, result_dir, step=10000):
    '''
    b2 is new collection, b1 is old collection
    '''
    DIFFFILE_PATH = '/home/kevinxin/diff_result/'
    DATA_FOLDER = os.path.join(DIFFFILE_PATH, result_dir)
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    data_new = doc_feeder(b2.target_collection, step=step, inbatch=True, fields=[])
    data_old = doc_feeder(b1.target_collection, step=step, inbatch=True, fields=[])
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0

    for _batch in data_new:
        cnt += 1
        id_list_new = [_doc['_id'] for _doc in _batch]
        docs_common = b1.target_collection.find({'_id': {'$in': id_list_new}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common), fastdiff=True)
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'add': id_in_new,
                   'update': _updates,
                   'delete': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
            print("(Updated: {}, Added: {})".format(len(_updates), len(id_in_new)), end='')
            cnt_update += len(_updates)
            cnt_add += len(id_in_new)
    print("Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}".format(cnt_update, cnt_add))
    print("="*100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        docs_common = b2.target_collection.find({'_id': {'$in': id_list_old}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_old = list(set(id_list_old)-set(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'delete': id_in_old,
                   'add': [],
                   'update': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(id_in_old) != 0:
            dump(_result, file_name)
            print("(Deleted: {})".format(len(id_in_old)), end='')
            cnt_delete += len(id_in_old)
    print("Finished calculating diff for the old collection. Total number of docs deleted: {}".format(cnt_delete))
    print("="*100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(cnt_update, cnt_add, cnt_delete))
Example #2
    def _build_index_sequential(self,
                                collection,
                                verbose=False,
                                query=None,
                                bulk=True,
                                update=False,
                                allow_upsert=True):
        from utils.mongo import doc_feeder

        def rate_control(cnt, t):
            delay = 0
            if t > 90:
                delay = 30
            elif t > 60:
                delay = 10
            if delay:
                print("\tPausing for {}s...".format(delay), end='')
                time.sleep(delay)
                print("done.")

        src_docs = doc_feeder(collection,
                              step=self.step,
                              s=self.s,
                              batch_callback=rate_control,
                              query=query)
        if bulk:
            if update:
                # input doc will update the existing one;
                # if allow_upsert, create a new one if it does not exist
                res = self.update_docs(src_docs, upsert=allow_upsert)
            else:
                # input doc will overwrite existing one
                res = self.index_bulk(src_docs)
            if len(res[1]) > 0:
                print("Error: {} docs failed indexing.".format(len(res[1])))
                file_name = collection + '_es_error.pyobj'
                dump(res, file_name)
            return res[0]
        else:
            cnt = 0
            for doc in src_docs:
                self.index(doc)
                cnt += 1
                if verbose:
                    print(cnt, ':', doc['_id'])
            return cnt
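
rate_control is handed to doc_feeder as batch_callback; judging from its signature it receives a batch counter and the number of seconds the batch took, and it pauses indexing when batches get slow. A feeder that honors such a callback could look roughly like the following sketch; the function name feed_in_batches, the argument order, and the callback contract are assumptions for illustration, not the actual doc_feeder API.

import time

def feed_in_batches(cursor, step=10000, batch_callback=None):
    # hypothetical feeder: yield documents in batches of `step` and report
    # (batch_count, seconds_taken) to batch_callback after each batch
    batch, cnt, t0 = [], 0, time.time()
    for doc in cursor:
        batch.append(doc)
        if len(batch) >= step:
            cnt += 1
            yield from batch
            if batch_callback:
                batch_callback(cnt, time.time() - t0)
            batch, t0 = [], time.time()
    if batch:
        yield from batch
        if batch_callback:
            batch_callback(cnt + 1, time.time() - t0)
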
Example #3
def update_from_temp_collections(config, no_confirm=False, use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return

    logging.info("Found {} new source collections need to update:".format(
        len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))

    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config,
                                                    time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)

            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
Example #4
def update_from_temp_collections(config,no_confirm=False,use_parallel=False):
    t0 = time.time()
    sc = GeneDocSyncer(config)
    new_src_li = sc.get_new_source_list()
    if not new_src_li:
        logging.info("No new source collections need to update. Abort now.")
        return

    logging.info("Found {} new source collections need to update:".format(len(new_src_li)))
    logging.info("\n".join(['\t' + x for x in new_src_li]))

    if no_confirm or ask('Continue?') == 'Y':
        logfile = 'databuild_sync_{}_{}.log'.format(config, time.strftime('%Y%m%d'))
        logfile = os.path.join(LOG_FOLDER, logfile)
        setup_logfile(logfile)

        for src in new_src_li:
            t0 = time.time()
            logging.info("Current source collection: %s" % src)
            ts = _get_timestamp(src, as_str=True)
            logging.info("Calculating changes... ")
            changes = sc.get_changes(src, use_parallel=use_parallel)
            logging.info("Done")
            get_changes_stats(changes)
            if no_confirm or ask("Continue to save changes...") == 'Y':
                if config == 'genedoc_mygene':
                    dumpfile = 'changes_{}.pyobj'.format(ts)
                else:
                    dumpfile = 'changes_{}_allspecies.pyobj'.format(ts)
                dump(changes, dumpfile)
                dumpfile_key = 'genedoc_changes/' + dumpfile
                logging.info('Saving to S3: "{}"... '.format(dumpfile_key))
                send_s3_file(dumpfile, dumpfile_key)
                logging.info('Done.')
                #os.remove(dumpfile)

            if no_confirm or ask("Continue to apply changes...") == 'Y':
                sc.apply_changes(changes)
                sc.verify_changes(changes)
            logging.info('=' * 20)
            logging.info("Finished. %s" % timesofar(t0))
Example #5
    def _build_index_sequential(self, collection, verbose=False, query=None, bulk=True, update=False, allow_upsert=True):
        from utils.mongo import doc_feeder

        def rate_control(cnt, t):
            delay = 0
            if t > 90:
                delay = 30
            elif t > 60:
                delay = 10
            if delay:
                print("\tPausing for {}s...".format(delay), end='')
                time.sleep(delay)
                print("done.")

        src_docs = doc_feeder(collection, step=self.step, s=self.s, batch_callback=rate_control, query=query)
        if bulk:
            if update:
                # input doc will update the existing one;
                # if allow_upsert, create a new one if it does not exist
                res = self.update_docs(src_docs, upsert=allow_upsert)
            else:
                # input doc will overwrite existing one
                res = self.index_bulk(src_docs)
            if len(res[1]) > 0:
                print("Error: {} docs failed indexing.".format(len(res[1])))
                file_name = collection + '_es_error.pyobj'
                dump(res, file_name)
            return res[0]
        else:
            cnt = 0
            for doc in src_docs:
                self.index(doc)
                cnt += 1
                if verbose:
                    print(cnt, ':', doc['_id'])
            return cnt
Example #6
    def finalize(self):
        '''dump target_dict into a file.'''
        from utils.common import dump
        dump(self.target_dict, self.target_name + '.pyobj')
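
finalize presumably belongs to a builder class that accumulates merged documents in memory before persisting them. A stripped-down holder illustrating how such a method would be used (the class name TargetDocBuilder and the add method are illustrative only, not the project's actual class):

class TargetDocBuilder:
    # illustrative holder: collect docs keyed by _id, then persist them
    def __init__(self, target_name):
        self.target_name = target_name
        self.target_dict = {}

    def add(self, doc):
        self.target_dict[doc['_id']] = doc

    def finalize(self):
        '''dump target_dict into a file.'''
        from utils.common import dump   # project helper, sketched under Example #1
        dump(self.target_dict, self.target_name + '.pyobj')
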
Example #7
def diff_collections2(b1, b2, result_dir, use_parallel=True, step=10000):
    '''
    b2 is new collection, b1 is old collection
    '''
    if use_parallel:
        import multiprocessing
        from functools import partial
    DATA_FOLDER = result_dir
    data_new = doc_feeder(b2.target_collection,
                          step=step,
                          inbatch=True,
                          fields={'_id': 1})
    data_old = doc_feeder(b1.target_collection,
                          step=step,
                          inbatch=True,
                          fields={'_id': 1})
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0
    _timestamp = get_timestamp()
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    for batch in data_new:
        cnt += 1
        id_list_new = [doc['_id'] for doc in batch]
        ids_common = [
            doc['_id']
            for doc in b1.target_collection.find({'_id': {
                '$in': id_list_new
            }}, {'_id': 1})
        ]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            if use_parallel:
                # split the common ids into one chunk per CPU core; guard
                # against a zero chunk size when there are fewer ids than cores
                chunk_size = max(1, len(ids_common) // multiprocessing.cpu_count())
                task_list = [
                    ids_common[i:i + chunk_size]
                    for i in range(0, len(ids_common), chunk_size)
                ]
                pool = multiprocessing.Pool()
                partial_worker = partial(_diff_parallel_worker,
                                         b1.target_collection.name,
                                         b2.target_collection.name)
                results = pool.map(partial_worker, task_list)
                pool.close()
                pool.join()
                for result in results:
                    _updates += result
            else:
                _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {
            'add': id_in_new,
            'update': _updates,
            'delete': [],
            'source': b2.target_collection.name,
            'timestamp': _timestamp
        }
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
            print("(Updated: {}, Added: {})".format(len(_updates),
                                                    len(id_in_new)),
                  end='')
            cnt_update += len(_updates)
            cnt_add += len(id_in_new)
    print(
        "Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}"
        .format(cnt_update, cnt_add))
    print("=" * 100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        ids_common = [
            doc['_id']
            for doc in b2.target_collection.find({'_id': {
                '$in': id_list_old
            }}, {'_id': 1})
        ]
        id_in_old = list(set(id_list_old) - set(ids_common))
        _result = {
            'delete': id_in_old,
            'add': [],
            'update': [],
            'source': b2.target_collection.name,
            'timestamp': _timestamp
        }
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        if len(id_in_old) != 0:
            dump(_result, file_name)
            print("(Deleted: {})".format(len(id_in_old)), end='')
            cnt_delete += len(id_in_old)
    print(
        "Finished calculating diff for the old collection. Total number of docs deleted: {}"
        .format(cnt_delete))
    print("=" * 100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(
        cnt_update, cnt_add, cnt_delete))
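
_diff_parallel_worker is only referenced above, not shown. Because multiprocessing workers cannot share live pymongo handles, the example binds the old and new collection names with functools.partial and lets each worker reconnect before diffing its chunk of ids. A hypothetical worker following that pattern might look like the sketch below; the connection parameters (mongo_uri, db_name) and the per-document comparison are placeholders, not the project's actual implementation.

from pymongo import MongoClient

def _diff_parallel_worker(old_col_name, new_col_name, id_chunk,
                          mongo_uri='mongodb://localhost:27017', db_name='targets'):
    # hypothetical worker: reconnect inside the child process, then compare
    # the old and new versions of each document in the assigned id chunk
    db = MongoClient(mongo_uri)[db_name]
    old_col, new_col = db[old_col_name], db[new_col_name]
    updates = []
    for _id in id_chunk:
        doc_old = old_col.find_one({'_id': _id})
        doc_new = new_col.find_one({'_id': _id})
        if doc_old != doc_new:
            updates.append(_id)   # the real worker records a structured patch instead
    return updates
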
Example #8
    def finalize(self):
        '''dump target_dict into a file.'''
        from utils.common import dump
        dump(self.target_dict, self.target_name + '.pyobj')
Example #9
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful when other annotations mapped to geneids may
       contain retired gene ids.

       if species_li is None, genes from all species are loaded.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([TAXONOMY[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)

    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):

        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li  # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2),
                            1,
                            alwayslist=0,
                            includefn=_includefn)
    # includefn above restricts rows to taxids in species_li and keeps only
    # entries whose mapped-to geneid exists in the gene_info list

    load_done('[%d]' % len(retired2gene))
    # convert key/value to int
    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d
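
file_newer(a, b) is used above to decide whether the cached geneid_d is still fresh relative to the downloaded source files. A reasonable sketch, assuming it simply compares modification times, would be:

import os

def file_newer(source, target):
    # return True if `source` was modified more recently than `target`
    # (assumed behavior; the project's actual utils helper may differ in details)
    return os.stat(source).st_mtime > os.stat(target).st_mtime
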
Example #10
def get_geneid_d(species_li=None, load_cache=True, save_cache=True):
    '''return a dictionary of current/retired geneid to current geneid mapping.
       This is useful when other annotations mapped to geneids may contain
       retired gene ids.

       if species_li is None, genes from all species are loaded.

       Note that all ids are int type.
    '''
    if species_li:
        taxid_set = set([taxid_d[species] for species in species_li])
    else:
        taxid_set = None

    orig_cwd = os.getcwd()
    os.chdir(DATA_FOLDER)

    # check cache file
    _cache_file = 'gene/geneid_d.pyobj'
    if load_cache and os.path.exists(_cache_file) and \
       file_newer(_cache_file, 'gene/gene_info.gz') and \
       file_newer(_cache_file, 'gene/gene_history.gz'):

        print('Loading "geneid_d" from cache file...', end='')
        _taxid_set, out_d = loadobj(_cache_file)
        assert _taxid_set == taxid_set
        print('Done.')
        os.chdir(orig_cwd)
        return out_d

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_info.gz')
    load_start(DATAFILE)
    if species_li:
        species_filter = lambda ld: int(ld[0]) in taxid_set
    else:
        species_filter = None
    geneid_li = set(tab2list(DATAFILE, 1, includefn=species_filter))
    load_done('[%d]' % len(geneid_li))

    DATAFILE = os.path.join(DATA_FOLDER, 'gene/gene_history.gz')
    load_start(DATAFILE)

    if species_li:
        _includefn = lambda ld: int(ld[0]) in taxid_set and ld[1] in geneid_li
    else:
        _includefn = lambda ld: ld[1] in geneid_li    # include all species
    retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0, includefn=_includefn)
    # includefn above restricts rows to taxids in species_li and keeps only entries whose mapped-to geneid exists in the gene_info list

    load_done('[%d]' % len(retired2gene))

    out_d = dict_convert(retired2gene, keyfn=int, valuefn=int)    # convert key/value to int
    for g in geneid_li:
        _g = int(g)
        out_d[_g] = _g

    if save_cache:
        if species_li:
            dump((taxid_set, out_d), _cache_file)
        else:
            dump((None, out_d), _cache_file)

    os.chdir(orig_cwd)
    return out_d