示例#1
0
def sync_index(config, use_parallel=True, noconfirm=False):
    """Diff a MongoDB target collection against its 'genedoc_<name>' ES index.

    Args:
        config: build configuration passed to
            ``DataBuilder.load_build_config``.
        use_parallel: stored on the ES indexer as ``use_parallel``; it is not
            forwarded to ``diff.diff_collections`` here.
        noconfirm: when true, skip the interactive "Continue?" prompt.

    Returns:
        The result of ``diff.diff_collections(sync_src, sync_target)``, or
        ``None`` when the prompt is answered with anything other than "Y".
    """
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    # A single parenthesised argument prints identically whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print('\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name,
                                             sync_src.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count()))
    if not (noconfirm or ask("Continue?") == "Y"):
        return None
    return diff.diff_collections(sync_src, sync_target)
示例#2
0
文件: sync.py 项目: SuLab/mygene.info
    def get_changes(self, source_col, use_parallel=True):
        """Diff this object's target collection against ``source_col``.

        ``source_col`` may be a collection object or, when ``is_str`` says it
        is a string, a name that is looked up in ``self._db``.

        Returns whatever ``diff_collections`` returns; when that result is
        truthy it is annotated with 'source' (the source collection name) and
        'timestamp' (``_get_timestamp`` of that name).
        """
        dest_col = self._target_col
        if is_str(source_col):
            source_col = self._db[source_col]

        source_backend = GeneDocMongoDBBackend(source_col)
        target_backend = GeneDocMongoDBBackend(dest_col)
        result = diff_collections(target_backend,
                                  source_backend,
                                  use_parallel=use_parallel,
                                  step=self.step)
        if not result:
            return result

        result['source'] = source_col.name
        result['timestamp'] = _get_timestamp(source_col.name)
        return result
示例#3
0
def main(s1, s2, p=False):
    """Diff two MongoDB collections that may live on different servers.

    Args:
        s1: 3-item sequence ``(connection, database name, collection name)``
            describing the first collection.
        s2: same layout, describing the second collection.
        p: forwarded to ``diff.diff_collections`` as ``use_parallel``.

    Returns:
        The result of ``diff.diff_collections`` run with ``step=1000``.
    """
    first_conn, first_db, first_name = s1
    second_conn, second_db, second_name = s2

    first_col = MongoClient(first_conn)[first_db][first_name]
    second_col = MongoClient(second_conn)[second_db][second_name]

    first_backend = GeneDocMongoDBBackend(first_col)
    second_backend = GeneDocMongoDBBackend(second_col)

    return diff.diff_collections(first_backend,
                                 second_backend,
                                 use_parallel=p,
                                 step=1000)
示例#4
0
    def sync_index(self, use_parallel=True):
        """Diff this builder's target collection against an ES index.

        The ES index name is taken from the target collection's name, and
        ``use_parallel`` is stored on the ES indexer.

        Returns the result of ``diff.diff_collections(source, es_backend)``.
        """
        from utils import diff

        source_backend = self.get_target_collection()

        indexer = ESIndexer(self.get_mapping())
        indexer.ES_INDEX_NAME = source_backend.target_collection.name
        indexer.step = 10000
        indexer.use_parallel = use_parallel
        es_backend = databuild.backend.GeneDocESBackend(indexer)

        return diff.diff_collections(source_backend, es_backend)
示例#5
0
    def sync_index(self, use_parallel=True):
        """Compare the target collection with the ES index of the same name.

        An ``ESIndexer`` is built from ``self.get_mapping()``, pointed at the
        index named after the target collection, and wrapped in a
        ``GeneDocESBackend``. ``use_parallel`` is recorded on the indexer.

        Returns the value produced by ``diff.diff_collections``.
        """
        from utils import diff

        src_backend = self.get_target_collection()

        idxer = ESIndexer(self.get_mapping())
        # setattr is equivalent to plain attribute assignment; the settings
        # are applied in the same order as listed.
        setattr(idxer, 'ES_INDEX_NAME', src_backend.target_collection.name)
        setattr(idxer, 'step', 10000)
        setattr(idxer, 'use_parallel', use_parallel)
        dest_backend = databuild.backend.GeneDocESBackend(idxer)

        differences = diff.diff_collections(src_backend, dest_backend)
        return differences
示例#6
0
    def get_changes(self, source_col, use_parallel=True):
        """Compute the diff between ``self._target_col`` and ``source_col``.

        A string ``source_col`` (as judged by ``is_str``) is resolved through
        ``self._db``; anything else is used as the collection directly.

        Returns the ``diff_collections`` result. A truthy result gets two
        extra keys: 'source' (source collection name) and 'timestamp'
        (``_get_timestamp`` applied to that name).
        """
        target_col = self._target_col
        if is_str(source_col):
            source_col = self._db[source_col]

        source_backend = GeneDocMongoDBBackend(source_col)
        target_backend = GeneDocMongoDBBackend(target_col)

        diff_options = dict(use_parallel=use_parallel, step=self.step)
        delta = diff_collections(target_backend, source_backend, **diff_options)

        if delta:
            delta['source'] = source_col.name
            delta['timestamp'] = _get_timestamp(source_col.name)
        return delta
示例#7
0
 def main(self, index, collection, diff_filepath, validate=False, wait=60):
     """Apply a saved diff to an ES index and optionally validate the result.

     The diff object is loaded with ``loadobj`` and is read through its
     'source', 'add', 'delete' and 'update' keys. Adds, deletes and updates
     are each sent to ES with ``bulk``.

     Args:
         index: index name stored on ``self._index`` and ``self._esi._index``.
         collection: passed to ``self.delete``; also used as the ES field and
             ``_source`` selector during validation and as the prefix of the
             temporary collection name.
         diff_filepath: path handed to ``loadobj``.
         validate: when true, read the docs back from ES into a temporary
             collection and diff it against the source collection.
         wait: seconds to sleep before validating.

     Returns:
         The ``diff_collections`` result when ``validate`` is true,
         otherwise ``None``.
     """
     self._index = index
     self._esi._index = index
     diff = loadobj(diff_filepath)
     source_collection = diff['source']
     add_list = self.add(source_collection, diff['add'])
     delete_list = self.delete(collection, diff['delete'])
     update_list = self.update(diff['update'])
     t00 = time()
     print('Adding new {} docs...'.format(len(diff['add'])))
     t0 = time()
     bulk(self._es, add_list)
     print("Done. [{}]".format(timesofar(t0)))
     print('Deleting {} docs'.format(len(diff['delete'])))
     t0 = time()
     bulk(self._es, delete_list)
     print("Done. [{}]".format(timesofar(t0)))
     print('Updating {} docs'.format(len(diff['update'])))
     t0 = time()
     bulk(self._es, update_list)
     print("Done. [{}]".format(timesofar(t0)))
     print("=" * 20)
     print("Finished! [{}]".format(timesofar(t00)))
     if not validate:
         return None

     print('Waiting {}s to let ES to finish...'.format(wait), end="")
     sleep(wait)
     print("Done.")
     print("Validating...")
     t0 = time()
     q = {
         "query": {
             "constant_score": {
                 "filter": {
                     "exists": {
                         "field": collection
                     }
                 }
             }
         }
     }
     data = self._esi.doc_feeder(query=q, _source=collection)
     temp_collection = collection + '_temp_' + get_random_string()
     self._src[temp_collection].drop()
     try:
         load_source(temp_collection, src_data=data)
         c1 = get_backend(source_collection, 'mongodb')
         c2 = get_backend(temp_collection, 'mongodb')
         diff_result = diff_collections(c1, c2, use_parallel=False)
     finally:
         # Drop the scratch collection even when loading or diffing fails.
         self._src[temp_collection].drop()
     # Report elapsed time like every other step, not the raw start timestamp.
     print("Done. [{}]".format(timesofar(t0)))
     return diff_result
示例#8
0
    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        """Apply a saved diff to an ES index and optionally validate the result.

        The diff object is loaded with ``loadobj`` and is read through its
        'source', 'add', 'delete' and 'update' keys. Adds, deletes and
        updates are each sent to ES with ``bulk``.

        Args:
            index: index name stored on ``self._index`` and
                ``self._esi._index``.
            collection: passed to ``self.delete``; also used as the ES field
                and ``_source`` selector during validation and as the prefix
                of the temporary collection name.
            diff_filepath: path handed to ``loadobj``.
            validate: when true, read the docs back from ES into a temporary
                collection and diff it against the source collection.
            wait: seconds to sleep before validating.

        Returns:
            The ``diff_collections`` result when ``validate`` is true,
            otherwise ``None``.
        """
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']
        add_list = self.add(source_collection, diff['add'])
        delete_list = self.delete(collection, diff['delete'])
        update_list = self.update(diff['update'])
        t00 = time()
        print('Adding new {} docs...'.format(len(diff['add'])))
        t0 = time()
        bulk(self._es, add_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Deleting {} docs'.format(len(diff['delete'])))
        t0 = time()
        bulk(self._es, delete_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Updating {} docs'.format(len(diff['update'])))
        t0 = time()
        bulk(self._es, update_list)
        print("Done. [{}]".format(timesofar(t0)))
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        if not validate:
            return None

        # Space-indented throughout: the original mixed a tab into this
        # section, which Python 3 rejects.
        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": collection
                        }
                    }
                }
            }
        }
        data = self._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        self._src[temp_collection].drop()
        try:
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
        finally:
            # Drop the scratch collection even when loading or diffing fails.
            self._src[temp_collection].drop()
        # Report elapsed time like every other step, not the raw timestamp.
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
示例#9
0
def validate(build_config=None):
    """Diff the last-built MongoDB target collection against its ES index.

    The build record is the last entry of the 'build' list in the src_build
    document whose ``_id`` equals ``build_config``; its 'target' value names
    both the MongoDB collection and the ES index. ES is reached at
    127.0.0.1 on the port given by the global ``es_local_tunnel_port``.

    Returns the result of ``diff_collections`` (parallel, step 10000).
    """
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer

    build_doc = get_src_build().find_one({'_id': build_config})
    latest_build = build_doc['build'][-1]
    print("Last build record:")
    pprint(latest_build)
    target_name = latest_build['target']

    mongo_backend = GeneDocMongoDBBackend(get_target_db()[target_name])
    es_host = '127.0.0.1:' + str(es_local_tunnel_port)
    es_backend = GeneDocESBackend(
        ESIndexer(es_index_name=target_name, es_host=es_host))
    return diff_collections(mongo_backend, es_backend,
                            use_parallel=True, step=10000)
示例#10
0
def validate(build_config=None):
    """Compare the most recent build's MongoDB collection with its ES index.

    Looks up the src_build document with ``_id == build_config``, prints the
    last item of its 'build' list, and uses that item's 'target' value as the
    name of both the MongoDB collection and the ES index. The ES host is
    '127.0.0.1:' followed by the global ``es_local_tunnel_port``.

    Returns the ``diff_collections`` result (``use_parallel=True``,
    ``step=10000``).
    """
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer

    config_doc = get_src_build().find_one({'_id': build_config})
    builds = config_doc['build']
    newest = builds[-1]
    print("Last build record:")
    pprint(newest)
    name = newest['target']

    target_db = get_target_db()
    mongo_side = GeneDocMongoDBBackend(target_db[name])
    indexer = ESIndexer(es_index_name=name,
                        es_host='127.0.0.1:' + str(es_local_tunnel_port))
    es_side = GeneDocESBackend(indexer)
    return diff_collections(mongo_side, es_side, use_parallel=True, step=10000)
示例#11
0
def diff2src(use_parallel=True, noconfirm=False):
    """Interactively pick two 'genedoc*' sources and diff them.

    Candidate sources are the collections of the target MongoDB database and
    the ES indices whose names start with 'genedoc'. Each candidate is a
    ``(name, count, kind)`` tuple where ``kind`` is 'mongodb' or 'es'. The
    user picks two of them through ``_pick_one``.

    Args:
        use_parallel: forwarded to ``diff.diff_collections``.
        noconfirm: when true, skip the interactive "Continue?" prompt.

    Returns:
        The ``diff.diff_collections`` result, or ``None`` when the prompt is
        answered with anything other than "Y".
    """
    target_db = get_target_db()
    src_li = [(name, target_db[name].count(), 'mongodb')
              for name in sorted(target_db.collection_names())
              if name.startswith('genedoc')]

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    # A bare ``print`` is a no-op expression when print is a function;
    # print("") emits the blank line under both Python 2 and Python 3.
    print("")
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if not (noconfirm or ask("Continue?") == "Y"):
        return None
    return diff.diff_collections(sync_src,
                                 sync_target,
                                 use_parallel=use_parallel)
示例#12
0
def sync_index(config, use_parallel=True, noconfirm=False):
    """Diff a MongoDB target collection against its 'genedoc_<name>' ES index.

    Args:
        config: build configuration passed to
            ``DataBuilder.load_build_config``.
        use_parallel: stored on the ES indexer as ``use_parallel``.
        noconfirm: when true, skip the interactive "Continue?" prompt.

    Returns:
        The result of ``diff.diff_collections``, or ``None`` when the prompt
        is answered with anything other than "Y".
    """
    builder = DataBuilder(backend='mongodb')
    builder.load_build_config(config)
    mongo_col = builder.pick_target_collection()
    index_name = 'genedoc_' + builder._build_config['name']

    sync_src = backend.GeneDocMongoDBBackend(mongo_col)

    indexer = ESIndexer(builder.get_mapping())
    indexer.ES_INDEX_NAME = index_name
    indexer.step = 10000
    indexer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(indexer)

    src_summary = '\tsync_src:\t{:<40}{}\t{}'.format(
        mongo_col.name, sync_src.name, sync_src.count())
    print(src_summary)
    target_summary = '\tsync_target\t{:<40}{}\t{}'.format(
        index_name, sync_target.name, sync_target.count())
    print(target_summary)

    confirmed = noconfirm or ask("Continue?") == "Y"
    if not confirmed:
        return None
    return diff.diff_collections(sync_src, sync_target)
示例#13
0
def diff2src(use_parallel=True, noconfirm=False):
    """Interactively pick two 'genedoc*' sources and diff them.

    Candidate sources are the collections of the target MongoDB database and
    the ES indices whose names start with 'genedoc'. Each candidate is a
    ``(name, count, kind)`` tuple where ``kind`` is 'mongodb' or 'es'. The
    user picks two of them through ``_pick_one``.

    Args:
        use_parallel: forwarded to ``diff.diff_collections``.
        noconfirm: when true, skip the interactive "Continue?" prompt.

    Returns:
        The ``diff.diff_collections`` result, or ``None`` when the prompt is
        answered with anything other than "Y".
    """
    target_db = get_target_db()
    src_li = [(name, target_db[name].count(), 'mongodb')
              for name in sorted(target_db.collection_names())
              if name.startswith('genedoc')]

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    # Single-argument print(...) calls behave the same as the original
    # Python 2 print statements and also parse on Python 3.
    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    # print("") emits the blank line on both Python 2 and Python 3.
    print("")
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)

    sync_src, sync_target = sync_li
    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if not (noconfirm or ask("Continue?") == "Y"):
        return None
    return diff.diff_collections(sync_src, sync_target,
                                 use_parallel=use_parallel)
示例#14
0
def diff_two(col_1, col_2, use_parallel=True):
    """Diff two collections of the target database, looked up by key.

    ``col_1`` and ``col_2`` index into the object returned by
    ``get_target_db()``; ``use_parallel`` is forwarded to
    ``diff_collections``, whose result is returned.
    """
    db = get_target_db()
    first, second = (GeneDocMongoDBBackend(db[key]) for key in (col_1, col_2))
    return diff_collections(first, second, use_parallel=use_parallel)
示例#15
0
def sync_from_one_diff(index, collection, diff_filepath, validate=False, wait=60, dryrun=False, returncnt=False, save2file=None):
    """Apply one saved diff to an ES index through an ``ESSyncer``.

    The diff object is loaded with ``loadobj`` and is read through its
    'source', 'add', 'delete' and 'update' keys.

    Args:
        index: index name given to ``ESSyncer``.
        collection: passed to ``sync.delete`` and ``sync.update2``; also the
            ES field / ``_source`` selector used during validation and the
            prefix of the temporary collection name.
        diff_filepath: path handed to ``loadobj``.
        validate: when true (and ``returncnt`` is false), read the docs back
            from ES into a temporary collection and diff it against the
            source collection.
        wait: seconds to sleep before validating.
        dryrun: when true, skip the three ``bulk`` calls.
        returncnt: when true, return the add/delete/update counts; this takes
            precedence over ``validate``.
        save2file: when truthy, a writable file object; every operation is
            written to it with ``json.dump`` and nothing is sent to ES.

    Returns:
        ``None`` when ``save2file`` is used or neither ``returncnt`` nor
        ``validate`` is set; a dict of counts when ``returncnt`` is true;
        otherwise the ``diff_collections`` validation result.
    """
    sync = ESSyncer(index=index)
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except Exception as exc:
            # Adding stays best-effort, but the failure is reported and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("Warning: adding docs failed: {}".format(exc))
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # Flush and refresh; a failure here is reported but does not abort.
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except Exception as exc:
        print("Warning: flush/refresh failed: {}".format(exc))

    print("="*20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES to finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            # Filter on the synced collection's field rather
                            # than a hard-coded 'clinvar', matching the
                            # ``_source`` selector below.
                            "field": collection
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        try:
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
        finally:
            # Drop the scratch collection even when loading or diffing fails.
            sync._src[temp_collection].drop()
        # Report elapsed time like every other step, not the raw timestamp.
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
示例#16
0
文件: sync.py 项目: SuLab/mygene.info
def diff_two(col_1, col_2, use_parallel=True):
    """Return the diff between two collections of the target database.

    Both arguments are used as keys into ``get_target_db()``; each resulting
    collection is wrapped in a ``GeneDocMongoDBBackend`` before being handed
    to ``diff_collections`` together with ``use_parallel``.
    """
    target_db = get_target_db()
    left_backend = GeneDocMongoDBBackend(target_db[col_1])
    right_backend = GeneDocMongoDBBackend(target_db[col_2])
    outcome = diff_collections(left_backend,
                               right_backend,
                               use_parallel=use_parallel)
    return outcome