def clone_index(createidx=False, test=True, new_idx='myvariant_current_3',
                src_idx='myvariant_all_1', doc_type='variant', step=10000,
                number_of_shards=10):
    """Copy every document from one Elasticsearch index into another.

    Documents are read from ``src_idx`` with ``ESIndexer.doc_feeder`` and
    written to ``new_idx`` in batches of ``step`` through ``do_index``
    (called with ``update=True``).

    :param createidx: when true, create ``new_idx`` first (with
        ``number_of_shards`` shards) and install the mapping returned by
        ``mapping.get_mapping()``.
    :param test: safety switch; while true (the default) the function
        returns immediately without touching anything.
    :param new_idx: name of the target index.
    :param src_idx: name of the index the documents are read from.
    :param doc_type: document type used for reading, mapping and writing.
    :param step: batch size used both for the feeder and for indexing.
    :param number_of_shards: shard count used when the index is created.
    :return: ``None``.
    """
    if test:
        # Guard against accidental runs: the default call is a no-op.
        return

    # Imported lazily so the guarded no-op path needs no project modules.
    from utils.es import ESIndexer
    from utils.common import iter_n

    if createidx:
        from mapping import get_mapping
        mapping = get_mapping()
        body = {'settings': {'number_of_shards': number_of_shards}}
        # NOTE(review): `es` is not defined inside this function; presumably
        # it is a module-level Elasticsearch client -- confirm.
        es.indices.create(new_idx, body=body)
        es.indices.put_mapping(index=new_idx, doc_type=doc_type, body=mapping)

    esi = ESIndexer()
    doc_iter = esi.doc_feeder(index=src_idx, doc_type=doc_type, step=step)
    for doc_batch in iter_n(doc_iter, step):
        # NOTE(review): `do_index` is expected to be defined elsewhere in
        # this module -- confirm.
        do_index(doc_batch, index_name=new_idx, doc_type=doc_type, step=step,
                 verbose=False, update=True)
# NOTE(review): this file defines ``ESSyncer`` a second time further down;
# the later definition replaces this one when the module is imported.
class ESSyncer():
    """Apply a precomputed collection diff to an Elasticsearch index.

    ``add``, ``delete``, ``update`` and ``update1`` are generators yielding
    action dicts (``_op_type``, ``_index``, ``_type``, ``_id`` plus a
    payload); ``main`` feeds them to ``bulk``.
    """

    # Number of ids handed to ``mexists`` per existence lookup in ``add``.
    _MEXISTS_BATCH = 100

    def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
        """Set up the ES client, the indexer helper and the source database.

        :param index: target index; falls back to ``config.ES_INDEX_NAME``.
        :param doc_type: document type; falls back to ``config.ES_DOC_TYPE``.
        :param es_host: passed to ``get_es``.
        :param step: stored on the instance as ``self.step``.
        """
        self._es = get_es(es_host)
        self._index = index or config.ES_INDEX_NAME
        self._doc_type = doc_type or config.ES_DOC_TYPE
        self._esi = ESIndexer()
        self._esi._index = self._index
        self._src = get_src_db()
        self.step = step

    def _action(self, op_type, _id, **payload):
        """Build one bulk action dict for ``_id`` on the current index."""
        action = {
            '_op_type': op_type,
            '_index': self._index,
            '_type': self._doc_type,
            '_id': _id,
        }
        action.update(payload)
        return action

    def add(self, collection, ids):
        """Yield one action per id: update if it is indexed, else create.

        The existence of every id is looked up first (in batches, via
        ``mexists``, which is iterated as ``(id, exists)`` pairs); only then
        are actions yielded. The payload is the document found by ``_id`` in
        ``self._src[collection]``.

        :param collection: name of the source collection.
        :param ids: iterable of document ids.
        """
        existence = []
        batch = []
        for _id in ids:
            batch.append(_id)
            if len(batch) == self._MEXISTS_BATCH:
                existence += self._esi.mexists(batch, verbose=False)
                batch = []
        if batch:
            # Leftover ids that did not fill a whole batch.
            existence += self._esi.mexists(batch, verbose=False)

        cnt_update = 0
        cnt_create = 0
        for _id, _exists in existence:
            src_doc = self._src[collection].find_one({'_id': _id})
            if _exists:
                # Already indexed: partial update with the source document.
                es_info = self._action('update', _id, doc=src_doc)
                cnt_update += 1
            else:
                # Not indexed yet: create a new document.
                es_info = self._action('create', _id, _source=src_doc)
                cnt_create += 1
            yield es_info
        print('items updated: ', cnt_update)
        print('items newly created: ', cnt_create)

    def delete(self, field, ids):
        """Yield actions that remove ``field`` from the given documents.

        A document whose top-level keys are exactly ``field``, or exactly
        ``field`` plus ``snpeff`` and ``vcf``, is deleted as a whole; any
        other document gets a scripted update removing ``field``. Ids that
        are not in the index are reported on stdout and skipped.

        :param field: name of the top-level field to remove.
        :param ids: iterable of document ids.
        """
        # Key sets for which nothing worth keeping remains in the document.
        removable = ({field}, {field, 'snpeff', 'vcf'})
        cnt_update = 0
        cnt_delete = 0
        for _id in ids:
            if not self._esi.exists(_id):
                print('id not exists: ', _id)
                continue
            doc_fields = set(self._esi.get_variant(_id)['_source'])
            if doc_fields in removable:
                es_info = self._action('delete', _id)
                cnt_delete += 1
            else:
                # NOTE(review): the field name is interpolated into the
                # script text; callers must only pass trusted field names.
                es_info = self._action(
                    'update', _id,
                    script='ctx._source.remove("{}")'.format(field))
                cnt_update += 1
            yield es_info
        print('items updated: ', cnt_update)
        print('items deleted: ', cnt_delete)

    def _update_one(self, _id, _patch):
        """Return an ``index`` action holding the indexed doc with the patch applied."""
        doc = self._esi.get_variant(_id)['_source']
        doc = apply_patch(doc, _patch)
        return self._action('index', _id, _source=doc)

    def update(self, id_patchs):
        """Yield an ``index`` action for each patch whose id is in the index.

        :param id_patchs: iterable of dicts with the keys ``_id`` and
            ``patch``. Ids that are not indexed are reported and skipped.
        """
        for _id_patch in id_patchs:
            _id = _id_patch['_id']
            _patch = _id_patch['patch']
            if self._esi.exists(_id):
                yield self._update_one(_id, _patch)
            else:
                print('id not exists:', _id)

    def update1(self, id_patchs):
        """Like ``update``, but delete the indexed document before yielding.

        The patched document is computed first, then the existing one is
        removed with ``delete_doc``, then the ``index`` action is yielded.
        """
        for _id_patch in id_patchs:
            _id = _id_patch['_id']
            _patch = _id_patch['patch']
            if self._esi.exists(_id):
                _es_info = self._update_one(_id, _patch)
                self._esi.delete_doc(_id)
                yield _es_info
            else:
                print('id not exists:', _id)

    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        """Apply the diff stored at ``diff_filepath`` to ``index``.

        The diff object is read with ``loadobj`` and must provide the keys
        ``source``, ``add``, ``delete`` and ``update``. Additions, deletions
        and updates are sent with ``bulk``, in that order.

        :param index: index to modify; also set on the instance and indexer.
        :param collection: field name handed to ``delete``; also used as the
            field of the validation query and as temp-collection prefix.
        :param diff_filepath: location of the stored diff.
        :param validate: when true, wait ``wait`` seconds, dump the indexed
            ``collection`` field into a temporary collection and compare it
            with the source collection.
        :param wait: seconds to sleep before validating.
        :return: the ``diff_collections`` result when ``validate`` is true,
            otherwise ``None``.
        """
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']
        # The action generators are lazy; nothing runs until bulk() pulls.
        phases = (
            ('Adding new {} docs...', diff['add'],
             self.add(source_collection, diff['add'])),
            ('Deleting {} docs', diff['delete'],
             self.delete(collection, diff['delete'])),
            ('Updating {} docs', diff['update'],
             self.update(diff['update'])),
        )
        t00 = time()
        for message, items, actions in phases:
            print(message.format(len(items)))
            t0 = time()
            bulk(self._es, actions)
            print("Done. [{}]".format(timesofar(t0)))
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))

        if validate:
            print('Waiting {}s to let ES to finish...'.format(wait), end="")
            sleep(wait)
            print("Done.")
            print("Validating...")
            t0 = time()
            q = {
                "query": {
                    "constant_score": {
                        "filter": {
                            "exists": {
                                "field": collection
                            }
                        }
                    }
                }
            }
            data = self._esi.doc_feeder(query=q, _source=collection)
            temp_collection = collection + '_temp_' + get_random_string()
            self._src[temp_collection].drop()
            try:
                load_source(temp_collection, src_data=data)
                c1 = get_backend(source_collection, 'mongodb')
                c2 = get_backend(temp_collection, 'mongodb')
                diff_result = diff_collections(c1, c2, use_parallel=False)
            finally:
                # Never leave the scratch collection behind, even on failure.
                self._src[temp_collection].drop()
            # Report elapsed time, consistent with the other "Done." lines.
            print("Done. [{}]".format(timesofar(t0)))
            return diff_result
# NOTE(review): ``ESSyncer`` is also defined earlier in this file; this later
# definition is the one that remains bound after the module is imported.
class ESSyncer():
    """Synchronise an Elasticsearch index with a stored collection diff.

    The generator methods (``add``, ``delete``, ``update``, ``update1``)
    yield action dicts carrying ``_op_type``, ``_index``, ``_type``, ``_id``
    and a payload; ``main`` passes those generators to ``bulk``.
    """

    # How many ids ``add`` sends to ``mexists`` in a single call.
    _MEXISTS_BATCH = 100

    def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
        """Create the ES client, the indexer helper and the source db handle.

        :param index: target index; defaults to ``config.ES_INDEX_NAME``.
        :param doc_type: document type; defaults to ``config.ES_DOC_TYPE``.
        :param es_host: forwarded to ``get_es``.
        :param step: kept as ``self.step``.
        """
        self._es = get_es(es_host)
        self._index = index or config.ES_INDEX_NAME
        self._doc_type = doc_type or config.ES_DOC_TYPE
        self._esi = ESIndexer()
        self._esi._index = self._index
        self._src = get_src_db()
        self.step = step

    def _action(self, op_type, _id, **payload):
        """Return a bulk action dict for ``_id`` targeting the current index."""
        action = {
            '_op_type': op_type,
            '_index': self._index,
            '_type': self._doc_type,
            '_id': _id,
        }
        action.update(payload)
        return action

    def add(self, collection, ids):
        """Yield an ``update`` or ``create`` action for every id.

        All ids are first checked against the index through ``mexists``
        (called per batch and consumed as ``(id, exists)`` pairs). Afterwards
        one action per id is yielded whose payload is the document looked up
        by ``_id`` in ``self._src[collection]``: ``update`` with ``doc`` when
        the id is indexed, ``create`` with ``_source`` otherwise.

        :param collection: name of the source collection.
        :param ids: iterable of document ids.
        """
        existence = []
        pending = []
        for _id in ids:
            pending.append(_id)
            if len(pending) == self._MEXISTS_BATCH:
                existence += self._esi.mexists(pending, verbose=False)
                pending = []
        if pending:
            # Flush the final, partially filled batch.
            existence += self._esi.mexists(pending, verbose=False)

        cnt_update = 0
        cnt_create = 0
        for _id, _exists in existence:
            src_doc = self._src[collection].find_one({'_id': _id})
            if _exists:
                es_info = self._action('update', _id, doc=src_doc)
                cnt_update += 1
            else:
                es_info = self._action('create', _id, _source=src_doc)
                cnt_create += 1
            yield es_info
        print('items updated: ', cnt_update)
        print('items newly created: ', cnt_create)

    def delete(self, field, ids):
        """Yield actions removing ``field`` from the documents in ``ids``.

        When the indexed document's top-level keys are exactly ``field`` or
        exactly ``field``, ``snpeff`` and ``vcf``, a ``delete`` action is
        yielded for the whole document. Otherwise an ``update`` action with
        a script removing ``field`` is yielded. Ids missing from the index
        are printed and skipped.

        :param field: name of the top-level field to remove.
        :param ids: iterable of document ids.
        """
        # Documents consisting only of these keys are dropped entirely.
        removable = ({field}, {field, 'snpeff', 'vcf'})
        cnt_update = 0
        cnt_delete = 0
        for _id in ids:
            if not self._esi.exists(_id):
                print('id not exists: ', _id)
                continue
            doc_fields = set(self._esi.get_variant(_id)['_source'])
            if doc_fields in removable:
                es_info = self._action('delete', _id)
                cnt_delete += 1
            else:
                # NOTE(review): ``field`` is formatted straight into the
                # script source; only trusted field names should be passed.
                es_info = self._action(
                    'update', _id,
                    script='ctx._source.remove("{}")'.format(field))
                cnt_update += 1
            yield es_info
        print('items updated: ', cnt_update)
        print('items deleted: ', cnt_delete)

    def _update_one(self, _id, _patch):
        """Patch the indexed document and wrap the result in an ``index`` action."""
        doc = self._esi.get_variant(_id)['_source']
        doc = apply_patch(doc, _patch)
        return self._action('index', _id, _source=doc)

    def update(self, id_patchs):
        """Yield ``index`` actions for the patches whose id is indexed.

        :param id_patchs: iterable of dicts providing ``_id`` and ``patch``.
            Entries whose id is not in the index are printed and skipped.
        """
        for _id_patch in id_patchs:
            _id = _id_patch['_id']
            _patch = _id_patch['patch']
            if self._esi.exists(_id):
                yield self._update_one(_id, _patch)
            else:
                print('id not exists:', _id)

    def update1(self, id_patchs):
        """Variant of ``update`` that removes the old document first.

        For each indexed id the patched document is built, the existing
        document is removed through ``delete_doc``, and only then is the
        ``index`` action yielded.
        """
        for _id_patch in id_patchs:
            _id = _id_patch['_id']
            _patch = _id_patch['patch']
            if self._esi.exists(_id):
                _es_info = self._update_one(_id, _patch)
                self._esi.delete_doc(_id)
                yield _es_info
            else:
                print('id not exists:', _id)

    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        """Load a diff and push its additions, deletions and updates to ``index``.

        The object returned by ``loadobj(diff_filepath)`` must contain the
        keys ``source``, ``add``, ``delete`` and ``update``. The three phases
        are executed with ``bulk`` in that order, each with timing output.

        :param index: index to modify; stored on the instance and indexer.
        :param collection: field name passed to ``delete``; also the field of
            the validation query and the prefix of the temporary collection.
        :param diff_filepath: location of the stored diff.
        :param validate: when true, sleep ``wait`` seconds, copy the indexed
            ``collection`` field into a temporary collection and diff it
            against the source collection.
        :param wait: seconds to sleep before validation starts.
        :return: result of ``diff_collections`` if ``validate`` is true,
            otherwise ``None``.
        """
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']
        # Generators are lazy, so no request is made before bulk() consumes them.
        phases = (
            ('Adding new {} docs...', diff['add'],
             self.add(source_collection, diff['add'])),
            ('Deleting {} docs', diff['delete'],
             self.delete(collection, diff['delete'])),
            ('Updating {} docs', diff['update'],
             self.update(diff['update'])),
        )
        t00 = time()
        for message, items, actions in phases:
            print(message.format(len(items)))
            t0 = time()
            bulk(self._es, actions)
            print("Done. [{}]".format(timesofar(t0)))
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))

        if validate:
            print('Waiting {}s to let ES to finish...'.format(wait), end="")
            sleep(wait)
            print("Done.")
            print("Validating...")
            t0 = time()
            q = {
                "query": {
                    "constant_score": {
                        "filter": {
                            "exists": {
                                "field": collection
                            }
                        }
                    }
                }
            }
            data = self._esi.doc_feeder(query=q, _source=collection)
            temp_collection = collection + '_temp_' + get_random_string()
            self._src[temp_collection].drop()
            try:
                load_source(temp_collection, src_data=data)
                c1 = get_backend(source_collection, 'mongodb')
                c2 = get_backend(temp_collection, 'mongodb')
                diff_result = diff_collections(c1, c2, use_parallel=False)
            finally:
                # Clean up the scratch collection whether or not the diff succeeded.
                self._src[temp_collection].drop()
            # Print the elapsed time, matching the other "Done." messages.
            print("Done. [{}]".format(timesofar(t0)))
            return diff_result