def clone_index(createidx=False, test=True):
    if test:
        return
    from utils.es import ESIndexer
    from utils.common import iter_n
    new_idx = 'myvariant_current_3'
    step = 10000
    if createidx:
        from mapping import get_mapping
        m = get_mapping()
        body = {'settings': {'number_of_shards': 10}}   # ###
        # NOTE: "es" here presumably refers to a module-level Elasticsearch client defined elsewhere.
        es.indices.create(new_idx, body=body)
        es.indices.put_mapping(index=new_idx, doc_type='variant', body=m)
    # helpers.reindex(es, source_index='myvariant_all',
    #                 target_index=new_idx, chunk_size=10000)
    esi = ESIndexer()
    doc_iter = esi.doc_feeder(index='myvariant_all_1', doc_type='variant', step=step)
    for doc_batch in iter_n(doc_iter, step):
        do_index(doc_batch, index_name=new_idx, doc_type='variant',
                 step=step, verbose=False, update=True)
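# A minimal sketch of the chunking pattern used above: iter_n (from utils.common)
# presumably groups an iterator into lists of at most n items, so each call to
# do_index receives one batch. The helper below is an illustrative stand-in, not
# the actual utils.common implementation.
from itertools import islice

def iter_n_sketch(iterable, n):
    """Yield successive lists of at most n items from any iterable."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            break
        yield chunk

# e.g. feeding hypothetical 10k-document batches to an indexing function:
# for batch in iter_n_sketch(doc_iter, 10000):
#     do_index(batch, index_name='myvariant_current_3', doc_type='variant')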
def sync_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    print '\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name,
                                             sync_src.count())
    print '\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count())
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target)
        return changes
def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
    self._es = get_es(es_host)
    self._index = index or config.ES_INDEX_NAME
    self._doc_type = doc_type or config.ES_DOC_TYPE
    self._esi = ESIndexer(es_host=es_host)
    self._esi._index = self._index
    self._src = get_src_db()
    self.step = step
def sync_index(self, use_parallel=True):
    from utils import diff
    sync_src = self.get_target_collection()

    es_idxer = ESIndexer(self.get_mapping())
    es_idxer.ES_INDEX_NAME = sync_src.target_collection.name
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = databuild.backend.GeneDocESBackend(es_idxer)

    changes = diff.diff_collections(sync_src, sync_target)
    return changes
def _get_ids_worker(args):
    from utils.es import ESIndexer
    from pyes import MatchAllQuery
    es_kwargs, start, step = args
    q = MatchAllQuery().search()
    q.sort = [{'entrezgene': 'asc'}, {'ensembl.gene': 'asc'}]
    q.fields = []
    q.start = start
    q.size = step
    esi = ESIndexer(**es_kwargs)
    cnt = esi.count()['count']
    res = esi.conn.search_raw(q)
    assert res['hits']['total'] == cnt
    return [doc['_id'] for doc in res['hits']['hits']]
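# A hedged sketch of how a worker like _get_ids_worker might be driven in parallel:
# each process fetches one slice of ids, offset by "start" and bounded by "step".
# The driver function name, pool size, and arguments below are illustrative
# assumptions, not the project's actual driver code.
from multiprocessing import Pool

def get_all_ids_sketch(es_kwargs, total, step=10000, processes=4):
    # build one (es_kwargs, start, step) task per slice of the index
    tasks = [(es_kwargs, start, step) for start in range(0, total, step)]
    with Pool(processes=processes) as pool:
        id_slices = pool.map(_get_ids_worker, tasks)
    # flatten the per-slice results into one id list
    return [_id for slice_ in id_slices for _id in slice_]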
def __init__(self, build_config=None, backend='mongodb'):
    self.src = get_src_db()
    self.step = 10000
    self.use_parallel = False
    self.merge_logging = True     # save output into a logging file when merge is called.
    self.max_build_status = 10    # max no. of records kept in "build" field of src_build collection.
    self.using_ipython_cluster = False
    self.shutdown_ipengines_after_done = False
    self.log_folder = LOG_FOLDER

    self._build_config = build_config
    self._entrez_geneid_d = None
    self._idmapping_d_cache = {}

    self.get_src_master()

    if backend == 'mongodb':
        self.target = databuild.backend.GeneDocMongoDBBackend()
    elif backend == 'es':
        self.target = databuild.backend.GeneDocESBackend(ESIndexer())
    elif backend == 'couchdb':
        from config import COUCHDB_URL
        import couchdb
        self.target = databuild.backend.GeneDocCouchDBBackend(couchdb.Server(COUCHDB_URL))
    elif backend == 'memory':
        self.target = databuild.backend.GeneDocMemeoryBackend()
    else:
        raise ValueError('Invalid backend "%s".' % backend)
def test():
    target = get_target_db()
    sync_src = backend.GeneDocMongoDBBackend(
        target['genedoc_mygene_allspecies_20130402_uiu7bkyi'])
    idxer = ESIndexer()
    sync_target = backend.GeneDocESBackend(idxer)
    return sync_src, sync_target
def get_backend(target_name, bk_type, **kwargs):
    '''Return a backend instance for the given target_name and backend type.
       Currently supports MongoDB and ES backends.
    '''
    if bk_type == 'mongodb':
        return GeneDocMongoDBBackend(target_name)
    elif bk_type == 'es':
        esi = ESIndexer(target_name, **kwargs)
        return GeneDocESBackend(esi)
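# Illustrative usage of get_backend: the collection/index name below is hypothetical.
# A MongoDB-backed and an ES-backed view of the same merged collection can then be
# passed to diff_collections for comparison.
mongo_backend = get_backend('genedoc_mygene_allspecies_current', 'mongodb')
es_backend = get_backend('genedoc_mygene_allspecies_current', 'es')
# changes = diff_collections(mongo_backend, es_backend, use_parallel=False)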
def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1,
                 use_parallel=False, es_host=None, es_index_name=None):
    """Build an ES index from the last successfully merged mongodb collection.

    The optional "es_host" argument can be used to specify another ES host,
    otherwise the default ES_HOST is used.
    The optional "es_index_name" argument can be used to pass an alternative
    index name, otherwise the mongodb collection name is used.
    """
    from pprint import pprint
    self.load_build_config(build_config)
    last_build = self._build_config['build'][last_build_idx]
    print "Last build record:"
    pprint(last_build)
    assert last_build['status'] == 'success', \
        "Abort. Last build did not succeed."
    assert last_build['target_backend'] == "mongodb", \
        'Abort. Last build needs to be built using the "mongodb" backend.'
    assert last_build.get('stats', None), \
        'Abort. Last build stats are not available.'
    self._stats = last_build['stats']
    assert last_build.get('target', None), \
        'Abort. Last build target_collection is not available.'

    #target_collection = last_build['target']
    target_collection = "genedoc_{}_current".format(build_config)   # ######
    _db = get_target_db()
    target_collection = _db[target_collection]
    print
    print 'Source: ', target_collection.name

    _mapping = self.get_mapping()
    _meta = {}
    src_version = self.get_src_version()
    if src_version:
        _meta['src_version'] = src_version
    if getattr(self, '_stats', None):
        _meta['stats'] = self._stats
    if 'timestamp' in last_build:
        _meta['timestamp'] = last_build['timestamp']
    if _meta:
        _mapping['_meta'] = _meta

    es_index_name = es_index_name or target_collection.name
    es_idxer = ESIndexer(mapping=_mapping,
                         es_index_name=es_index_name,
                         es_host=es_host,
                         step=5000)
    if build_config == 'mygene_allspecies':
        es_idxer.number_of_shards = 10   # default 5
    print "ES host:", es_idxer.conn.servers[0].geturl()
    print "ES index:", es_index_name

    if ask("Continue to build ES index?") == 'Y':
        es_idxer.use_parallel = use_parallel
        #es_idxer.s = 609000
        if es_idxer.conn.indices.exists_index(es_idxer.ES_INDEX_NAME):
            if ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            else:
                print "Abort."
                return
        es_idxer.create_index()
        #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
        es_idxer.build_index(target_collection, verbose=False)
def make_test_index():

    def get_sample_gene(gene):
        qbdr = ESQueryBuilder(fields=['_source'], size=1000)
        _query = qbdr.dis_max_query(gene)
        _query = qbdr.add_species_custom_filters_score(_query)
        _q = {'query': _query}
        if qbdr.options:
            _q.update(qbdr.options)
        esq = ESQuery()
        res = esq._search(_q)
        return [h['_source'] for h in res['hits']['hits']]

    gli = get_sample_gene('CDK2') + \
        get_sample_gene('BTK') + \
        get_sample_gene('insulin')

    from utils.es import ESIndexer
    index_name = 'genedoc_2'
    index_type = 'gene_sample'
    esidxer = ESIndexer(None, None)
    conn = esidxer.conn
    try:
        esidxer.delete_index_type(index_type)
    except:
        pass
    mapping = dict(conn.get_mapping('gene', index_name)['gene'])
    print conn.put_mapping(index_type, mapping, [index_name])

    print "Building index..."
    cnt = 0
    for doc in gli:
        conn.index(doc, index_name, index_type, doc['_id'])
        cnt += 1
        print cnt, ':', doc['_id']
    print conn.flush()
    print conn.refresh()
    print 'Done! - {} docs indexed.'.format(cnt)
def sync_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']

    sync_src = backend.GeneDocMongoDBBackend(target_collection)

    es_idxer = ESIndexer(bdr.get_mapping())
    es_idxer.ES_INDEX_NAME = target_es_index
    es_idxer.step = 10000
    es_idxer.use_parallel = use_parallel
    sync_target = backend.GeneDocESBackend(es_idxer)

    print('\tsync_src:\t{:<40}{}\t{}'.format(target_collection.name,
                                             sync_src.name,
                                             sync_src.count()))
    print('\tsync_target\t{:<40}{}\t{}'.format(target_es_index,
                                               sync_target.name,
                                               sync_target.count()))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target)
        return changes
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print()
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)
    sync_src, sync_target = sync_li

    print('\tsync_src:\t{:<45}{}\t{}'.format(*src_1))
    print('\tsync_target\t{:<45}{}\t{}'.format(*src_2))
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target, use_parallel=use_parallel)
        return changes
def validate(build_config=None):
    from pprint import pprint
    from utils.diff import diff_collections
    from databuild.backend import GeneDocMongoDBBackend, GeneDocESBackend
    from biothings.utils.mongo import get_src_build, get_target_db
    from utils.es import ESIndexer

    src_build = get_src_build()
    _cfg = src_build.find_one({'_id': build_config})
    last_build = _cfg['build'][-1]
    print("Last build record:")
    pprint(last_build)
    target_name = last_build['target']

    mongo_target = get_target_db()
    b1 = GeneDocMongoDBBackend(mongo_target[target_name])
    b2 = GeneDocESBackend(ESIndexer(es_index_name=target_name,
                                    es_host='127.0.0.1:' + str(es_local_tunnel_port)))
    changes = diff_collections(b1, b2, use_parallel=True, step=10000)
    return changes
def diff2src(use_parallel=True, noconfirm=False):
    src_li = []

    target_db = get_target_db()
    src_li.extend([(name, target_db[name].count(), 'mongodb')
                   for name in sorted(target_db.collection_names())
                   if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print "Found {} sources:".format(len(src_li))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    src_li.remove(src_1)
    print
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
    for src in (src_1, src_2):
        if src[2] == 'mongodb':
            b = backend.GeneDocMongoDBBackend(target_db[src[0]])
        elif src[2] == 'es':
            es_idxer = ESIndexer()
            es_idxer.ES_INDEX_NAME = src[0]
            es_idxer.step = 10000
            b = backend.GeneDocESBackend(es_idxer)
        sync_li.append(b)
    sync_src, sync_target = sync_li

    print '\tsync_src:\t{:<45}{}\t{}'.format(*src_1)
    print '\tsync_target\t{:<45}{}\t{}'.format(*src_2)
    if noconfirm or ask("Continue?") == "Y":
        changes = diff.diff_collections(sync_src, sync_target, use_parallel=use_parallel)
        return changes
def build_index(self, use_parallel=True):
    target_collection = self.get_target_collection()
    if target_collection:
        es_idxer = ESIndexer(mapping=self.get_mapping())
        es_idxer.ES_INDEX_NAME = 'genedoc_' + self._build_config['name']
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        #es_idxer.s = 609000
        #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
        es_idxer.create_index()
        es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
        es_idxer.build_index(target_collection, verbose=False)
        es_idxer.optimize()
    else:
        logging.info("Error: target collection is not ready yet or failed to build.")
class ESSyncer():
    def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
        self._es = get_es(es_host)
        self._index = index or config.ES_INDEX_NAME
        self._doc_type = doc_type or config.ES_DOC_TYPE
        self._esi = ESIndexer(es_host=es_host)
        self._esi._index = self._index
        self._src = get_src_db()
        self.step = step

    def add(self, collection, ids):
        # compare id_list with the current index, get a list of ids with a true/false indicator
        cnt_update = 0
        cnt_create = 0
        for ids_chunk in iter_n(ids, 100):
            id_list_all = self._esi.mexists(ids_chunk, verbose=False)
            for _id, _exists in id_list_all:
                _doc = self._src[collection].find_one({'_id': _id})
                _doc.pop('_id')
                # case one: this id exists in the current index, so just update it
                if _exists:
                    es_info = {
                        '_op_type': 'update',
                        '_index': self._index,
                        '_type': self._doc_type,
                        '_id': _id,
                        'doc': _doc
                    }
                    cnt_update += 1
                # case two: this id does not exist in the current index, so create a new doc
                else:
                    es_info = {
                        '_op_type': 'create',
                        '_index': self._index,
                        '_type': self._doc_type,
                        "_id": _id,
                        '_source': _doc
                    }
                    cnt_create += 1
                yield es_info
        print('items updated: ', cnt_update)
        print('items newly created: ', cnt_create)

    def delete(self, field, ids):
        cnt_update = 0
        cnt_delete = 0
        for _id in ids:
            # get the doc from the index based on its id
            if self._esi.exists(_id):
                doc = self._esi.get_variant(_id)['_source']
                doc.pop('_id', None)
                # case one: only the target field (plus snpeff/vcf/hg19/hg38/chrom) is left,
                # so the whole document should be deleted
                if len(set(doc) - set([field, 'snpeff', 'vcf', 'hg19', 'hg38', 'chrom'])) == 0:
                    es_info = {
                        '_op_type': 'delete',
                        '_index': self._index,
                        '_type': self._doc_type,
                        "_id": _id,
                    }
                    cnt_delete += 1
                # case two: fields other than snpeff, vcf and the target field exist
                else:
                    # get rid of the target field and update the doc in place;
                    # this requires enabling Elasticsearch dynamic scripting
                    es_info = {
                        '_op_type': 'update',
                        '_index': self._index,
                        '_type': self._doc_type,
                        '_id': _id,
                        "script": 'ctx._source.remove("{}");ctx._source.remove("_id")'.format(field)
                    }
                    cnt_update += 1
                yield es_info
            else:
                # original snippet is truncated here; completed from the equivalent
                # ESSyncer.delete implementation later in this section
                print('id not exists: ', _id)
        print('items updated: ', cnt_update)
        print('items deleted: ', cnt_delete)
def build_index2(self, build_config='mygene_allspecies', last_build_idx=-1,
                 use_parallel=False, es_host=None, es_index_name=None,
                 noconfirm=False):
    """Build an ES index from the last successfully merged mongodb collection.

    The optional "es_host" argument can be used to specify another ES host,
    otherwise the default ES_HOST is used.
    The optional "es_index_name" argument can be used to pass an alternative
    index name, otherwise the mongodb collection name is used.
    """
    self.load_build_config(build_config)
    assert "build" in self._build_config, "Abort. No such build records for config %s" % build_config
    last_build = self._build_config['build'][last_build_idx]
    logging.info("Last build record:")
    logging.info(pformat(last_build))
    assert last_build['status'] == 'success', \
        "Abort. Last build did not succeed."
    assert last_build['target_backend'] == "mongodb", \
        'Abort. Last build needs to be built using the "mongodb" backend.'
    assert last_build.get('stats', None), \
        'Abort. Last build stats are not available.'
    self._stats = last_build['stats']
    assert last_build.get('target', None), \
        'Abort. Last build target_collection is not available.'

    # Get the source collection to build the ES index
    # IMPORTANT: the collection in last_build['target'] does not contain the _timestamp field,
    # only the "genedoc_*_current" collection does. When "timestamp" is enabled
    # in mappings, the last_build['target'] collection won't be indexed by ES correctly,
    # therefore, we use the "genedoc_*_current" collection as the source here:
    #target_collection = last_build['target']
    target_collection = "genedoc_{}_current".format(build_config)
    _db = get_target_db()
    target_collection = _db[target_collection]
    logging.info("")
    logging.info('Source: %s' % target_collection.name)

    _mapping = self.get_mapping()
    _meta = {}
    src_version = self.get_src_version()
    if src_version:
        _meta['src_version'] = src_version
    if getattr(self, '_stats', None):
        _meta['stats'] = self._stats
    if 'timestamp' in last_build:
        _meta['timestamp'] = last_build['timestamp']
    if _meta:
        _mapping['_meta'] = _meta

    es_index_name = es_index_name or target_collection.name
    es_idxer = ESIndexer(mapping=_mapping,
                         es_index_name=es_index_name,
                         es_host=es_host,
                         step=5000)
    if build_config == 'mygene_allspecies':
        es_idxer.number_of_shards = 10   # default 5
    es_idxer.check()

    if noconfirm or ask("Continue to build ES index?") == 'Y':
        es_idxer.use_parallel = use_parallel
        #es_idxer.s = 609000
        if es_idxer.exists_index(es_idxer.ES_INDEX_NAME):
            if noconfirm or ask('Index "{}" exists. Delete?'.format(es_idxer.ES_INDEX_NAME)) == 'Y':
                es_idxer.conn.indices.delete(es_idxer.ES_INDEX_NAME)
            else:
                logging.info("Abort.")
                return
        es_idxer.create_index()
        #es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=True)
        es_idxer.build_index(target_collection, verbose=False)
def build_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']
    if target_collection:
        es_idxer = ESIndexer(mapping=bdr.get_mapping())
        es_idxer.ES_INDEX_NAME = target_es_index
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        es_server = es_idxer.conn.servers[0].geturl()
        print("ES target: {}/{}/{}".format(es_server,
                                           es_idxer.ES_INDEX_NAME,
                                           es_idxer.ES_INDEX_TYPE))
        if noconfirm or ask("Continue?") == 'Y':
            #es_idxer.s = 609000
            #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            es_idxer.create_index()
            es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
            es_idxer.build_index(target_collection, verbose=False)
            es_idxer.optimize()
        else:
            print("Aborted.")
    else:
        print("Error: target collection is not ready yet or failed to build.")
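# Illustrative invocation of build_index above; the config name is an example that
# appears elsewhere in this section, and the flags simply mirror the function's defaults.
build_index('mygene_allspecies', use_parallel=True, noconfirm=True)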
class ESSyncer():
    def __init__(self, index=None, doc_type=None, es_host=None, step=5000):
        self._es = get_es(es_host)
        self._index = index or config.ES_INDEX_NAME
        self._doc_type = doc_type or config.ES_DOC_TYPE
        self._esi = ESIndexer()
        self._esi._index = self._index
        self._src = get_src_db()
        self.step = step

    def add(self, collection, ids):
        # compare id_list with the current index, get a list of ids with a true/false indicator
        id_list = []
        id_list_all = []
        cnt = 0
        for _id in ids:
            id_list.append(_id)
            cnt += 1
            if len(id_list) == 100:
                id_list_all += self._esi.mexists(id_list, verbose=False)
                id_list = []
        if id_list:
            id_list_all += self._esi.mexists(id_list, verbose=False)
        cnt_update = 0
        cnt_create = 0
        for _id, _exists in id_list_all:
            # case one: this id exists in the current index, so just update it
            if _exists:
                es_info = {
                    '_op_type': 'update',
                    '_index': self._index,
                    '_type': self._doc_type,
                    "_id": _id,
                    'doc': self._src[collection].find_one({'_id': _id})
                }
                cnt_update += 1
            # case two: this id does not exist in the current index, so create a new doc
            else:
                es_info = {
                    '_op_type': 'create',
                    '_index': self._index,
                    '_type': self._doc_type,
                    "_id": _id,
                    '_source': self._src[collection].find_one({'_id': _id})
                }
                cnt_create += 1
            yield es_info
        print('items updated: ', cnt_update)
        print('items newly created: ', cnt_create)

    def delete(self, field, ids):
        cnt_update = 0
        cnt_delete = 0
        for _id in ids:
            # get the doc from the index based on its id
            if self._esi.exists(_id):
                doc = self._esi.get_variant(_id)['_source']
                # case one: only the target field (or the target field plus snpeff/vcf) is left,
                # so the whole document should be deleted
                if set(doc) == set([field]) or set(doc) == set([field, 'snpeff', 'vcf']):
                    es_info = {
                        '_op_type': 'delete',
                        '_index': self._index,
                        '_type': self._doc_type,
                        "_id": _id,
                    }
                    cnt_delete += 1
                # case two: fields other than snpeff, vcf and the target field exist
                else:
                    # get rid of the target field and update the doc in place
                    es_info = {
                        '_op_type': 'update',
                        '_index': self._index,
                        '_type': self._doc_type,
                        "_id": _id,
                        "script": 'ctx._source.remove("{}")'.format(field)
                    }
                    cnt_update += 1
                yield es_info
            else:
                print('id not exists: ', _id)
        print('items updated: ', cnt_update)
        print('items deleted: ', cnt_delete)

    def _update_one(self, _id, _patch):
        doc = self._esi.get_variant(_id)['_source']
        doc = apply_patch(doc, _patch)
        es_info = {
            '_op_type': 'index',
            '_index': self._index,
            '_type': self._doc_type,
            "_id": _id,
            '_source': doc
        }
        return es_info

    def update(self, id_patchs):
        for _id_patch in id_patchs:
            _id = _id_patch['_id']
            _patch = _id_patch['patch']
            if self._esi.exists(_id):
                _es_info = self._update_one(_id, _patch)
                yield _es_info
            else:
                print('id not exists:', _id)

    def update1(self, id_patchs):
        for _id_patch in id_patchs:
            _id = _id_patch['_id']
            _patch = _id_patch['patch']
            if self._esi.exists(_id):
                _es_info = self._update_one(_id, _patch)
                self._esi.delete_doc(_id)
                yield _es_info
            else:
                print('id not exists:', _id)

    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']
        add_list = self.add(source_collection, diff['add'])
        delete_list = self.delete(collection, diff['delete'])
        update_list = self.update(diff['update'])

        t00 = time()
        print('Adding {} new docs...'.format(len(diff['add'])))
        t0 = time()
        bulk(self._es, add_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Deleting {} docs'.format(len(diff['delete'])))
        t0 = time()
        bulk(self._es, delete_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Updating {} docs'.format(len(diff['update'])))
        t0 = time()
        bulk(self._es, update_list)
        print("Done. [{}]".format(timesofar(t0)))
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))

        if validate:
            print('Waiting {}s to let ES finish...'.format(wait), end="")
            sleep(wait)
            print("Done.")
            print("Validating...")
            t0 = time()
            q = {
                "query": {
                    "constant_score": {
                        "filter": {
                            "exists": {
                                "field": collection
                            }
                        }
                    }
                }
            }
            data = self._esi.doc_feeder(query=q, _source=collection)
            temp_collection = collection + '_temp_' + get_random_string()
            self._src[temp_collection].drop()
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
            self._src[temp_collection].drop()
            print("Done. [{}]".format(timesofar(t0)))
            return diff_result
def build_index(config, use_parallel=True, noconfirm=False):
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    target_collection = bdr.pick_target_collection()
    target_es_index = 'genedoc_' + bdr._build_config['name']
    if target_collection:
        es_idxer = ESIndexer(mapping=bdr.get_mapping())
        es_idxer.ES_INDEX_NAME = target_es_index
        es_idxer.step = 10000
        es_idxer.use_parallel = use_parallel
        es_server = es_idxer.conn.servers[0].geturl()
        print "ES target: {}/{}/{}".format(es_server, es_idxer.ES_INDEX_NAME,
                                           es_idxer.ES_INDEX_TYPE)
        if ask("Continue?") == 'Y':
            #es_idxer.s = 609000
            #es_idxer.conn.indices.delete_index(es_idxer.ES_INDEX_NAME)
            es_idxer.create_index()
            es_idxer.delete_index_type(es_idxer.ES_INDEX_TYPE, noconfirm=noconfirm)
            es_idxer.build_index(target_collection, verbose=False)
            es_idxer.optimize()
        else:
            print "Aborted."
    else:
        print "Error: target collection is not ready yet or failed to build."