def pre_index(new_index, old_index, alias, indexer, settings):
    """
    This sets up everything needed before indexing:

    * Flags the database.
    * Creates the new index.

    """
    # Flag the database to indicate that the reindexing has started.
    _print('Flagging the database to start the reindexation.', alias)
    Reindexing.flag_reindexing(new_index=new_index, old_index=old_index,
                               alias=alias)
    time.sleep(5)  # Give celeryd some time to flag the DB.

    _print('Creating the mapping for index {index}.'.format(index=new_index),
           alias)

    # Update settings with mapping.
    settings = {
        'settings': settings,
        'mappings': indexer.get_mapping(),
    }

    # Create index and mapping.
    try:
        ES.indices.create(index=new_index, body=settings)
    except elasticsearch.ElasticsearchException as e:
        raise CommandError('ERROR: New index [%s] already exists? %s'
                           % (new_index, e))

    # Don't return until the health is green. By default waits for 30s.
    ES.cluster.health(index=new_index, wait_for_status='green',
                      wait_for_relocating_shards=0)
def pre_index(new_index, old_index, alias, index_name, settings):
    """
    This sets up everything needed before indexing:

    * Flags the database.
    * Creates the new index.

    """
    indexer = INDEXER_MAP[index_name]

    # Flag the database to indicate that the reindexing has started.
    _print('Flagging the database to start the reindexation.', alias)
    Reindexing.flag_reindexing(new_index=new_index, old_index=old_index,
                               alias=alias)
    time.sleep(5)  # Give the celery worker some time to flag the DB.

    _print('Creating the mapping for index {index}.'.format(index=new_index),
           alias)

    # Update settings with mapping.
    settings = {
        'settings': settings,
        'mappings': indexer.get_mapping(),
    }

    # Create index and mapping.
    try:
        ES.indices.create(index=new_index, body=settings)
    except elasticsearch.ElasticsearchException as e:
        raise CommandError('ERROR: New index [%s] already exists? %s'
                           % (new_index, e))

    # Don't return until the health is green. By default waits for 30s.
    ES.cluster.health(index=new_index, wait_for_status='green',
                      wait_for_relocating_shards=0)
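# A hedged sketch (not taken from the original source) of the lookup table the
# variant above assumes: INDEXER_MAP maps a mapping type name back to its
# indexer class, so the celery task payload only has to carry a short string
# instead of a class object. INDEXERS and get_mapping_type_name() appear
# elsewhere in this code; the dict construction itself is an assumption.
INDEXER_MAP = dict(
    (indexer.get_mapping_type_name(), indexer) for indexer in INDEXERS)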
def test_get_indices(self):
    # Not reindexing.
    assert not Reindexing.objects.filter(alias='foo').exists()
    assert Reindexing.get_indices('foo') == ['foo']

    # Reindexing on 'foo'.
    Reindexing.objects.create(alias='foo', new_index='bar', old_index='baz')
    self.assertSetEqual(set(Reindexing.get_indices('foo')),
                        set(['bar', 'baz']))

    # Doesn't clash on other aliases.
    self.assertSetEqual(set(Reindexing.get_indices('other')), set(['other']))
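# A minimal sketch, inferred from the test above, of the classmethod it
# exercises (assumed to live on the Reindexing model): while an alias is
# flagged, writes should hit both the new and the old index, otherwise only
# the alias itself. The real implementation may differ.
@classmethod
def get_indices(cls, index):
    """Return the indexes to write to for `index` (an alias)."""
    try:
        reindex = cls.objects.get(alias=index)
        # Reindexing in progress: return both indexes so documents are
        # written to the old and the new one.
        return [reindex.new_index, reindex.old_index]
    except cls.DoesNotExist:
        return [index]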
def test_flag_reindexing(self):
    assert Reindexing.objects.count() == 0

    # Flagging for the first time.
    res = Reindexing.flag_reindexing('foo', 'bar', 'baz')
    eq_(Reindexing.objects.filter(alias='foo').count(), 1)
    eq_(res.alias, 'foo')
    eq_(res.old_index, 'bar')
    eq_(res.new_index, 'baz')

    # Flagging for the second time.
    res = Reindexing.flag_reindexing('foo', 'bar', 'baz')
    assert Reindexing.objects.filter(alias='foo').count() == 1
    assert res is None
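# A minimal sketch, inferred from the test above, of the flagging behaviour
# (assumed to live on the Reindexing model): the first call creates and
# returns a Reindexing row, a second call for an already-flagged alias is a
# no-op and returns None. The real implementation may differ.
@classmethod
def flag_reindexing(cls, alias, old_index, new_index):
    """Mark `alias` as being reindexed, unless it already is."""
    if cls.objects.filter(alias=alias).exists():
        # Already flagged: nothing to do.
        return None
    return cls.objects.create(alias=alias, old_index=old_index,
                              new_index=new_index)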
def unindexer(cls, ids=None, _all=False, index=None):
    """
    Empties an index, but doesn't delete it. Useful for tearDowns.

    ids -- list of IDs to unindex.
    _all -- unindex all objects.

    """
    if _all:
        # Mostly used for test tearDowns.
        qs = cls.get_model()
        if hasattr(qs, 'with_deleted'):
            qs = qs.with_deleted
        else:
            qs = qs.objects
        ids = list(qs.order_by('id').values_list('id', flat=True))

    if not ids:
        return

    log.info('Unindexing %s %s-%s. [%s]' %
             (cls.get_model()._meta.model_name, ids[0], ids[-1], len(ids)))

    index = index or cls.get_index()
    # Note: If reindexing is currently occurring, `get_indices` will return
    # more than one index.
    indices = Reindexing.get_indices(index)

    es = cls.get_es(urls=settings.ES_URLS)
    for id_ in ids:
        for idx in indices:
            try:
                cls.unindex(id_=id_, es=es, index=idx)
            except elasticsearch.exceptions.NotFoundError:
                # Ignore if it's not there.
                log.info(u'[%s:%s] object not found in index' %
                         (cls.get_model()._meta.model_name, id_))
def post_index(new_index, old_index, alias, index_name, settings):
    """
    Perform post-indexing tasks:

    * Optimize (which also does a refresh and a flush by default).
    * Update settings to reset number of replicas.
    * Point the alias to this new index.
    * Unflag the database.
    * Remove the old index.
    * Output the current alias configuration.

    """
    _print('Optimizing, updating settings and aliases.', alias)

    # Optimize.
    ES.indices.optimize(index=new_index)

    # Update the replicas.
    ES.indices.put_settings(index=new_index, body=settings)

    # Add and remove aliases.
    actions = [
        {'add': {'index': new_index, 'alias': alias}}
    ]
    if old_index:
        actions.append(
            {'remove': {'index': old_index, 'alias': alias}}
        )
    ES.indices.update_aliases(body=dict(actions=actions))

    _print('Unflagging the database.', alias)
    Reindexing.unflag_reindexing(alias=alias)

    _print('Removing index {index}.'.format(index=old_index), alias)
    if old_index and ES.indices.exists(index=old_index):
        ES.indices.delete(index=old_index)

    alias_output = ''
    for indexer in INDEXERS:
        alias = ES_INDEXES[indexer.get_mapping_type_name()]
        alias_output += unicode(ES.indices.get_aliases(index=alias)) + '\n'

    _print('Reindexation done. Current aliases configuration: '
           '{output}\n'.format(output=alias_output), alias)
def index(ids, indexer, **kw):
    """
    Given a list of IDs and an indexer, index into ES.

    If a reindexation is currently occurring, index on both the old and new.

    """
    task_log.info("Indexing {0} {1}-{2}. [{3}]".format(
        indexer.get_model()._meta.model_name, ids[0], ids[-1], len(ids)))

    # If reindexing is currently occurring, index on both old and new indexes.
    indices = Reindexing.get_indices(indexer.get_index())

    es = indexer.get_es(urls=settings.ES_URLS)
    for obj in indexer.get_indexable().filter(id__in=ids):
        doc = indexer.extract_document(obj.id, obj)
        for idx in indices:
            indexer.index(doc, id_=obj.id, es=es, index=idx)
def index_webapps(ids, **kw):
    """TODO: use search/indexers.py:index."""
    task_log.info('Indexing apps %s-%s. [%s]' % (ids[0], ids[-1], len(ids)))

    index = kw.pop('index', WebappIndexer.get_index())
    # Note: If reindexing is currently occurring, `get_indices` will return
    # more than one index.
    indices = Reindexing.get_indices(index)

    es = WebappIndexer.get_es(urls=settings.ES_URLS)
    qs = Webapp.indexing_transformer(Webapp.with_deleted.no_cache().filter(
        id__in=ids))
    for obj in qs:
        doc = WebappIndexer.extract_document(obj.id, obj)
        for idx in indices:
            WebappIndexer.index(doc, id_=obj.id, es=es, index=idx)
def index(ids, indexer, **kw):
    """
    Given a list of IDs and an indexer, index into ES.

    If a reindexation is currently occurring, index on both the old and new.

    """
    log.info('Indexing {0} {1}-{2}. [{3}]'.format(
        indexer.get_model()._meta.model_name, ids[0], ids[-1], len(ids)))

    # If reindexing is currently occurring, index on both old and new indexes.
    indices = Reindexing.get_indices(indexer.get_index())

    es = indexer.get_es(urls=settings.ES_URLS)
    for obj in indexer.get_indexable().filter(id__in=ids):
        doc = indexer.extract_document(obj.id, obj)
        for idx in indices:
            indexer.index(doc, id_=obj.id, es=es, index=idx)
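# A hypothetical usage sketch for the generic `index` helper above: chunk the
# ids so each call stays small, then index every chunk. The chunk size of 150
# and the choice of WebappIndexer here are illustrative assumptions only.
ids = list(WebappIndexer.get_indexable().values_list('id', flat=True))
for chunk in [ids[i:i + 150] for i in range(0, len(ids), 150)]:
    index(chunk, WebappIndexer)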
def unindex_webapps(ids, **kw):
    if not ids:
        return

    task_log.info('Un-indexing apps %s-%s. [%s]' % (ids[0], ids[-1],
                                                    len(ids)))

    index = kw.pop('index', WebappIndexer.get_index())
    # Note: If reindexing is currently occurring, `get_indices` will return
    # more than one index.
    indices = Reindexing.get_indices(index)

    es = WebappIndexer.get_es(urls=settings.ES_URLS)
    for id_ in ids:
        for idx in indices:
            try:
                WebappIndexer.unindex(id_=id_, es=es, index=idx)
            except ElasticHttpNotFoundError:
                # Ignore if it's not there.
                task_log.info(
                    u'[Webapp:%s] Unindexing app but not found in index'
                    % id_)
def unindexer(cls, ids=None, _all=False, index=None):
    """
    Empties an index, but doesn't delete it. Useful for tearDowns.

    ids -- list of IDs to unindex.
    _all -- unindex all objects.

    """
    if _all:
        # Mostly used for test tearDowns.
        qs = cls.get_model()
        if hasattr(qs, 'with_deleted'):
            qs = qs.with_deleted
        else:
            qs = qs.objects
        ids = list(qs.order_by('id').values_list('id', flat=True))

    if not ids:
        return

    task_log.info('Unindexing %s %s-%s. [%s]' %
                  (cls.get_model()._meta.model_name, ids[0], ids[-1],
                   len(ids)))

    index = index or cls.get_index()
    # Note: If reindexing is currently occurring, `get_indices` will return
    # more than one index.
    indices = Reindexing.get_indices(index)

    es = cls.get_es(urls=settings.ES_URLS)
    for id_ in ids:
        for idx in indices:
            try:
                cls.unindex(id_=id_, es=es, index=idx)
            except elasticsearch.exceptions.NotFoundError:
                # Ignore if it's not there.
                task_log.info(u'[%s:%s] object not found in index' %
                              (cls.get_model()._meta.model_name, id_))
def test_unflag_reindexing(self):
    assert Reindexing.objects.filter(alias='foo').count() == 0

    # Unflagging an unflagged database does nothing.
    Reindexing.unflag_reindexing(alias='foo')
    assert Reindexing.objects.filter(alias='foo').count() == 0

    # Flag, then unflag.
    Reindexing.objects.create(alias='foo', new_index='bar', old_index='baz')
    assert Reindexing.objects.filter(alias='foo').count() == 1
    Reindexing.unflag_reindexing(alias='foo')
    assert Reindexing.objects.filter(alias='foo').count() == 0

    # Unflagging another alias doesn't clash.
    Reindexing.objects.create(alias='bar', new_index='bar', old_index='baz')
    Reindexing.unflag_reindexing(alias='foo')
    assert Reindexing.objects.filter(alias='bar').count() == 1
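# A minimal sketch, inferred from the test above, of the unflagging behaviour
# (assumed to live on the Reindexing model): delete the Reindexing rows for
# one alias, or all of them when no alias is given, as the management
# commands do. The real implementation may differ.
@classmethod
def unflag_reindexing(cls, alias=None):
    """Remove the reindexing flag, optionally only for `alias`."""
    qs = cls.objects.all()
    if alias is not None:
        qs = qs.filter(alias=alias)
    qs.delete()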
def handle(self, *args, **kwargs):
    """Set up reindexing tasks.

    Creates a Tasktree that creates a new index and indexes all objects,
    then points the alias to this new index when finished.
    """
    index_choice = kwargs.get('index', None)
    prefix = kwargs.get('prefix', '')
    force = kwargs.get('force', False)

    if index_choice:
        # If we only want to reindex a subset of indexes.
        INDEXES = INDEX_CHOICES.get(index_choice, None)
        if INDEXES is None:
            raise CommandError('Incorrect index name specified. '
                               'Choose one of: %s'
                               % ', '.join(INDEX_CHOICES.keys()))
    else:
        INDEXES = INDEXERS

    if Reindexing.is_reindexing() and not force:
        raise CommandError('Indexation already occurring - use --force to '
                           'bypass')
    elif force:
        Reindexing.unflag_reindexing()

    for INDEXER in INDEXES:
        index_name = INDEXER.get_mapping_type_name()
        chunk_size = INDEXER.chunk_size
        alias = ES_INDEXES[index_name]

        chunks, total = chunk_indexing(INDEXER, chunk_size)
        if not total:
            _print('No items to queue.', alias)
        else:
            total_chunks = int(ceil(total / float(chunk_size)))
            _print('Indexing {total} items into {n} chunks of size {size}'
                   .format(total=total, n=total_chunks, size=chunk_size),
                   alias)

        # Get the old index if it exists.
        try:
            aliases = ES.indices.get_alias(name=alias).keys()
        except elasticsearch.NotFoundError:
            aliases = []
        old_index = aliases[0] if aliases else None

        # Create a new index, using the index name with a timestamp.
        new_index = timestamp_index(prefix + alias)

        # See how the index is currently configured.
        if old_index:
            try:
                s = (ES.indices.get_settings(index=old_index).get(
                    old_index, {}).get('settings', {}))
            except elasticsearch.NotFoundError:
                s = {}
        else:
            s = {}

        num_replicas = s.get('number_of_replicas',
                             settings.ES_DEFAULT_NUM_REPLICAS)
        num_shards = s.get('number_of_shards',
                           settings.ES_DEFAULT_NUM_SHARDS)

        pre_task = pre_index.si(new_index, old_index, alias, index_name, {
            'analysis': INDEXER.get_analysis(),
            'number_of_replicas': 0,
            'number_of_shards': num_shards,
            'store.compress.tv': True,
            'store.compress.stored': True,
            'refresh_interval': '-1'})
        post_task = post_index.si(new_index, old_index, alias, index_name, {
            'number_of_replicas': num_replicas,
            'refresh_interval': '5s'})

        # Ship it.
        if not total:
            # If there's no data we still create the index and alias.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [run_indexing.si(new_index, index_name, chunk)
                           for chunk in chunks]
            if settings.CELERY_ALWAYS_EAGER:
                # Eager mode and chords don't get along. So we serialize
                # the tasks as a workaround.
                index_tasks.insert(0, pre_task)
                index_tasks.append(post_task)
                chain(*index_tasks).apply_async()
            else:
                chain(pre_task,
                      chord(header=index_tasks,
                            body=post_task)).apply_async()

    _print('New index and indexing tasks all queued up.')
def flag_database(new_index, old_index, alias):
    """Flags the database to indicate that the reindexing has started."""
    sys.stdout.write('Flagging the database to start the reindexation\n')
    Reindexing.flag_reindexing(new_index=new_index, old_index=old_index,
                               alias=alias)
    time.sleep(5)  # Give celeryd some time to flag the DB.
def unflag_database():
    """Unflag the database to indicate that the reindexing is over."""
    sys.stdout.write('Unflagging the database\n')
    Reindexing.unflag_reindexing()
def test_is_reindexing(self):
    assert not Reindexing.is_reindexing()
    Reindexing.objects.create(alias='foo', new_index='bar', old_index='baz')
    assert Reindexing.is_reindexing()
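# A minimal sketch, inferred from the test above (assumed to live on the
# Reindexing model): the site counts as "reindexing" whenever any Reindexing
# row exists. The real implementation may differ.
@classmethod
def is_reindexing(cls):
    """Return True if a reindexation is currently in progress."""
    return cls.objects.exists()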
def handle(self, *args, **kwargs):
    """Set up reindexing tasks.

    Creates a Tasktree that creates a new index and indexes all objects,
    then points the alias to this new index when finished.
    """
    global INDEXES

    index_choice = kwargs.get('index', None)
    prefix = kwargs.get('prefix', '')
    force = kwargs.get('force', False)

    if index_choice:
        # If we only want to reindex a subset of indexes.
        INDEXES = INDEX_DICT.get(index_choice, INDEXES)

    if Reindexing.is_reindexing() and not force:
        raise CommandError('Indexation already occurring - use --force to '
                           'bypass')
    elif force:
        Reindexing.unflag_reindexing()

    for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:

        chunks, total = chunk_indexing(INDEXER, CHUNK_SIZE)
        if not total:
            _print('No items to queue.', ALIAS)
        else:
            total_chunks = int(ceil(total / float(CHUNK_SIZE)))
            _print('Indexing {total} items into {n} chunks of size {size}'
                   .format(total=total, n=total_chunks, size=CHUNK_SIZE),
                   ALIAS)

        # Get the old index if it exists.
        try:
            aliases = ES.indices.get_alias(name=ALIAS).keys()
        except elasticsearch.NotFoundError:
            aliases = []
        old_index = aliases[0] if aliases else None

        # Create a new index, using the index name with a timestamp.
        new_index = timestamp_index(prefix + ALIAS)

        # See how the index is currently configured.
        if old_index:
            try:
                s = (ES.indices.get_settings(index=old_index).get(
                    old_index, {}).get('settings', {}))
            except elasticsearch.NotFoundError:
                s = {}
        else:
            s = {}

        num_replicas = s.get('number_of_replicas',
                             settings.ES_DEFAULT_NUM_REPLICAS)
        num_shards = s.get('number_of_shards',
                           settings.ES_DEFAULT_NUM_SHARDS)

        pre_task = pre_index.si(new_index, old_index, ALIAS, INDEXER, {
            'analysis': INDEXER.get_analysis(),
            'number_of_replicas': 0,
            'number_of_shards': num_shards,
            'store.compress.tv': True,
            'store.compress.stored': True,
            'refresh_interval': '-1'})
        post_task = post_index.si(new_index, old_index, ALIAS, INDEXER, {
            'number_of_replicas': num_replicas,
            'refresh_interval': '5s'})

        # Ship it.
        if not total:
            # If there's no data we still create the index and alias.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [run_indexing.si(new_index, INDEXER, chunk)
                           for chunk in chunks]
            chain(pre_task,
                  chord(header=index_tasks, body=post_task)).apply_async()

    _print('New index and indexing tasks all queued up.')
def handle(self, *args, **kwargs):
    """Set up reindexing tasks.

    Creates a Tasktree that creates a new index and indexes all objects,
    then points the alias to this new index when finished.
    """
    global INDEXES

    index_choice = kwargs.get('index', None)
    prefix = kwargs.get('prefix', '')
    force = kwargs.get('force', False)

    if index_choice:
        # If we only want to reindex a subset of indexes.
        INDEXES = INDEX_DICT.get(index_choice, INDEXES)

    if Reindexing.is_reindexing() and not force:
        raise CommandError('Indexation already occurring - use --force to '
                           'bypass')
    elif force:
        unflag_database()

    chain = None
    old_indexes = []
    for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:
        # Get the old index if it exists.
        try:
            aliases = ES.indices.get_alias(name=ALIAS).keys()
        except elasticsearch.NotFoundError:
            aliases = []
        old_index = aliases[0] if aliases else None
        old_indexes.append(old_index)

        # Create a new index, using the index name with a timestamp.
        new_index = timestamp_index(prefix + ALIAS)

        # See how the index is currently configured.
        if old_index:
            try:
                s = (ES.indices.get_settings(index=old_index).get(
                    old_index, {}).get('settings', {}))
            except elasticsearch.NotFoundError:
                s = {}
        else:
            s = {}

        num_replicas = s.get('number_of_replicas',
                             settings.ES_DEFAULT_NUM_REPLICAS)
        num_shards = s.get('number_of_shards',
                           settings.ES_DEFAULT_NUM_SHARDS)

        # Flag the database to mark as currently indexing.
        if not chain:
            chain = flag_database.si(new_index, old_index, ALIAS)
        else:
            chain |= flag_database.si(new_index, old_index, ALIAS)

        # Create the indexes and mappings.
        # Note: We set num_replicas=0 here to lower load while re-indexing.
        # In a later step we increase it, which results in a more efficient
        # bulk copy in ES. For ES < 0.90 we manually enable compression.
        chain |= create_index.si(new_index, ALIAS, INDEXER, {
            'analysis': INDEXER.get_analysis(),
            'number_of_replicas': 0,
            'number_of_shards': num_shards,
            'store.compress.tv': True,
            'store.compress.stored': True,
            'refresh_interval': '-1'})

        # Index all the things!
        chain |= run_indexing.si(new_index, INDEXER, CHUNK_SIZE)

        # After indexing we optimize the index, adjust settings, and point
        # the alias to the new index.
        chain |= update_alias.si(new_index, old_index, ALIAS, {
            'number_of_replicas': num_replicas,
            'refresh_interval': '5s'})

    # Unflag the database to mark as done indexing.
    chain |= unflag_database.si()

    # Delete the old index, if any.
    for old_index in old_indexes:
        if old_index:
            chain |= delete_index.si(old_index)

    # All done!
    chain |= output_summary.si()

    # Ship it.
    self.stdout.write('\nNew index and indexing tasks all queued up.\n')
    os.environ['FORCE_INDEXING'] = '1'
    try:
        chain.apply_async()
    finally:
        del os.environ['FORCE_INDEXING']