Example No. 1
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a pipeline of Celery tasks that creates a new index and
        indexes all objects, then points the alias to the new index when
        finished.
        """
        if not settings.MARKETPLACE:
            raise CommandError('This command affects only the Marketplace and '
                               'should be run under Marketplace settings.')

        force = kwargs.get('force', False)
        prefix = kwargs.get('prefix', '')

        if database_flagged() and not force:
            raise CommandError('Indexation already occurring - use --force to '
                               'bypass')
        elif force:
            unflag_database()

        # The list of indexes that are currently aliased by `ALIAS`.
        try:
            aliases = ES.aliases(ALIAS).keys()
        except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
            aliases = []
        old_index = aliases[0] if aliases else None
        # Create a new index, using the index name with a timestamp.
        new_index = timestamp_index(prefix + ALIAS)

        # See how the index is currently configured.
        if old_index:
            try:
                s = (ES.get_settings(old_index).get(old_index, {})
                                               .get('settings', {}))
            except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
                s = {}
        else:
            s = {}

        num_replicas = s.get('number_of_replicas',
                             settings.ES_DEFAULT_NUM_REPLICAS)
        num_shards = s.get('number_of_shards', settings.ES_DEFAULT_NUM_SHARDS)

        # Flag the database.
        chain = flag_database.si(new_index, old_index, ALIAS)

        # Create the index and mapping.
        #
        # Note: We set num_replicas=0 here to decrease load while re-indexing.
        # In a later step we increase it which results in a more efficient bulk
        # copy in Elasticsearch.
        # For ES < 0.90 we manually enable compression.
        chain |= create_index.si(new_index, ALIAS, {
            'number_of_replicas': 0, 'number_of_shards': num_shards,
            'store.compress.tv': True, 'store.compress.stored': True,
            'refresh_interval': '-1'})

        # Index all the things!
        chain |= run_indexing.si(new_index)

        # After indexing we optimize the index, adjust settings, and point the
        # alias to the new index.
        chain |= update_alias.si(new_index, old_index, ALIAS, {
            'number_of_replicas': num_replicas, 'refresh_interval': '5s'})

        # Unflag the database.
        chain |= unflag_database.si()

        # Delete the old index, if any.
        if old_index:
            chain |= delete_index.si(old_index)

        chain |= output_summary.si()

        self.stdout.write('\nNew index and indexing tasks all queued up.\n')
        os.environ['FORCE_INDEXING'] = '1'
        try:
            chain.apply_async()
        finally:
            del os.environ['FORCE_INDEXING']
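
All of these commands rely on a `timestamp_index` helper to derive a unique
index name from the alias. Its implementation is not shown in these examples;
the following is only a minimal sketch of what such a helper might look like
(the exact suffix format is an assumption):

    from datetime import datetime

    def timestamp_index(index):
        # Sketch: suffix the alias with a timestamp so every reindex gets a
        # fresh, uniquely named index (e.g. 'addons-20240101120000'). The
        # real helper lives elsewhere in the project and may format the
        # suffix differently.
        return '%s-%s' % (index, datetime.now().strftime('%Y%m%d%H%M%S'))
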
Example No. 2
    def handle(self, *args, **kwargs):
        """Reindexing work.

        Creates a TaskTree that builds new indexes alongside the old ones so
        that the search feature keeps working while reindexing takes place.
        """
        if not django_settings.MARKETPLACE:
            raise CommandError('This command affects both the Marketplace '
                               'and AMO ES storage, but it can only be run '
                               'from the Marketplace.')

        force = kwargs.get('force', False)

        if database_flagged() and not force:
            raise CommandError('Indexation already occurring - use --force to '
                               'bypass')

        prefix = kwargs.get('prefix', '')
        log('Starting the reindexation')

        if kwargs.get('wipe', False):
            confirm = raw_input("Are you sure you want to wipe all data from "
                                "ES ? (yes/no): ")

            while confirm not in ('yes', 'no'):
                confirm = raw_input('Please enter either "yes" or "no": ')

            if confirm == 'yes':
                unflag_database()
                requests.delete(url('/'))
            else:
                raise CommandError("Aborted.")
        elif force:
            unflag_database()

        # Get the list of current aliases from /_aliases.
        all_aliases = requests.get(url('/_aliases')).json()

        # building the list of indexes
        indexes = set([prefix + index for index in
                       _ALIASES.values()])

        actions = []

        def add_action(*elmt):
            if elmt in actions:
                return
            actions.append(elmt)

        all_aliases = all_aliases.items()

        # creating a task tree
        log('Building the task tree')
        tree = TaskTree()
        last_action = None

        to_remove = []

        # for each index, we create a new time-stamped index
        for alias in indexes:
            is_stats = 'stats' in alias
            old_index = None

            for aliased_index, alias_ in all_aliases:
                if alias in alias_['aliases'].keys():
                    # mark the index to be removed later
                    old_index = aliased_index
                    to_remove.append(aliased_index)

                    # mark the alias to be removed as well
                    add_action('remove', aliased_index, alias)

            # create a new index, using the alias name with a timestamp
            new_index = timestamp_index(alias)

            # If old_index is None, this may be a full (non-aliased) index.
            # In that case we want to keep indexing into it.
            future_alias = url('/%s' % alias)
            if requests.head(future_alias).status_code == 200:
                old_index = alias

            # flag the database
            step1 = tree.add_task(flag_database, args=[new_index, old_index,
                                                       alias])
            step2 = step1.add_task(create_mapping, args=[new_index, alias])
            step3 = step2.add_task(create_index, args=[new_index, is_stats])
            last_action = step3

            # adding new index to the alias
            add_action('add', new_index, alias)

        # Alias the new index and remove the old aliases, if any.
        renaming_step = last_action.add_task(run_aliases_actions,
                                             args=[actions])

        # unflag the database - there's no need to duplicate the
        # indexing anymore
        delete = renaming_step.add_task(unflag_database)

        # Delete the old indexes, if any
        delete.add_task(delete_indexes, args=[to_remove])

        # let's do it
        log('Running all indexation tasks')

        os.environ['FORCE_INDEXING'] = '1'
        try:
            tree.apply_async()
            time.sleep(10)   # give celeryd some time to flag the DB
            while database_flagged():
                sys.stdout.write('.')
                sys.stdout.flush()
                time.sleep(5)
        finally:
            del os.environ['FORCE_INDEXING']

        sys.stdout.write('\n')

        # let's return the /_aliases values
        aliases = call_es('_aliases').json()
        aliases = json.dumps(aliases, sort_keys=True, indent=4)
        return _SUMMARY % (len(indexes), aliases)
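
The `actions` list built above collects ('add'/'remove', index, alias) tuples
that `run_aliases_actions` presumably replays against the Elasticsearch
`/_aliases` endpoint, which applies a batch of alias operations atomically. A
rough sketch of that translation (the real task body is not shown here and may
do more):

    import json
    import requests

    def run_aliases_actions_sketch(actions, es_url='http://localhost:9200'):
        # Turn ('add'|'remove', index, alias) tuples into one atomic
        # /_aliases call. Sketch only: the real task may also handle errors,
        # retries and logging.
        body = {'actions': [{op: {'index': index, 'alias': alias}}
                            for op, index, alias in actions]}
        resp = requests.post('%s/_aliases' % es_url, data=json.dumps(body))
        resp.raise_for_status()
        return resp.json()
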
Example No. 3
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a pipeline of Celery tasks that creates a new index and
        indexes all objects, then points the alias to the new index when
        finished.
        """
        if not settings.MARKETPLACE:
            raise CommandError('This command affects only the Marketplace and '
                               'should be run under Marketplace settings.')

        force = kwargs.get('force', False)
        prefix = kwargs.get('prefix', '')

        if database_flagged() and not force:
            raise CommandError('Indexation already occurring - use --force to '
                               'bypass')
        elif force:
            unflag_database()

        # The list of indexes that are currently aliased by `ALIAS`.
        try:
            aliases = ES.aliases(ALIAS).keys()
        except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
            aliases = []
        old_index = aliases[0] if aliases else None
        # Create a new index, using the index name with a timestamp.
        new_index = timestamp_index(prefix + ALIAS)

        # See how the index is currently configured.
        if old_index:
            try:
                s = (ES.get_settings(old_index).get(old_index,
                                                    {}).get('settings', {}))
            except pyelasticsearch.exceptions.ElasticHttpNotFoundError:
                s = {}
        else:
            s = {}

        num_replicas = s.get('number_of_replicas',
                             settings.ES_DEFAULT_NUM_REPLICAS)
        num_shards = s.get('number_of_shards', settings.ES_DEFAULT_NUM_SHARDS)

        # Flag the database.
        chain = flag_database.si(new_index, old_index, ALIAS)

        # Create the index and mapping.
        #
        # Note: We set num_replicas=0 here to decrease load while re-indexing.
        # In a later step we increase it which results in a more efficient bulk
        # copy in Elasticsearch.
        # For ES < 0.90 we manually enable compression.
        chain |= create_index.si(
            new_index, ALIAS, {
                'analysis': WebappIndexer.get_analysis(),
                'number_of_replicas': 0,
                'number_of_shards': num_shards,
                'store.compress.tv': True,
                'store.compress.stored': True,
                'refresh_interval': '-1'
            })

        # Index all the things!
        chain |= run_indexing.si(new_index)

        # After indexing we optimize the index, adjust settings, and point the
        # alias to the new index.
        chain |= update_alias.si(new_index, old_index, ALIAS, {
            'number_of_replicas': num_replicas,
            'refresh_interval': '5s'
        })

        # Unflag the database.
        chain |= unflag_database.si()

        # Delete the old index, if any.
        if old_index:
            chain |= delete_index.si(old_index)

        chain |= output_summary.si()

        self.stdout.write('\nNew index and indexing tasks all queued up.\n')
        os.environ['FORCE_INDEXING'] = '1'
        try:
            chain.apply_async()
        finally:
            del os.environ['FORCE_INDEXING']
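
The `.si()` calls above create immutable Celery signatures: each task runs
with exactly the arguments it was given and ignores the return value of the
previous step, which is what lets the command compose the steps with `|` into
one linear chain. A toy illustration of the same pattern (the task names below
are made up):

    from celery import Celery

    app = Celery('reindex_sketch')

    @app.task
    def create(index):
        print('create %s' % index)

    @app.task
    def index_all(index):
        print('index everything into %s' % index)

    @app.task
    def swap_alias(index, alias):
        print('point %s at %s' % (alias, index))

    # Immutable signatures chained with `|` run in order, each with its own
    # fixed arguments.
    workflow = (create.si('addons-new') | index_all.si('addons-new') |
                swap_alias.si('addons-new', 'addons'))
    # .apply() runs the chain eagerly in-process for illustration; the real
    # command queues it with .apply_async() instead.
    workflow.apply()
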
Example No. 4
    def handle(self, *args, **kwargs):
        """Reindexing work.

        Creates a TaskTree that builds new indexes alongside the old ones so
        that the search feature keeps working while reindexing takes place.
        """
        if not django_settings.MARKETPLACE:
            raise CommandError('This command affects both the Marketplace '
                               'and AMO ES storage, but it can only be run '
                               'from the Marketplace.')

        force = kwargs.get('force', False)

        if database_flagged() and not force:
            raise CommandError('Indexation already occurring - use --force to '
                               'bypass')

        prefix = kwargs.get('prefix', '')
        log('Starting the reindexation')

        if kwargs.get('wipe', False):
            confirm = raw_input("Are you sure you want to wipe all data from "
                                "ES? (yes/no): ")

            while confirm not in ('yes', 'no'):
                confirm = raw_input('Please enter either "yes" or "no": ')

            if confirm == 'yes':
                unflag_database()
                requests.delete(url('/'))
            else:
                raise CommandError("Aborted.")
        elif force:
            unflag_database()

        # Get the list of current aliases from /_aliases.
        all_aliases = requests.get(url('/_aliases')).json()

        # building the list of indexes
        indexes = set(
            [prefix + index for index in django_settings.ES_INDEXES.values()])

        actions = []

        def add_action(*elmt):
            if elmt in actions:
                return
            actions.append(elmt)

        all_aliases = all_aliases.items()

        # creating a task tree
        log('Building the task tree')
        tree = TaskTree()
        last_action = None

        to_remove = []

        # for each index, we create a new time-stamped index
        for alias in indexes:
            is_stats = 'stats' in alias
            old_index = None

            for aliased_index, alias_ in all_aliases:
                if alias in alias_['aliases'].keys():
                    # mark the index to be removed later
                    old_index = aliased_index
                    to_remove.append(aliased_index)

                    # mark the alias to be removed as well
                    add_action('remove', aliased_index, alias)

            # create a new index, using the alias name with a timestamp
            new_index = timestamp_index(alias)

            # If old_index is None, this may be a full (non-aliased) index.
            # In that case we want to keep indexing into it.
            future_alias = url('/%s' % alias)
            if requests.head(future_alias).status_code == 200:
                old_index = alias

            # flag the database
            step1 = tree.add_task(flag_database,
                                  args=[new_index, old_index, alias])
            step2 = step1.add_task(create_mapping, args=[new_index, alias])
            step3 = step2.add_task(create_index, args=[new_index, is_stats])
            last_action = step3

            # adding new index to the alias
            add_action('add', new_index, alias)

        # Alias the new index and remove the old aliases, if any.
        renaming_step = last_action.add_task(run_aliases_actions,
                                             args=[actions])

        # unflag the database - there's no need to duplicate the
        # indexing anymore
        delete = renaming_step.add_task(unflag_database)

        # Delete the old indexes, if any
        delete.add_task(delete_indexes, args=[to_remove])

        # let's do it
        log('Running all indexation tasks')

        os.environ['FORCE_INDEXING'] = '1'
        try:
            tree.apply_async()
            time.sleep(10)  # give celeryd some time to flag the DB
            while database_flagged():
                sys.stdout.write('.')
                sys.stdout.flush()
                time.sleep(5)
        finally:
            del os.environ['FORCE_INDEXING']

        sys.stdout.write('\n')

        # let's return the /_aliases values
        aliases = call_es('_aliases').json()
        aliases = json.dumps(aliases, sort_keys=True, indent=4)
        return _SUMMARY % (len(indexes), aliases)
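
The `flag_database` / `unflag_database` / `database_flagged` helpers used in
these commands record in the database that a reindex is in progress, so the
rest of the application knows to keep both the old and the new indexes up to
date in the meantime. Their implementation is not part of these examples; a
minimal sketch of the idea, assuming a simple bookkeeping model (the model and
field names here are assumptions):

    from django.db import models

    class Reindexing(models.Model):
        # One row per alias currently being reindexed (sketch only).
        alias = models.CharField(max_length=255)
        old_index = models.CharField(max_length=255, null=True)
        new_index = models.CharField(max_length=255)
        start_date = models.DateTimeField(auto_now_add=True)

    def database_flagged():
        # A reindex is "in progress" as long as any row exists.
        return Reindexing.objects.exists()

    def flag_database(new_index, old_index, alias):
        return Reindexing.objects.create(new_index=new_index,
                                         old_index=old_index, alias=alias)

    def unflag_database():
        Reindexing.objects.all().delete()

In Examples 1, 3 and 6 the same helpers are additionally wrapped as Celery
tasks so they can be placed inside the task chain itself.
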
Example No. 5
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a pipeline of Celery tasks that creates a new index and
        indexes all objects, then points the alias to the new index when
        finished.
        """
        global INDEXES

        index_choice = kwargs.get('index', None)
        prefix = kwargs.get('prefix', '')
        force = kwargs.get('force', False)

        if index_choice:
            # If we only want to reindex a subset of indexes.
            INDEXES = INDEX_DICT.get(index_choice, INDEXES)

        if Reindexing.is_reindexing() and not force:
            raise CommandError('Indexation already occurring - use --force to '
                               'bypass')
        elif force:
            Reindexing.unflag_reindexing()

        for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:

            chunks, total = chunk_indexing(INDEXER, CHUNK_SIZE)
            if not total:
                _print('No items to queue.', ALIAS)
            else:
                total_chunks = int(ceil(total / float(CHUNK_SIZE)))
                _print('Indexing {total} items into {n} chunks of size {size}'
                       .format(total=total, n=total_chunks, size=CHUNK_SIZE),
                       ALIAS)

            # Get the old index if it exists.
            try:
                aliases = ES.indices.get_alias(name=ALIAS).keys()
            except elasticsearch.NotFoundError:
                aliases = []
            old_index = aliases[0] if aliases else None

            # Create a new index, using the index name with a timestamp.
            new_index = timestamp_index(prefix + ALIAS)

            # See how the index is currently configured.
            if old_index:
                try:
                    s = (ES.indices.get_settings(index=old_index).get(
                        old_index, {}).get('settings', {}))
                except elasticsearch.NotFoundError:
                    s = {}
            else:
                s = {}
            num_replicas = s.get('number_of_replicas',
                                 settings.ES_DEFAULT_NUM_REPLICAS)
            num_shards = s.get('number_of_shards',
                               settings.ES_DEFAULT_NUM_SHARDS)

            pre_task = pre_index.si(new_index, old_index, ALIAS, INDEXER, {
                'analysis': INDEXER.get_analysis(),
                'number_of_replicas': 0,
                'number_of_shards': num_shards,
                'store.compress.tv': True,
                'store.compress.stored': True,
                'refresh_interval': '-1'})
            post_task = post_index.si(new_index, old_index, ALIAS, INDEXER, {
                'number_of_replicas': num_replicas,
                'refresh_interval': '5s'})

            # Ship it.
            if not total:
                # If there's no data we still create the index and alias.
                chain(pre_task, post_task).apply_async()
            else:
                index_tasks = [run_indexing.si(new_index, INDEXER, chunk)
                               for chunk in chunks]

                if settings.CELERY_ALWAYS_EAGER:
                    # Eager mode and chords don't get along. So we serialize
                    # the tasks as a workaround.
                    index_tasks.insert(0, pre_task)
                    index_tasks.append(post_task)
                    chain(*index_tasks).apply_async()
                else:
                    chain(pre_task, chord(header=index_tasks,
                                          body=post_task)).apply_async()

        _print('New index and indexing tasks all queued up.')
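
The chord used above fans the per-chunk `run_indexing` tasks out in parallel
and only runs `post_index` once every chunk has completed; since chords do not
behave well under `CELERY_ALWAYS_EAGER`, the command falls back to a plain
serialized chain in that mode. A toy sketch of the two shapes (the task bodies
below are placeholders):

    from celery import Celery, chain, chord

    app = Celery('chunk_sketch')

    @app.task
    def pre():
        print('create index, replicas=0')

    @app.task
    def index_chunk(chunk):
        print('index ids %s' % (chunk,))

    @app.task
    def post(results=None):
        print('restore replicas, swap alias')

    chunks = [[1, 2, 3], [4, 5, 6], [7, 8]]
    index_tasks = [index_chunk.si(chunk) for chunk in chunks]

    # Normal mode: fan the chunks out, run `post` once they have all finished.
    workflow = chain(pre.si(), chord(header=index_tasks, body=post.si()))

    # Eager mode: chords are unreliable, so run every step sequentially.
    eager_tasks = [pre.si()] + index_tasks + [post.si()]
    eager_workflow = chain(*eager_tasks)
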
Example No. 6
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a pipeline of Celery tasks that creates a new index and
        indexes all objects, then points the alias to the new index when
        finished.
        """
        global INDEXES

        index_choice = kwargs.get('index', None)
        prefix = kwargs.get('prefix', '')
        force = kwargs.get('force', False)

        if index_choice:
            # If we only want to reindex a subset of indexes.
            INDEXES = INDEX_DICT.get(index_choice, INDEXES)

        if Reindexing.is_reindexing() and not force:
            raise CommandError('Indexation already occurring - use --force to '
                               'bypass')
        elif force:
            unflag_database()

        chain = None
        old_indexes = []
        for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:
            # Get the old index if it exists.
            try:
                aliases = ES.indices.get_alias(name=ALIAS).keys()
            except elasticsearch.NotFoundError:
                aliases = []
            old_index = aliases[0] if aliases else None
            old_indexes.append(old_index)

            # Create a new index, using the index name with a timestamp.
            new_index = timestamp_index(prefix + ALIAS)

            # See how the index is currently configured.
            if old_index:
                try:
                    s = (ES.indices.get_settings(index=old_index).get(
                        old_index, {}).get('settings', {}))
                except elasticsearch.NotFoundError:
                    s = {}
            else:
                s = {}
            num_replicas = s.get('number_of_replicas',
                                 settings.ES_DEFAULT_NUM_REPLICAS)
            num_shards = s.get('number_of_shards',
                               settings.ES_DEFAULT_NUM_SHARDS)

            # Flag the database to mark as currently indexing.
            if not chain:
                chain = flag_database.si(new_index, old_index, ALIAS)
            else:
                chain |= flag_database.si(new_index, old_index, ALIAS)

            # Create the indexes and mappings.
            # Note: We set num_replicas=0 here to lower load while re-indexing.
            # In a later step we increase it, which results in a more
            # efficient bulk copy in ES. For ES < 0.90 we manually enable
            # compression.
            chain |= create_index.si(new_index, ALIAS, INDEXER, {
                'analysis': INDEXER.get_analysis(),
                'number_of_replicas': 0, 'number_of_shards': num_shards,
                'store.compress.tv': True, 'store.compress.stored': True,
                'refresh_interval': '-1'})

            # Index all the things!
            chain |= run_indexing.si(new_index, INDEXER, CHUNK_SIZE)

            # After indexing we optimize the index, adjust settings, and point
            # the alias to the new index.
            chain |= update_alias.si(new_index, old_index, ALIAS, {
                'number_of_replicas': num_replicas, 'refresh_interval': '5s'})

        # Unflag the database to mark as done indexing.
        chain |= unflag_database.si()

        # Delete the old index, if any.
        for old_index in old_indexes:
            if old_index:
                chain |= delete_index.si(old_index)

        # All done!
        chain |= output_summary.si()

        # Ship it.
        self.stdout.write('\nNew index and indexing tasks all queued up.\n')
        os.environ['FORCE_INDEXING'] = '1'
        try:
            chain.apply_async()
        finally:
            del os.environ['FORCE_INDEXING']
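
The `update_alias` step that closes each chain restores the replica count and
refresh interval and then repoints the alias at the new index. With the
`elasticsearch-py` client used in these examples, that roughly amounts to the
following (the real task is defined elsewhere and may also optimize the index
first):

    def update_alias_sketch(es, new_index, old_index, alias, index_settings):
        # Sketch of the post-indexing step, where `es` is an
        # elasticsearch.Elasticsearch client.
        # Bring replicas and the refresh interval back to normal values.
        es.indices.put_settings(index=new_index,
                                body={'index': index_settings})

        # Swap the alias from the old index to the new one in a single,
        # atomic update.
        actions = [{'add': {'index': new_index, 'alias': alias}}]
        if old_index:
            actions.insert(0,
                           {'remove': {'index': old_index, 'alias': alias}})
        es.indices.update_aliases(body={'actions': actions})
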
Example No. 7
    def handle(self, *args, **kwargs):
        """Set up reindexing tasks.

        Creates a pipeline of Celery tasks that creates a new index and
        indexes all objects, then points the alias to the new index when
        finished.
        """
        global INDEXES

        index_choice = kwargs.get('index', None)
        prefix = kwargs.get('prefix', '')
        force = kwargs.get('force', False)

        if index_choice:
            # If we only want to reindex a subset of indexes.
            INDEXES = INDEX_DICT.get(index_choice, INDEXES)

        if Reindexing.is_reindexing() and not force:
            raise CommandError('Indexation already occurring - use --force to '
                               'bypass')
        elif force:
            Reindexing.unflag_reindexing()

        for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:

            chunks, total = chunk_indexing(INDEXER, CHUNK_SIZE)
            if not total:
                _print('No items to queue.', ALIAS)
            else:
                total_chunks = int(ceil(total / float(CHUNK_SIZE)))
                _print('Indexing {total} items into {n} chunks of size {size}'
                       .format(total=total, n=total_chunks, size=CHUNK_SIZE),
                       ALIAS)

            # Get the old index if it exists.
            try:
                aliases = ES.indices.get_alias(name=ALIAS).keys()
            except elasticsearch.NotFoundError:
                aliases = []
            old_index = aliases[0] if aliases else None

            # Create a new index, using the index name with a timestamp.
            new_index = timestamp_index(prefix + ALIAS)

            # See how the index is currently configured.
            if old_index:
                try:
                    s = (ES.indices.get_settings(index=old_index).get(
                        old_index, {}).get('settings', {}))
                except elasticsearch.NotFoundError:
                    s = {}
            else:
                s = {}
            num_replicas = s.get('number_of_replicas',
                                 settings.ES_DEFAULT_NUM_REPLICAS)
            num_shards = s.get('number_of_shards',
                               settings.ES_DEFAULT_NUM_SHARDS)

            pre_task = pre_index.si(
                new_index, old_index, ALIAS, INDEXER, {
                    'analysis': INDEXER.get_analysis(),
                    'number_of_replicas': 0,
                    'number_of_shards': num_shards,
                    'store.compress.tv': True,
                    'store.compress.stored': True,
                    'refresh_interval': '-1'
                })
            post_task = post_index.si(new_index, old_index, ALIAS, INDEXER, {
                'number_of_replicas': num_replicas,
                'refresh_interval': '5s'
            })

            # Ship it.
            if not total:
                # If there's no data we still create the index and alias.
                chain(pre_task, post_task).apply_async()
            else:
                index_tasks = [
                    run_indexing.si(new_index, INDEXER, chunk)
                    for chunk in chunks
                ]
                chain(pre_task, chord(header=index_tasks,
                                      body=post_task)).apply_async()

        _print('New index and indexing tasks all queued up.')
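
`chunk_indexing` is expected to return the work split into chunks together
with the total number of items, which the command uses to size the chord. A
minimal sketch of such a helper, assuming the indexer exposes a queryset of
object IDs to index (the `get_indexable()` name is an assumption):

    def chunk_indexing(indexer, chunk_size):
        # Sketch only: split the IDs to index into lists of `chunk_size`.
        # Assumes `indexer.get_indexable()` returns a queryset of primary
        # keys; the real helper may chunk the work differently.
        ids = list(indexer.get_indexable().values_list('id', flat=True))
        total = len(ids)
        chunks = [ids[i:i + chunk_size]
                  for i in range(0, total, chunk_size)]
        return chunks, total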