def auto_lock_old_questions():
    """Locks all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(
        Question.objects.filter(is_locked=False).filter(
            created__lte=days_180).values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_locked = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        transaction.commit_unless_managed()

        if settings.ES_LIVE_INDEXING:
            try:
                es = get_indexing_es()

                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.
                from search.utils import chunked
                for chunk in chunked(q_ids, 1000):
                    # Fetch all the documents we need to update.
                    es_docs = get_documents(Question, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_locked'] = True
                        Question.index(doc, bulk=True, es=es)

                es.flush_bulk(forced=True)
                es.refresh(WRITE_INDEX, timesleep=0)

            except (ESTimeoutError, ESMaxRetryError, ESException):
                # Something happened with ES, so let's push index updating
                # into an index_task which retries when it fails because
                # of ES issues.
                index_task.delay(Question, q_ids)

def auto_lock_old_questions():
    """Locks all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(Question.objects.filter(is_locked=False)
                 .filter(created__lte=days_180)
                 .values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_locked = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        transaction.commit_unless_managed()

        if settings.ES_LIVE_INDEXING:
            try:
                es = get_indexing_es()

                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.
                from search.utils import chunked
                for chunk in chunked(q_ids, 1000):
                    # Fetch all the documents we need to update.
                    es_docs = get_documents(Question, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_locked'] = True
                        Question.index(doc, bulk=True, es=es)

                es.flush_bulk(forced=True)
                es.refresh(WRITE_INDEX, timesleep=0)

            except (ESTimeoutError, ESMaxRetryError, ESException):
                # Something happened with ES, so let's push index updating
                # into an index_task which retries when it fails because
                # of ES issues.
                index_task.delay(Question, q_ids)

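# The auto-lock task above leans on a chunked() helper imported from
# search.utils. Its real implementation isn't shown here; the sketch below is
# only an assumption of the behavior the call site needs: yield successive
# lists of at most n items from an iterable, e.g. chunked(q_ids, 1000).
from itertools import islice


def chunked(iterable, n):
    """Yield lists of up to n items until the iterable is exhausted."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, n))
        if not chunk:
            break
        yield chunk
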
def index(cls, document, bulk=False, force_insert=False, es=None):
    """Indexes a single document"""
    if not settings.ES_LIVE_INDEXING:
        return

    if es is None:
        # Use es_utils.get_indexing_es() because it uses
        # ES_INDEXING_TIMEOUT.
        es = es_utils.get_indexing_es()

    es.index(document,
             index=es_utils.WRITE_INDEX,
             doc_type=es_utils.SUMO_DOCTYPE,
             id=cls.get_document_id(document['id']),
             bulk=bulk,
             force_insert=force_insert)

def unindex(cls, id, es=None):
    """Removes a document from the index"""
    if not settings.ES_LIVE_INDEXING:
        return

    if es is None:
        # Use es_utils.get_indexing_es() because it uses
        # ES_INDEXING_TIMEOUT.
        es = es_utils.get_indexing_es()

    try:
        # TODO: There is a race condition here if this gets called
        # during reindexing.
        es.delete(es_utils.WRITE_INDEX, es_utils.SUMO_DOCTYPE, id)
    except pyes.exceptions.NotFoundException:
        # Ignore the case where we try to delete something that's
        # not there.
        pass

def unindex(cls, id_, es=None):
    """Removes a document from the index"""
    if not settings.ES_LIVE_INDEXING:
        return

    if es is None:
        # Use es_utils.get_indexing_es() because it uses
        # ES_INDEXING_TIMEOUT.
        es = es_utils.get_indexing_es()

    try:
        es.delete(es_utils.WRITE_INDEX, es_utils.SUMO_DOCTYPE,
                  cls.get_document_id(id_))
    except pyes.exceptions.NotFoundException:
        # Ignore the case where we try to delete something that's
        # not there.
        pass

def unindex(cls, id_, es=None):
    """Removes a document from the index"""
    if not settings.ES_LIVE_INDEXING:
        return

    if es is None:
        # Use es_utils.get_indexing_es() because it uses
        # ES_INDEXING_TIMEOUT.
        es = es_utils.get_indexing_es()

    try:
        es.delete(es_utils.WRITE_INDEX, es_utils.SUMO_DOCTYPE,
                  cls.get_document_id(id_))
        # Refresh after the delete, but only if the delete was
        # successful.
        es.refresh(es_utils.WRITE_INDEX, timesleep=0)
    except pyes.exceptions.NotFoundException:
        # Ignore the case where we try to delete something that's
        # not there.
        pass

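# Hypothetical usage sketch, not from the codebase: unindex() would normally
# be hooked up to object deletion so the search index stays in sync with the
# database. The handler name and signal wiring below are assumptions for
# illustration only.
def question_post_delete_handler(sender, instance, **kwargs):
    # Drop the deleted question's document from the write index; a missing
    # document is silently ignored by unindex().
    Question.unindex(instance.id)
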
def index(cls, document, bulk=False, force_insert=False, refresh=False,
          es=None):
    """Indexes a single document"""
    if not settings.ES_LIVE_INDEXING:
        return

    if es is None:
        # Use es_utils.get_indexing_es() because it uses
        # ES_INDEXING_TIMEOUT.
        es = es_utils.get_indexing_es()

    index = settings.ES_WRITE_INDEXES['default']
    doc_type = cls._meta.db_table
    es.index(document, index=index, doc_type=doc_type,
             id=document['id'], bulk=bulk, force_insert=force_insert)

    if refresh:
        es.refresh(timesleep=0)

def index(cls, document, bulk=False, force_insert=False, refresh=False,
          es=None):
    """Indexes a single document"""
    if not settings.ES_LIVE_INDEXING:
        return

    if es is None:
        # Use es_utils.get_indexing_es() because it uses
        # ES_INDEXING_TIMEOUT.
        es = es_utils.get_indexing_es()

    es.index(document,
             index=es_utils.WRITE_INDEX,
             doc_type=es_utils.SUMO_DOCTYPE,
             id=cls.get_document_id(document['id']),
             bulk=bulk,
             force_insert=force_insert)

    if refresh:
        es.refresh(es_utils.WRITE_INDEX, timesleep=0)

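# Hypothetical usage sketch, not from the codebase: the refresh flag is
# useful when the caller needs the document to be searchable right away,
# e.g. in a test or a save handler. The helper name below is an assumption;
# extract_document() is assumed to build the ES document for a given id, as
# in the index_all() code further down.
def reindex_single_question(question_id):
    document = Question.extract_document(question_id)
    Question.index(document, refresh=True)
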
def index_all(cls, percent=100):
    """Reindexes all the objects for this model.

    Yields number of documents done.

    Note: This can get run from the command line, so we log stuff to
    let the user know what's going on.

    :arg percent: The percentage of questions to index. Defaults to
        100--e.g. all of them.

    """
    es = es_utils.get_indexing_es()

    doc_type = cls._meta.db_table
    index = settings.ES_WRITE_INDEXES['default']

    start_time = time.time()

    indexable_qs = cls.get_indexable()

    log.info('reindex %s into %s index', doc_type, index)

    log.info('iterating through %s....', doc_type)
    total = indexable_qs.count()
    to_index = int(total * (percent / 100.0))
    log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)

    if to_index == 0:
        log.info('done!')
        return

    total = to_index

    for t, obj_id in enumerate(indexable_qs):
        if t > total:
            break

        if t % 1000 == 0 and t > 0:
            time_to_go = (total - t) * ((time.time() - start_time) / t)
            per_1000 = (time.time() - start_time) / (t / 1000.0)
            log.info('%s/%s... (%s to go, %s per 1000 docs)',
                     t, total,
                     es_utils.format_time(time_to_go),
                     es_utils.format_time(per_1000))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        if t % settings.ES_FLUSH_BULK_EVERY == 0:
            # We built the ES with this setting, but it doesn't
            # actually do anything with it unless we call
            # flush_bulk which causes it to check its bulk_size
            # and flush it if it's too big.
            es.flush_bulk()

        try:
            cls.index(cls.extract_document(obj_id), bulk=True, es=es)
        except Exception:
            log.exception('Unable to extract/index document (id: %d)',
                          obj_id)

        yield t

    es.flush_bulk(forced=True)
    delta_time = time.time() - start_time
    log.info('done! (%s, %s per 1000 docs)',
             es_utils.format_time(delta_time),
             es_utils.format_time(delta_time / (total / 1000.0)))
    es.refresh()

def index_all(cls, percent=100):
    """Reindexes all the objects for this model.

    Yields number of documents done.

    Note: This can get run from the command line, so we log stuff to
    let the user know what's going on.

    :arg percent: The percentage of questions to index. Defaults to
        100--e.g. all of them.

    """
    es = es_utils.get_indexing_es()

    doc_type = cls._meta.db_table
    index = settings.ES_INDEXES['default']

    start_time = time.time()

    log.info('reindex %s into %s index', doc_type, index)

    log.info('iterating through %s....', doc_type)
    total = cls.objects.count()
    to_index = int(total * (percent / 100.0))
    log.info('total %s: %s (to be indexed: %s)', doc_type, total, to_index)
    total = to_index

    # Some models have a gazillion instances. So we want to go
    # through them one at a time in a way that doesn't pull all
    # the data into memory all at once. So we iterate through ids
    # and pull the objects one at a time.
    qs = cls.objects.order_by('id').values_list('id', flat=True)

    for t, obj_id in enumerate(qs.iterator()):
        if t > total:
            break

        obj = cls.objects.get(pk=obj_id)

        if t % 1000 == 0 and t > 0:
            time_to_go = (total - t) * ((time.time() - start_time) / t)
            log.info('%s/%s... (%s to go)', t, total,
                     es_utils.format_time(time_to_go))

            # We call this every 1000 or so because we're
            # essentially loading the whole db and if DEBUG=True,
            # then Django saves every sql statement which causes
            # our memory to go up up up. So we reset it and that
            # makes things happier even in DEBUG environments.
            reset_queries()

        if t % settings.ES_FLUSH_BULK_EVERY == 0:
            # We built the ES with this setting, but it doesn't
            # actually do anything with it unless we call
            # flush_bulk which causes it to check its bulk_size
            # and flush it if it's too big.
            es.flush_bulk()

        try:
            cls.index(obj.extract_document(), bulk=True, es=es)
        except Exception:
            log.exception('Unable to extract/index document (id: %d)',
                          obj.id)

        yield t

    es.flush_bulk(forced=True)
    end_time = time.time()
    log.info('done! (%s)', es_utils.format_time(end_time - start_time))
    es.refresh()

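# Hypothetical driver sketch, not from the codebase: index_all() is a
# generator, so nothing happens until something consumes it. A reindexing
# script or management command might drive it like this; the function name
# and percent handling are assumptions.
def reindex_questions(percent=100):
    # The generator does its own progress logging; we just pull it dry.
    for _count in Question.index_all(percent=percent):
        pass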