def _update_search_index(video):
    """Refresh the team-video search index entry for a moderated video.

    Does nothing when the video is not under moderation.
    """
    moderating_team = video.moderated_by
    if not moderating_team:
        return
    team_video = TeamVideo.objects.get(video=video, team=moderating_team)
    site.get_index(TeamVideo).update_object(team_video)
def handle(self, *args, **options):
    """Management command entry point: reindex every video of one team.

    Expects a single positional argument (the team slug) and updates both
    the Video and TeamVideo search indexes for each of the team's videos.

    Raises:
        CommandError: on bad usage or an unknown team slug.
    """
    if len(args) != 1:
        raise CommandError('Usage index_team_videos <team-slug>')
    try:
        team = Team.objects.get(slug=args[0])
    except Team.DoesNotExist:
        raise CommandError('Team with slug %r not found' % (args[0], ))
    video_index = site.get_index(Video)
    team_video_index = site.get_index(TeamVideo)
    self.stdout.write("Fetching videos\n")
    # Materialize the queryset up front; select_related('video') avoids an
    # extra query per team video inside the loop.
    video_list = list(
        TeamVideo.objects.filter(team=team).select_related('video'))
    start_time = time.time()
    self.stdout.write("Indexing")
    self.stdout.flush()
    with transaction.commit_manually():
        for team_video in video_list:
            video_index.update_object(team_video.video)
            team_video_index.update_object(team_video)
            # One dot per video as a progress indicator.
            self.stdout.write(".")
            self.stdout.flush()
            # commit after each pass to make sure that we aren't keeping
            # open any database locks
            transaction.commit()
    end_time = time.time()
    self.stdout.write("\ndone indexed %s videos in %0.1f seconds\n" %
                      (len(video_list), end_time - start_time))
def handle(self, **options): if options['index_type'] == 'feature': feature_index = site.get_index(Feature) features = Feature.objects.filter(published=True) back.update(feature_index, features) else: recipient_index = site.get_index(Recipient) location_index = site.get_index(Location) if options['country']: index_data = Recipient.objects.select_related().filter(countrypayment=options['country'], total__gt=1000).only('name', 'geo1', 'geo2', 'geo3', 'geo4', 'zipcode', 'countrypayment') locations = Location.objects.filter(country=options['country']) else: raise ValueError('Country is required') settings.HAYSTACK_XAPIAN_PATH = "%s-%s" % (settings.HAYSTACK_XAPIAN_PATH, options['country']) back = backend.SearchBackend() print "now indexing Recipients" back.update(recipient_index, index_data) print "now indexing Location" back.update(location_index, locations) connection.close()
def add_to_index(obj_identifier, **kwargs):
    """Add a single object, named by its haystack identifier, to the index."""
    object_path, pk = utils.split_obj_identifier(obj_identifier)
    model = utils.get_model_class(object_path)
    obj = model.objects.get(pk=pk)
    model_index = site.get_index(model)
    model_index.backend.update(model_index, [obj])
def handle(self, *args, **options):
    """Profile a single video's search-index update with cProfile.

    Usage: profile_index <video-pk> [sort] [restrictions]
    """
    if not 1 <= len(args) <= 3:
        raise CommandError(
            'Usage profile_index <video-pk> [sort] [restrictions]')
    try:
        video = Video.objects.get(pk=args[0])
    except Video.DoesNotExist:
        raise CommandError('Video not found: %s' % (args[0],))
    # Optional sort column for pstats output.
    sort = args[1] if len(args) > 1 else 'cumulative'
    # Optional restriction: a float is a fraction of lines, an int a count.
    if len(args) > 2:
        raw = args[2]
        restrictions = float(raw) if '.' in raw else int(raw)
    else:
        restrictions = 10
    video_index = site.get_index(Video)
    profiler = cProfile.Profile()
    profiler.enable()
    video_index.update_object(video)
    profiler.disable()
    stats = pstats.Stats(profiler, stream=self.stdout)
    stats.strip_dirs().sort_stats(sort).print_stats(restrictions)
def handle_app(self, app, **options):
    """Remove index entries whose database rows no longer exist, for every
    ScheduledSearchIndex-backed model in ``app``.
    """
    # Cause the default site to load.
    from haystack import site
    from django.db.models import get_models
    from haystack.exceptions import NotRegistered
    from haystack.query import SearchQuerySet
    from haystack_scheduled.indexes import ScheduledSearchIndex
    for model in get_models(app):
        try:
            index = site.get_index(model)
        except NotRegistered:
            if self.verbosity >= 2:
                print "Skipping '%s' - no index." % model
            continue
        if not isinstance(index, ScheduledSearchIndex):
            if self.verbosity >= 2:
                print "Skipping '%s' - only ScheduledSearchIndex is supported." % model
            continue
        print "'%s' - unindexing removed objects." % model
        # All pks still in the DB, stringified so they compare with result.pk.
        existings_pks = set(map(smart_str, model.objects.values_list("pk", flat=True)))
        for result in SearchQuerySet().models(model):
            if smart_str(result.pk) not in existings_pks:
                if self.verbosity >= 2:
                    print "Unindexing pk %s" % result.pk
                # Backend identifier format: "<app_label>.<model_name>.<pk>".
                index.backend.remove(".".join([result.app_label, result.model_name, str(result.pk)]))
def _fill_cache(self):
    """Fetch the next page of search results into ``self._result_cache``,
    optionally loading the matching database objects when ``_load_all``
    is set.
    """
    from haystack import site
    if self._result_cache is None:
        self._result_cache = []
    # Tell the query where to start from and how many we'd like.
    cache_length = len(self._result_cache)
    self.query._reset()
    self.query.set_limits(cache_length, cache_length + ITERATOR_LOAD_PER_QUERY)
    results = self.query.get_results()
    # Check if we wish to load all objects.
    if self._load_all:
        original_results = []
        models_pks = {}
        loaded_objects = {}
        # Remember the search position for each result so we don't have to resort later.
        for result in results:
            original_results.append(result)
            models_pks.setdefault(result.model, []).append(result.pk)
        # Load the objects for each model in turn.
        for model in models_pks:
            if model in self._load_all_querysets:
                # Use the overriding queryset.
                loaded_objects[model] = self._load_all_querysets[model].in_bulk(models_pks[model])
            else:
                # Check the SearchIndex for the model for an override.
                try:
                    index = site.get_index(model)
                    qs = index.load_all_queryset()
                    loaded_objects[model] = qs.in_bulk(models_pks[model])
                except NotRegistered:
                    # The model returned doesn't seem to be registered with
                    # the current site. We should silently fail and populate
                    # nothing for those objects.
                    loaded_objects[model] = []
    if len(results) < ITERATOR_LOAD_PER_QUERY:
        # A short page means we ran out of results; record the shortfall so
        # iteration bookkeeping stays consistent.
        self._ignored_result_count += ITERATOR_LOAD_PER_QUERY - len(results)
    for result in results:
        if self._load_all:
            # We have to deal with integer keys being cast from strings; if this
            # fails we've got a character pk.
            try:
                result.pk = int(result.pk)
            except ValueError:
                pass
            try:
                result._object = loaded_objects[result.model][result.pk]
            except (KeyError, IndexError):
                # The object was either deleted since we indexed or should
                # be ignored; fail silently.
                self._ignored_result_count += 1
                continue
        self._result_cache.append(result)
def _rebuild_index(self):
    """Rebuild the Video search index from scratch."""
    from haystack import site
    video_index = site.get_index(models.Video)
    video_index.reindex()
def __init__(self, obj, admin_site=None):
    """Wrap a search result, ensuring it carries its SearchIndex."""
    self.admin = admin_site
    self.object = obj
    if getattr(self.object, 'searchindex', None) is not None:
        return
    # Haystack < 1.2: results have no ``searchindex`` attribute, so look
    # it up from the site registry.
    from haystack import site
    self.object.searchindex = site.get_index(self.object.model)
def get_stored_fields(self):
    """
    Return a dict of this result's stored fields, lazily computed.

    Only fields the SearchIndex marks as ``stored`` are included; an
    unregistered model yields an empty dict.  Useful for serializing
    results.
    """
    if self._stored_fields is not None:
        return self._stored_fields
    from haystack import site
    from haystack.exceptions import NotRegistered
    try:
        index = site.get_index(self.model)
    except NotRegistered:
        # Not found? Return nothing (without caching).
        return {}
    self._stored_fields = dict(
        (fieldname, getattr(self, fieldname, u''))
        for fieldname, field in index.fields.items()
        if field.stored is True)
    return self._stored_fields
def get_index(self, model_class):
    """Look up the ``SearchIndex`` registered for ``model_class``.

    Logs an error and returns ``None`` when the model is unregistered.
    """
    try:
        return site.get_index(model_class)
    except NotRegistered:
        self.log.error(
            "Couldn't find a registered SearchIndex for %s." % model_class)
    return None
def remove_index(app_name, model_name, identifier):
    """Remove one object, given its identifier, from its model's index."""
    from haystack import site
    import openPLM.plmapp.search_indexes
    model_class = get_model(app_name, model_name)
    index = site.get_index(model_class)
    index.remove_object(identifier)
def handle(self, **options): from parliament.search.models import IndexingTask delete_tasks = list( IndexingTask.objects.filter(action='delete') ) update_tasks = list( IndexingTask.objects.filter(action='update').prefetch_related('content_object') ) solr = pysolr.Solr(settings.HAYSTACK_SOLR_URL) if update_tasks: update_objs = [t.content_object for t in update_tasks if t.content_object] update_objs.sort(key=lambda o: o.__class__.__name__) for cls, objs in itertools.groupby(update_objs, lambda o: o.__class__): print "Indexing %s" % cls index = site.get_index(cls) prepared_objs = [index.prepare(o) for o in objs] solr.add(prepared_objs) IndexingTask.objects.filter(id__in=[t.id for t in update_tasks]).delete() if delete_tasks: for dt in delete_tasks: print "Deleting %s" % dt.identifier solr.delete(id=dt.identifier, commit=False) solr.commit() IndexingTask.objects.filter(id__in=[t.id for t in delete_tasks]).delete()
def get_stored_fields(self):
    """
    Lazily build and return ``{fieldname: value}`` for every field the
    SearchIndex marks as stored.  Returns an empty dict when the model is
    not registered.  Useful for serializing results.
    """
    if self._stored_fields is not None:
        return self._stored_fields
    from haystack import site
    from haystack.exceptions import NotRegistered
    try:
        index = site.get_index(self.model)
    except NotRegistered:
        # Model unknown to the search site -- nothing stored.
        return {}
    stored = {}
    for fieldname, field in index.fields.items():
        if field.stored is True:
            stored[fieldname] = getattr(self, fieldname, u'')
    self._stored_fields = stored
    return self._stored_fields
def detail(request, idea_id):
    """
    Detail view; idea_id must be a string containing an int.
    """
    idea = get_object_or_404(Idea, pk=int(idea_id))
    if request.method == 'POST':
        # POST here adds tags to the idea via IdeaTagForm.
        tag_form = IdeaTagForm(request.POST)
        if tag_form.is_valid():
            data = tag_form.clean()['tags']
            # Split the comma-separated input, dropping empty entries.
            tags = [tag.strip() for tag in data.split(',')
                    if tag.strip() != '']
            idea.tags.add(*tags)
            # Make sure the search index included the tags
            site.get_index(Idea).update_object(idea)
            return HttpResponseRedirect(
                reverse('idea_detail', args=(idea.id,)))
    else:
        tag_form = IdeaTagForm()
    voters = User.objects.filter(vote__idea=idea, vote__vote=UP_VOTE)
    for v in voters:
        # Profiles are optional; tolerate missing profile configurations.
        try:
            v.profile = v.get_profile()
        except (ObjectDoesNotExist, SiteProfileNotAvailable):
            v.profile = None
    idea_type = ContentType.objects.get(app_label="idea", model="idea")
    # Annotate each tag with its usage count restricted to Idea objects.
    tags = idea.tags.extra(select={
        'tag_count': """ SELECT COUNT(*) from taggit_taggeditem tt WHERE tt.tag_id = taggit_tag.id AND content_type_id = %s """
    }, select_params=[idea_type.id]).order_by('name')
    for tag in tags:
        tag.tag_url = "%s?tags=%s" % (reverse('idea_list'), tag.slug)
    return _render(request, 'idea/detail.html', {
        'idea': idea,  # title, body, user name, user photo, time
        'support': request.user in voters,
        'tags': tags,
        'voters': voters,
        'tag_form': tag_form
    })
def update_index(app_name, model_name, pk, **kwargs):
    """Re-index one object identified by app label, model name and pk."""
    from haystack import site
    import openPLM.plmapp.search_indexes
    model_class = get_model(app_name, model_name)
    obj = model_class.objects.select_related(depth=1).get(pk=pk)
    site.get_index(model_class).update_object(obj)
def remove_search_index(model_class, obj_identifier):
    """Drop one object from the model's search index, if one is registered."""
    try:
        index = site.get_index(model_class)
    except NotRegistered:
        log(u'Search index is not registered for %s' % model_class)
        return None
    index.remove_object(obj_identifier)
def remove_search_index(model_class, obj_identifier):
    """Remove ``obj_identifier`` from ``model_class``'s search index.

    Logs and returns ``None`` when no index is registered for the model.
    """
    try:
        search_index = site.get_index(model_class)
    except NotRegistered:
        # Fixed typo in the log message ("Seacrh" -> "Search"), matching
        # the correctly-spelled sibling implementation.
        log(u'Search index is not registered for %s' % model_class)
        return None
    search_index.remove_object(obj_identifier)
def handle_app(self, app, **options):
    """Batch-index every registered model in ``app``, optionally limited
    to objects updated within the last ``self.age`` hours.
    """
    # Cause the default site to load.
    from haystack import site
    from django.db.models import get_models
    from haystack.exceptions import NotRegistered
    if self.site:
        # A custom SearchSite was named on the command line: import it and
        # rebind the local ``site`` name to it.  On failure we silently
        # keep the default site.
        path_bits = self.site.split(".")
        module_name = ".".join(path_bits[:-1])
        site_name = path_bits[-1]
        try:
            module = importlib.import_module(module_name)
            site = getattr(module, site_name)
        except (ImportError, NameError):
            pass
    for model in get_models(app):
        try:
            index = site.get_index(model)
        except NotRegistered:
            if self.verbosity >= 2:
                print "Skipping '%s' - no index." % model
            continue
        extra_lookup_kwargs = {}
        updated_field = index.get_updated_field()
        if self.age:
            if updated_field:
                # Restrict to objects touched within the last ``age`` hours.
                extra_lookup_kwargs["%s__gte" % updated_field] = datetime.datetime.now() - datetime.timedelta(
                    hours=self.age
                )
            else:
                if self.verbosity >= 2:
                    print "No updated date field found for '%s' - not restricting by age." % model.__name__
        # `.select_related()` seems like a good idea here but can fail on
        # nullable `ForeignKey` as well as what seems like other cases.
        qs = index.get_queryset().filter(**extra_lookup_kwargs).order_by(model._meta.pk.name)
        total = qs.count()
        if self.verbosity >= 1:
            print "Indexing %d %s." % (total, smart_str(model._meta.verbose_name_plural))
        for start in range(0, total, self.batchsize):
            end = min(start + self.batchsize, total)
            if self.verbosity >= 2:
                print " indexing %s - %d of %d." % (start + 1, end, total)
            # Get a clone of the QuerySet so that the cache doesn't bloat up
            # in memory. Useful when reindexing large amounts of data.
            small_cache_qs = qs.all()
            index.backend.update(index, small_cache_qs[start:end])
            # Clear out the DB connections queries because it bloats up RAM.
            reset_queries()
def search_index_delete(app_name, model_name, obj_identifier, **kwargs): logger = search_index_delete.get_logger(**kwargs) try: model_class = get_model(app_name, model_name) search_index = site.get_index(model_class) search_index.remove_object(obj_identifier) except Exception, exc: logger.error(exc) search_index_delete.retry(exc=exc)
def update_one_team_video(team_video_id):
    """Update the search index entry for a single TeamVideo.

    Silently returns when the TeamVideo no longer exists.
    """
    # Dropped the unused TeamVideoLanguage import, matching the other
    # copies of this task elsewhere in the codebase.
    from teams.models import TeamVideo
    try:
        team_video = TeamVideo.objects.get(id=team_video_id)
    except TeamVideo.DoesNotExist:
        return
    tv_search_index = site.get_index(TeamVideo)
    tv_search_index.backend.update(tv_search_index, [team_video])
def run(self, app_name, model_name, pk, **kwargs): logger = self.get_logger(**kwargs) try: model_class = get_model(app_name, model_name) instance = model_class.objects.get(pk=pk) search_index = site.get_index(model_class) search_index.update_object(instance) except ObjectDoesNotExist, exc: logger.warn(exc)
def update_one_team_video(team_video_id):
    """Update the Solr index for the given team video."""
    from teams.models import TeamVideo
    try:
        tv = TeamVideo.objects.get(id=team_video_id)
    except TeamVideo.DoesNotExist:
        # Nothing to index if the record is gone.
        return
    index = site.get_index(TeamVideo)
    index.backend.update(index, [tv])
def update_one_team_video(team_video_id):
    """Refresh the search backend entry for one TeamVideo, if it exists."""
    from teams.models import TeamVideo, TeamVideoLanguage
    try:
        tv = TeamVideo.objects.get(id=team_video_id)
    except TeamVideo.DoesNotExist:
        return
    index = site.get_index(TeamVideo)
    index.backend.update(index, [tv])
def get_index(self, model_class, **kwargs):
    """Look up the ``SearchIndex`` registered for ``model_class``.

    Logs an error and returns ``None`` when no index is found.
    """
    logger = self.get_logger(**kwargs)
    try:
        return index_holder.get_index(model_class)
    except IndexNotFoundException:
        logger.error("Couldn't find a SearchIndex for %s." % model_class)
    return None
def run(self, app_name, model_name, pk, **kwargs): logger = self.get_logger(**kwargs) try: model_class = get_model(app_name, model_name) instance = model_class.objects.get(pk=pk) search_index = site.get_index(model_class) search_index.update_object(instance) except Exception, exc: logger.error(exc) self.retry([app_name, model_name, pk], kwargs, exc=exc)
def update_index(app_name, model_name, pk, fast_reindex=False, **kwargs):
    """Re-index one object; ``fast_reindex`` requests a lighter pass."""
    from haystack import site
    import openPLM.plmapp.search_indexes
    model_class = get_model(app_name, model_name)
    obj = _get_manager(model_class).get(pk=pk)
    if fast_reindex:
        # Marker attribute, presumably consumed by the SearchIndex
        # implementation -- confirm in search_indexes.
        obj.fast_reindex = True
    site.get_index(model_class).update_object(obj)
def haystack_update_index(app_label, model_name, pk, is_removal, using="default"):
    """
    Updates a haystack index for the given model (specified by ``app_label``
    and ``model_name``). If ``is_removal`` is ``True``, a fake instance is
    constructed with the given ``pk`` and passed to the index's
    :meth:`remove_object` method. Otherwise, the latest version of the
    instance is fetched from the database and passed to the index's
    :meth:`update_object` method.

    If an import_app_label, import_model, and import_pk are provided, this
    task will spawn ``mark_import_complete``.
    """
    # NOTE(review): the docstring mentions import_* parameters this
    # signature does not accept -- likely stale; confirm against callers.
    model_class = get_model(app_label, model_name)
    search_index = site.get_index(model_class)
    try:
        if is_removal:
            # A skeleton instance carrying only the pk suffices for removal.
            instance = model_class(pk=pk)
            search_index.remove_object(instance)
        else:
            # NOTE(review): fetches Video explicitly instead of
            # ``model_class`` -- appears to assume this task only ever
            # handles videos; confirm.
            try:
                instance = Video.objects.using(using).get(pk=pk)
            except model_class.DoesNotExist:
                logging.debug(
                    ("haystack_update_index(%r, %r, %r, %r, using=%r)"
                     " could not find video with pk %i"),
                    app_label, model_name, pk, is_removal, using, pk,
                )
            else:
                # Only active videos belong in the index; anything else is
                # scrubbed out.
                if instance.status == Video.ACTIVE:
                    search_index.update_object(instance)
                else:
                    search_index.remove_object(instance)
    except (DatabaseLockError, LockError), e:
        # maximum wait is ~30s
        # Exponential backoff with jitter: 2**retries, exponent capped at 4.
        exp = min(haystack_update_index.request.retries, 4)
        countdown = random.random() * (2 ** exp)
        logging.debug(
            ("haystack_update_index(%r, %r, %r, %r, using=%r) "
             "retrying due to %s with countdown %r"),
            app_label, model_name, pk, is_removal, using,
            e.__class__.__name__, countdown,
        )
        haystack_update_index.retry(countdown=countdown)
def update_search_index_for_qs(model_class, pks):
    """Bulk-update the search index for the given primary keys.

    Records the batch size and elapsed time in a ``LogEntry``.
    """
    start = time.time()
    qs = model_class._default_manager.filter(pk__in=pks)
    try:
        search_index = site.get_index(model_class)
    except NotRegistered:
        # Fixed typo in the log message ("Seacrh" -> "Search").
        log(u'Search index is not registered for %s' % model_class)
        return None
    search_index.backend.update(search_index, qs)
    LogEntry(num=len(pks), time=time.time() - start).save()
def update_search_index_for_qs(model_class, pks):
    """Bulk-update the search index for the given primary keys.

    Records the batch size and elapsed time in a ``LogEntry``.
    """
    start = time.time()
    qs = model_class._default_manager.filter(pk__in=pks)
    try:
        search_index = site.get_index(model_class)
    except NotRegistered:
        # Fixed typo in the log message ("Seacrh" -> "Search").
        log(u'Search index is not registered for %s' % model_class)
        return None
    search_index.backend.update(search_index, qs)
    LogEntry(num=len(pks), time=time.time() - start).save()
def update_search_index(model_class, pk):
    """Re-index a single object, logging and bailing out on missing data.

    Returns ``None`` (after logging) when either the object or the model's
    search index cannot be found.
    """
    try:
        obj = model_class.objects.get(pk=pk)
    except model_class.DoesNotExist:
        log(u'Object does not exist for %s %s' % (model_class, pk))
        return
    try:
        search_index = site.get_index(model_class)
    except NotRegistered:
        # Fixed typo in the log message ("Seacrh" -> "Search").
        log(u'Search index is not registered for %s' % model_class)
        return None
    search_index.update_object(obj)
def add_idea(request):
    """Create a new Idea from POSTed form data, up-vote it for its creator
    and index it; on GET, render the add form pre-filled from ``idea_title``.
    """
    banner = get_banner()
    if request.method == 'POST':
        # Fresh Idea shell in the workflow's first state; the form fills in
        # the remaining fields.
        idea = Idea(creator=request.user, state=state_helper.get_first_state())
        if idea.state.name == 'Active':
            form = IdeaForm(request.POST, instance=idea)
            if form.is_valid():
                new_idea = form.save()
                vote_up(new_idea, request.user)
                # Make sure the search index included the tags
                site.get_index(Idea).update_object(new_idea)
                return HttpResponseRedirect(reverse('idea_detail', args=(idea.id,)))
        else:
            return HttpResponse('Idea is archived', status=403)
    else:
        idea_title = request.GET.get('idea_title', '')
        form = IdeaForm(initial={'title':idea_title})
    # NOTE(review): on an invalid POST this falls through with
    # ``idea_title`` unbound, which would raise NameError below -- confirm
    # whether invalid POSTs are possible here.
    return _render(request, 'idea/add.html', {
        'form':form,
        'banner':banner,
        'similar': [r.object for r in more_like_text(idea_title, Idea)]
    })
def handle_app(self, app, **options):
    """Batch-index every registered model in ``app``, optionally limited
    to objects updated within the last ``self.age`` hours.
    """
    # Cause the default site to load.
    from haystack import handle_registrations
    handle_registrations()
    from django.db.models import get_models
    from haystack import site
    from haystack.exceptions import NotRegistered
    for model in get_models(app):
        try:
            index = site.get_index(model)
        except NotRegistered:
            if self.verbosity >= 2:
                print "Skipping '%s' - no index." % model
            continue
        extra_lookup_kwargs = {}
        updated_field = index.get_updated_field()
        if self.age:
            if updated_field:
                # Restrict to objects touched within the last ``age`` hours.
                extra_lookup_kwargs['%s__gte' % updated_field] = datetime.datetime.now() - datetime.timedelta(hours=self.age)
            else:
                if self.verbosity >= 2:
                    print "No updated date field found for '%s' - not restricting by age." % model.__name__
        # DRL_TODO: .select_related() seems like a good idea here but
        # can cause empty QuerySets. Why?
        qs = index.get_query_set().filter(**extra_lookup_kwargs).order_by(model._meta.pk.name)
        total = qs.count()
        if self.verbosity >= 1:
            print "Indexing %d %s." % (total, smart_str(model._meta.verbose_name_plural))
        for start in range(0, total, self.batchsize):
            end = min(start + self.batchsize, total)
            if self.verbosity >= 2:
                print " indexing %s - %d of %d." % (start+1, end, total)
            # Get a clone of the QuerySet so that the cache doesn't bloat up
            # in memory. Useful when reindexing large amounts of data.
            small_cache_qs = qs.all()
            index.backend.update(index, small_cache_qs[start:end])
            # Clear out the DB connections queries because it bloats up RAM.
            reset_queries()
def update_external(self, print_delta=0, start=0, select_related=None):
    """ Update search index and cached_templates for all objects """
    qs = self.all()
    if select_related:
        qs = qs.select_related(*select_related)
    if start:
        # Resume from a given primary key (useful after an interrupted run).
        qs = qs.filter(pk__gte=start)
    qs = queryset_iterator(qs)
    #context = Context(dict(STATIC_URL=settings.STATIC_URL))
    search_index = site.get_index(self.model)
    for obj in qs:
        obj.update_search_index(search_index)
        #obj.update_cached_template(context)
        if print_delta and not obj.id % print_delta:
            # Progress marker every ``print_delta`` ids.
            print obj.id
def video_changed_tasks(video_pk, new_version_id=None):
    """React to a video change: refresh metadata, fire version-related
    hooks, and update the relevant search indexes.
    """
    from videos import metadata_manager
    from videos.models import Video
    from teams.models import TeamVideo
    metadata_manager.update_metadata(video_pk)
    if new_version_id is not None:
        _send_notification(new_version_id)
        _check_alarm(new_version_id)
        _detect_language(new_version_id)
    video = Video.objects.get(pk=video_pk)
    if video.teamvideo_set.count() > 0:
        index = site.get_index(TeamVideo)
        index.backend.update(index, list(video.teamvideo_set.all()))
    video.update_search_index()
def extract_and_index_pdf_text(document_pk=None, **kwargs):
    """Extract per-page text from a PDF Document and index the pages.

    Returns False when the document is missing, has no page count, or is
    not a PDF; True after all pages have been extracted and indexed.
    """
    logger = extract_and_index_pdf_text.get_logger(**kwargs)
    logger.debug("indexing doc with pk %s" % document_pk)
    try:
        doc = Document.objects.get(pk=document_pk)
    except Document.DoesNotExist:
        logger.warning("Warning, Document with pk %s does not exist" % str(document_pk))
        return False
    if not doc.pages or doc.mimetype != 'application/pdf':
        logger.info("Warning, doc.pages (%s) not set or doc.mimetype (%s) != 'application/pdf'" % (str(doc.pages), str(doc.mimetype)))
        return False
    #logger.debug("filename path %s %s" % (str(doc.file.path), str(doc.file.name)))
    # Page numbers are 1-based; extract and persist text page by page.
    for p in xrange(1, doc.pages + 1):
        text = pdf2text(doc.file.path, p)
        doc.page_set.create(num=p, text=text)
    index = site.get_index(Page)
    index.backend.update(index, doc.page_set.all())
    return True
def video_changed_tasks(video_pk, new_version_id=None):
    """Handle a changed video: update metadata, run per-version hooks,
    then refresh the team-video and video search indexes.
    """
    from videos import metadata_manager
    from videos.models import Video
    from teams.models import TeamVideo
    metadata_manager.update_metadata(video_pk)
    if new_version_id is not None:
        _send_notification(new_version_id)
        _check_alarm(new_version_id)
        _detect_language(new_version_id)
    video = Video.objects.get(pk=video_pk)
    if video.teamvideo_set.count() > 0:
        team_index = site.get_index(TeamVideo)
        team_index.backend.update(team_index,
                                  list(video.teamvideo_set.all()))
    video.update_search_index()
def _get_team_video_from_search_record(search_record):
    """Resolve a search record to its TeamVideo, purging stale records."""
    prefetched = getattr(search_record, '_team_video', None)
    if prefetched:
        # This is ugly, but allows us to pre-fetch the teamvideos for the
        # search records all at once to avoid multiple DB queries.
        return prefetched
    try:
        return TeamVideo.objects.get(pk=search_record.team_video_pk)
    except TeamVideo.DoesNotExist:
        from raven.contrib.django.models import client
        client.create_from_exception()
        # The record went stale (its TeamVideo is gone) for reasons
        # unknown; delete it from the index so it cannot resurface.
        index = site.get_index(TeamVideo)
        index.backend.remove(search_record.id)
        logger.error("Removing %s from solr since it's stale"
                     % search_record.id)
        return None
def _get_team_video_from_search_record(search_record):
    """Map a search record to its TeamVideo; drop stale index entries."""
    prefetched = getattr(search_record, '_team_video', None)
    if prefetched:
        # This is ugly, but allows us to pre-fetch the teamvideos for the
        # search records all at once to avoid multiple DB queries.
        return prefetched
    try:
        return TeamVideo.objects.get(pk=search_record.team_video_pk)
    except TeamVideo.DoesNotExist:
        logger.warn('DoesNotExist error when looking up search record',
                    exc_info=True)
        # The record went stale (its TeamVideo is gone) for reasons
        # unknown; delete it from the index so it cannot resurface.
        index = site.get_index(TeamVideo)
        index.backend.remove(search_record.id)
        logger.error("Removing %s from solr since it's stale"
                     % search_record.id)
        return None
def handle(self, **options):
    """Drain queued IndexingTasks: push updates to Solr, then deletions,
    removing each task once its batch has been applied.
    """
    from parliament.search.models import IndexingTask
    delete_tasks = list(IndexingTask.objects.filter(action='delete'))
    update_tasks = list(
        IndexingTask.objects.filter(
            action='update').prefetch_related('content_object'))
    solr = pysolr.Solr(settings.HAYSTACK_SOLR_URL, timeout=600)
    if update_tasks:
        # Skip tasks whose target object has since been deleted.
        update_objs = [
            t.content_object for t in update_tasks if t.content_object
        ]
        # groupby requires its input sorted on the grouping key.
        update_objs.sort(key=lambda o: o.__class__.__name__)
        for cls, objs in itertools.groupby(update_objs, lambda o: o.__class__):
            logger.debug("Indexing %s" % cls)
            index = site.get_index(cls)
            # Honour an optional per-index inclusion predicate.
            if hasattr(index, 'should_obj_be_indexed'):
                objs = filter(index.should_obj_be_indexed, objs)
            prepared_objs = [index.prepare(o) for o in objs]
            solr.add(prepared_objs)
        IndexingTask.objects.filter(
            id__in=[t.id for t in update_tasks]).delete()
    if delete_tasks:
        for dt in delete_tasks:
            print "Deleting %s" % dt.identifier
            solr.delete(id=dt.identifier, commit=False)
        # Single commit for the whole deletion batch.
        solr.commit()
        IndexingTask.objects.filter(
            id__in=[t.id for t in delete_tasks]).delete()
def handle(self, *args, **options):
    """Long-running loop that continuously re-indexes videos, throttled to
    ``rate`` versions per second (default 1).  Never returns.
    """
    self.queued_versions = []
    self.last_index_time = {}
    self.video_index = site.get_index(Video)
    self.last_fetch_all_videos_time = 0
    self.last_fetch_popular_videos_time = 0
    self.all_video_queue = []
    self.popular_video_queue = []
    # Time budget per version so that throughput averages ``rate``/sec.
    time_per_version = 1.0 / float(options.get('rate', 1))
    while True:
        start_time = time.time()
        if not self.queued_versions:
            # Refill the work queue when it runs dry.
            self.queue_up_versions()
            queue_time = time.time() - start_time
            self.stdout.write("queue_up_versions() took %0.3fs seconds\n" % queue_time)
        start_time = time.time()
        video_id = self.index_one_version()
        index_time = time.time() - start_time
        self.stdout.write("indexing %s took %0.3f seconds\n" % (
            video_id, index_time))
        # Sleep off the remainder of this version's time budget.
        if index_time < time_per_version:
            time.sleep(time_per_version - index_time)
def _process_results(self, raw_results, highlight=False, result_class=None):
    """Convert a raw pysolr response into a dict with ``results`` (a list
    of SearchResult objects), ``hits``, ``facets`` and any
    ``spelling_suggestion``.
    """
    if not self.site:
        from haystack import site
    else:
        site = self.site
    results = []
    hits = raw_results.hits
    facets = {}
    spelling_suggestion = None
    if result_class is None:
        result_class = SearchResult
    if hasattr(raw_results, 'facets'):
        facets = {
            'fields': raw_results.facets.get('facet_fields', {}),
            'dates': raw_results.facets.get('facet_dates', {}),
            'queries': raw_results.facets.get('facet_queries', {}),
        }
        for key in ['fields']:
            for facet_field in facets[key]:
                # Convert to a two-tuple, as Solr's json format returns a list of
                # pairs.
                facets[key][facet_field] = zip(facets[key][facet_field][::2], facets[key][facet_field][1::2])
    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
        if hasattr(raw_results, 'spellcheck'):
            if len(raw_results.spellcheck.get('suggestions', [])):
                # For some reason, it's an array of pairs. Pull off the
                # collated result from the end.
                spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]
    indexed_models = site.get_indexed_models()
    for raw_result in raw_results.docs:
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)
        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = site.get_index(model)
                string_key = str(key)
                # Fields with a converter coerce their own values; anything
                # else goes through pysolr's generic conversion.
                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self.conn._to_python(value)
            # Bookkeeping keys are not model fields.
            del(additional_fields[DJANGO_CT])
            del(additional_fields[DJANGO_ID])
            del(additional_fields['score'])
            if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]
            result = result_class(app_label, model_name, raw_result[DJANGO_ID], raw_result['score'], searchsite=self.site, **additional_fields)
            results.append(result)
        else:
            # Stale document for an unknown model: drop it from the count.
            hits -= 1
    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
    """Convert a Whoosh results page into SearchResult objects.

    Returns a dict with ``results``, ``hits``, ``facets`` (always empty
    here) and any ``spelling_suggestion``.
    """
    from haystack import site
    results = []
    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)
    facets = {}
    spelling_suggestion = None
    indexed_models = site.get_indexed_models()
    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result['django_ct'].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)
        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = site.get_index(model)
                string_key = str(key)
                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    # Special-cased due to the nature of KEYWORD fields.
                    if isinstance(index.fields[string_key], MultiValueField):
                        # Bug fix: compare the length with ``==`` rather
                        # than ``is``; identity tests against ints only
                        # work by accident of CPython's small-int cache.
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)
            # Bookkeeping keys are not model fields.
            del(additional_fields['django_ct'])
            del(additional_fields['django_id'])
            if highlight:
                from whoosh import analysis
                from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                sa = analysis.StemmingAnalyzer()
                terms = [term.replace('*', '') for term in query_string.split()]
                additional_fields['highlighted'] = {
                    self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                }
            result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
            results.append(result)
        else:
            # Stale document for an unknown model: drop it from the count.
            hits -= 1
    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)
    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
def reindex_team_videos(self):
    """Rebuild the TeamVideo search index from scratch."""
    team_video_index = site.get_index(TeamVideo)
    team_video_index.reindex()
def handle_app(self, app, **options):
    """Index all registered models of ``app``, optionally fanning batches
    out to worker processes, and optionally removing stale index entries.
    """
    from django.db.models import get_models
    from haystack.exceptions import NotRegistered
    site = get_site(self.site)
    if self.workers > 0:
        import multiprocessing
    for model in get_models(app):
        try:
            index = site.get_index(model)
        except NotRegistered:
            if self.verbosity >= 2:
                print "Skipping '%s' - no index." % model
            continue
        qs = build_queryset(index, model, age=self.age, verbosity=self.verbosity)
        total = qs.count()
        if self.verbosity >= 1:
            print "Indexing %d %s." % (total, smart_str(model._meta.verbose_name_plural))
        # Stringified pks of everything indexed in this run; used below to
        # decide what to remove.
        pks_seen = set([smart_str(pk) for pk in qs.values_list('pk', flat=True)])
        if self.workers > 0:
            ghetto_queue = []
        for start in range(0, total, self.batchsize):
            end = min(start + self.batchsize, total)
            if self.workers == 0:
                do_update(index, qs, start, end, total, self.verbosity)
            else:
                # Defer the batch; worker() dispatches on the first element.
                ghetto_queue.append(('do_update', model, start, end, total, self.site, self.age, self.verbosity))
        if self.workers > 0:
            pool = multiprocessing.Pool(self.workers)
            pool.map(worker, ghetto_queue)
        if self.remove:
            if self.age or total <= 0:
                # They're using a reduced set, which may not incorporate
                # all pks. Rebuild the list with everything.
                qs = index.index_queryset().values_list('pk', flat=True)
                pks_seen = set([smart_str(pk) for pk in qs])
                total = len(pks_seen)
            if self.workers > 0:
                ghetto_queue = []
            for start in range(0, total, self.batchsize):
                upper_bound = start + self.batchsize
                if self.workers == 0:
                    do_remove(index, model, pks_seen, start, upper_bound)
                else:
                    ghetto_queue.append(('do_remove', model, pks_seen, start, upper_bound, self.site, self.verbosity))
            if self.workers > 0:
                pool = multiprocessing.Pool(self.workers)
                pool.map(worker, ghetto_queue)
def _process_results(self, raw_results, highlight=False, result_class=None):
    """
    Translate a raw Solr (pysolr) response into haystack's standard result
    dict.

    :param raw_results: pysolr response object exposing ``hits``, ``docs``
        and optionally ``facets``, ``spellcheck`` and ``highlighting``.
    :param highlight: accepted for interface compatibility; highlighting
        data is taken from ``raw_results.highlighting`` regardless.
    :param result_class: class used to wrap each hit; defaults to
        ``SearchResult``.
    :returns: dict with ``results``, ``hits``, ``facets`` and
        ``spelling_suggestion``.
    """
    # Use the explicitly-configured site when present, otherwise fall back
    # to the default haystack site.
    if not self.site:
        from haystack import site
    else:
        site = self.site

    results = []
    hits = raw_results.hits
    facets = {}
    spelling_suggestion = None

    if result_class is None:
        result_class = SearchResult

    if hasattr(raw_results, 'facets'):
        facets = {
            'fields': raw_results.facets.get('facet_fields', {}),
            'dates': raw_results.facets.get('facet_dates', {}),
            'queries': raw_results.facets.get('facet_queries', {}),
        }

        # Only field facets need reshaping; date/query facets pass through.
        for key in ['fields']:
            for facet_field in facets[key]:
                # Convert to a two-tuple, as Solr's json format returns a list of
                # pairs.
                facets[key][facet_field] = zip(
                    facets[key][facet_field][::2],
                    facets[key][facet_field][1::2])

    if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
        if hasattr(raw_results, 'spellcheck'):
            if len(raw_results.spellcheck.get('suggestions', [])):
                # For some reason, it's an array of pairs. Pull off the
                # collated result from the end.
                spelling_suggestion = raw_results.spellcheck.get(
                    'suggestions')[-1]

    indexed_models = site.get_indexed_models()

    for raw_result in raw_results.docs:
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = get_model(app_label, model_name)

        # Documents for unregistered/removed models are dropped and
        # subtracted from the hit count.
        if model and model in indexed_models:
            for key, value in raw_result.items():
                index = site.get_index(model)
                string_key = str(key)

                # Prefer the index field's own conversion when it defines
                # one; otherwise fall back to pysolr's generic coercion.
                if string_key in index.fields and hasattr(
                        index.fields[string_key], 'convert'):
                    additional_fields[string_key] = index.fields[
                        string_key].convert(value)
                else:
                    additional_fields[string_key] = self.conn._to_python(
                        value)

            # Strip bookkeeping keys before they reach the result kwargs.
            del (additional_fields[DJANGO_CT])
            del (additional_fields[DJANGO_ID])
            del (additional_fields['score'])

            if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                additional_fields[
                    'highlighted'] = raw_results.highlighting[
                        raw_result[ID]]

            result = result_class(app_label, model_name, raw_result[DJANGO_ID], raw_result['score'], searchsite=self.site, **additional_fields)
            results.append(result)
        else:
            hits -= 1

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
def handle_app(self, app, **options):
    """
    Index every registered model in ``app``, in batches, optionally
    restricted by age and optionally removing documents whose pk no longer
    exists in the database.
    """
    # Cause the default site to load.
    from haystack import site
    from django.db.models import get_models
    from haystack.exceptions import NotRegistered

    if self.site:
        # A dotted path like 'myproject.search_sites.mysite' overrides the
        # default site: import the module, then grab the attribute.
        path_bits = self.site.split('.')
        module_name = '.'.join(path_bits[:-1])
        site_name = path_bits[-1]

        try:
            module = importlib.import_module(module_name)
            site = getattr(module, site_name)
        except (ImportError, NameError):
            # NOTE(review): a bad --site path is silently ignored and the
            # default site is used instead -- presumably intentional
            # best-effort, but worth confirming; a warning here would help.
            pass

    for model in get_models(app):
        try:
            index = site.get_index(model)
        except NotRegistered:
            if self.verbosity >= 2:
                print "Skipping '%s' - no index." % model
            continue

        extra_lookup_kwargs = {}
        updated_field = index.get_updated_field()

        if self.age:
            # Only reindex rows touched within the last `age` hours, when
            # the index declares an updated-timestamp field.
            if updated_field:
                extra_lookup_kwargs['%s__gte' % updated_field] = datetime.datetime.now() - datetime.timedelta(hours=self.age)
            else:
                if self.verbosity >= 2:
                    print "No updated date field found for '%s' - not restricting by age." % model.__name__

        # `.select_related()` seems like a good idea here but can fail on
        # nullable `ForeignKey` as well as what seems like other cases.
        qs = index.get_queryset().filter(**extra_lookup_kwargs).order_by(model._meta.pk.name)
        total = qs.count()

        if self.verbosity >= 1:
            print "Indexing %d %s." % (total, smart_str(model._meta.verbose_name_plural))

        # Pks indexed this run; consulted by the --remove phase below.
        pks_seen = set()

        for start in range(0, total, self.batchsize):
            end = min(start + self.batchsize, total)

            # Get a clone of the QuerySet so that the cache doesn't bloat up
            # in memory. Useful when reindexing large amounts of data.
            small_cache_qs = qs.all()
            current_qs = small_cache_qs[start:end]

            for obj in current_qs:
                pks_seen.add(smart_str(obj.pk))

            if self.verbosity >= 2:
                print " indexing %s - %d of %d." % (start+1, end, total)

            index.backend.update(index, current_qs)

            # Clear out the DB connections queries because it bloats up RAM.
            reset_queries()

        if self.remove:
            if self.age or total <= 0:
                # They're using a reduced set, which may not incorporate
                # all pks. Rebuild the list with everything.
                pks_seen = set()
                qs = index.get_queryset().values_list('pk', flat=True)
                total = qs.count()

                for pk in qs:
                    pks_seen.add(smart_str(pk))

            for start in range(0, total, self.batchsize):
                upper_bound = start + self.batchsize

                # Fetch a list of results.
                # Can't do pk range, because id's are strings (thanks comments
                # & UUIDs!).
                stuff_in_the_index = SearchQuerySet().models(model)[start:upper_bound]

                # Iterate over those results.
                for result in stuff_in_the_index:
                    # Be careful not to hit the DB.
                    if not smart_str(result.pk) in pks_seen:
                        # The id is NOT in the small_cache_qs, issue a delete.
                        if self.verbosity >= 2:
                            print " removing %s." % result.pk
                        index.backend.remove(".".join([result.app_label, result.model_name, result.pk]))