def test_recent_helpful_votes(self):
    """Recent helpful votes are indexed properly."""
    # Create a document and verify it doesn't show up in a
    # query for recent_helpful_votes__gt=0.
    r = revision(is_approved=True, save=True)
    self.refresh()
    eq_(Document.search().filter(
        document_recent_helpful_votes__gt=0).count(), 0)

    # Add an unhelpful vote, it still shouldn't show up.
    helpful_vote(revision=r, helpful=False, save=True)
    r.document.save()  # Votes don't trigger a reindex.
    self.refresh()
    eq_(Document.search().filter(
        document_recent_helpful_votes__gt=0).count(), 0)

    # Add a helpful vote created 31 days ago, it still shouldn't show up.
    created = datetime.now() - timedelta(days=31)
    helpful_vote(revision=r, helpful=True, created=created, save=True)
    r.document.save()  # Votes don't trigger a reindex.
    self.refresh()
    eq_(Document.search().filter(
        document_recent_helpful_votes__gt=0).count(), 0)

    # Add a helpful vote created 29 days ago, it should show up now.
    created = datetime.now() - timedelta(days=29)
    helpful_vote(revision=r, helpful=True, created=created, save=True)
    r.document.save()  # Votes don't trigger a reindex.
    self.refresh()
    eq_(Document.search().filter(
        document_recent_helpful_votes__gt=0).count(), 1)
def document(save=False, **kwargs):
    """Return an empty document with enough stuff filled out that it can
    be saved."""
    defaults = {'category': CATEGORIES[0][0],
                'title': str(datetime.now())}
    defaults.update(kwargs)
    if 'slug' not in kwargs:
        defaults['slug'] = slugify(defaults['title'])
    d = Document(**defaults)
    if save:
        d.save()
    return d
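# Usage sketch (not from the original source; the import path below is an
# assumption about where these test helpers live):
#
#   from wiki.tests import document, revision
#
# document(save=True) persists immediately; with save=False (the default)
# the caller gets an unsaved Document to tweak before saving.
doc = document(title=u'Deleting cookies', save=True)   # saved right away
draft = document(title=u'Work in progress')            # unsaved Document
draft.save()                                           # caller decides when to persist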
def test_add_and_delete(self):
    """Adding a doc should add it to the search index; deleting should
    delete it."""
    doc = document(save=True)
    revision(document=doc, is_approved=True, save=True)
    self.refresh()
    eq_(Document.search().count(), 1)

    doc.delete()
    self.refresh()
    eq_(Document.search().count(), 0)
def test_wiki_no_revisions(self):
    """Don't index documents without approved revisions"""
    # Create a document with no revisions and make sure the
    # document is not in the index.
    doc = document(save=True)
    self.refresh()
    eq_(Document.search().count(), 0)

    # Create a revision that's not approved and make sure the
    # document is still not in the index.
    revision(document=doc, is_approved=False, save=True)
    self.refresh()
    eq_(Document.search().count(), 0)
def test_wiki_keywords(self):
    """Make sure updating keywords updates the index."""
    # Create a document with a revision with no keywords. It
    # shouldn't show up with a document_keywords term query for
    # 'wool' since it has no keywords.
    doc = document(title=u'wool hats')
    doc.save()
    revision(document=doc, is_approved=True, save=True)
    self.refresh()
    eq_(Document.search().query(document_keywords='wool').count(), 0)

    revision(document=doc, is_approved=True, keywords='wool', save=True)
    self.refresh()
    eq_(Document.search().query(document_keywords='wool').count(), 1)
def test_wiki_redirects(self):
    """Make sure we don't index redirects"""
    # First create a revision that doesn't have a redirect and
    # make sure it's in the index.
    doc = document(title=u"wool hats")
    doc.save()
    revision(document=doc, is_approved=True, save=True)
    self.refresh()
    eq_(Document.search().query("wool").count(), 1)

    # Now create a revision that is a redirect and make sure the
    # document is removed from the index.
    revision(document=doc, content=REDIRECT_CONTENT, is_approved=True,
             save=True)
    self.refresh()
    eq_(Document.search().query("wool").count(), 0)
def document(save=False, **kwargs):
    """Return an empty document with enough stuff filled out that it can
    be saved."""
    defaults = {'category': CATEGORIES[0][0],
                'title': str(datetime.now()),
                'is_redirect': 0}
    defaults.update(kwargs)
    if 'slug' not in kwargs:
        defaults['slug'] = slugify(defaults['title'])
    d = Document(**defaults)
    if save:
        d.save()
    return d
def test_document_translate_fallback(self):
    d_en = document(locale='en-US', title=u'How to delete Google Chrome?',
                    save=True)
    invalid_translate = reverse('wiki.document', locale='tr',
                                args=[d_en.slug])
    self.assertEqual(d_en, Document.from_url(invalid_translate))
def document(**kwargs):
    """Return an empty document with enough stuff filled out that it can
    be saved."""
    defaults = {'category': CATEGORIES[0][0],
                'title': str(datetime.now())}
    defaults.update(kwargs)
    if 'slug' not in kwargs:
        defaults['slug'] = slugify(defaults['title'])
    return Document(**defaults)
def _postdoc(request, is_image):
    docpath = u'{0}/{1}'.format(settings.MEDIA_ROOT,
                                is_image and u'images' or u'documents')
    if not os.path.exists(docpath):
        os.mkdir(docpath)
    f = request.FILES[u'file']
    fd = open(u'{0}/{1}'.format(docpath, f.name), u'wb+')
    for chunk in f.chunks():
        fd.write(chunk)
    fd.close()
    url = u'{0}/{1}/{2}'.format(settings.MEDIA_URL,
                                is_image and u'images' or u'documents',
                                f.name)
    try:
        doc = Document.objects.get(path=url)
    except Document.DoesNotExist:
        doc = Document()
    doc.is_image = is_image
    doc.path = url
    doc.wikipath = request.POST[u'page']
    doc.save()
    return HttpResponse(doc.path)
def _es_documents_for(locale, topics, products):
    """ES implementation of documents_for."""
    s = Document.search().values_dict(
        'id', 'document_title', 'url').filter(document_locale=locale)
    for topic in topics:
        s = s.filter(document_topic=topic.slug)
    for product in products or []:
        s = s.filter(document_product=product.slug)

    return list(s.order_by('-document_recent_helpful_votes')[:100])
def test_translations_get_parent_tags(self):
    doc1 = document(title=u'Audio too loud')
    doc1.save()
    revision(document=doc1, is_approved=True, save=True)
    doc1.tags.add(u'desktop')
    doc1.tags.add(u'windows')

    doc2 = document(title=u'Audio too loud bork bork', parent=doc1)
    doc2.save()
    revision(document=doc2, is_approved=True, save=True)
    doc2.tags.add(u'badtag')

    # Verify the parent has the right tags.
    doc_dict = Document.extract_document(doc1.id)
    eq_(doc_dict['tag'], [u'desktop', u'windows'])

    # Verify the translation has the parent's tags.
    doc_dict = Document.extract_document(doc2.id)
    eq_(doc_dict['tag'], [u'desktop', u'windows'])
def suggestions(request):
    """A simple search view that returns OpenSearch suggestions."""
    mimetype = 'application/x-suggestions+json'

    term = request.GET.get('q')
    if not term:
        return HttpResponseBadRequest(mimetype=mimetype)

    site = Site.objects.get_current()
    locale = locale_or_default(request.locale)
    try:
        query = dict(('%s__text' % field, term)
                     for field in Document.get_query_fields())
        wiki_s = (Document.search()
                  .filter(document_is_archived=False)
                  .filter(document_locale=locale)
                  .values_dict('document_title', 'url')
                  .query(or_=query)[:5])

        query = dict(('%s__text' % field, term)
                     for field in Question.get_query_fields())
        question_s = (Question.search()
                      .filter(question_has_helpful=True)
                      .values_dict('question_title', 'url')
                      .query(or_=query)[:5])

        results = list(chain(question_s, wiki_s))
    except (ESTimeoutError, ESMaxRetryError, ESException):
        # If we have ES problems, we just send back an empty result
        # set.
        results = []

    urlize = lambda r: u'https://%s%s' % (site, r['url'])
    titleize = lambda r: (r['document_title'] if 'document_title' in r
                          else r['question_title'])
    data = [term,
            [titleize(r) for r in results],
            [],
            [urlize(r) for r in results]]
    return HttpResponse(json.dumps(data), mimetype=mimetype)
def suggestions(request):
    """A simple search view that returns OpenSearch suggestions."""
    mimetype = 'application/x-suggestions+json'

    term = request.GET.get('q')
    if not term:
        return HttpResponseBadRequest(mimetype=mimetype)

    site = Site.objects.get_current()
    locale = locale_or_default(request.LANGUAGE_CODE)
    try:
        query = dict(('%s__text' % field, term)
                     for field in Document.get_query_fields())
        wiki_s = (Document.search()
                  .filter(document_is_archived=False)
                  .filter(document_locale=locale)
                  .values_dict('document_title', 'url')
                  .query(or_=query)[:5])

        query = dict(('%s__text' % field, term)
                     for field in Question.get_query_fields())
        question_s = (Question.search()
                      .filter(question_has_helpful=True)
                      .values_dict('question_title', 'url')
                      .query(or_=query)[:5])

        results = list(chain(question_s, wiki_s))
    except (ESTimeoutError, ESMaxRetryError, ESException):
        # If we have ES problems, we just send back an empty result
        # set.
        results = []

    urlize = lambda r: u'https://%s%s' % (site, r['url'])
    titleize = lambda r: (r['document_title'] if 'document_title' in r
                          else r['question_title'])
    data = [term,
            [titleize(r) for r in results],
            [],
            [urlize(r) for r in results]]
    return HttpResponse(json.dumps(data), mimetype=mimetype)
def test_wiki_tags(self):
    """Make sure that adding tags to a Document causes it to
    refresh the index.

    """
    tag = u'hiphop'
    eq_(Document.search().filter(tag=tag).count(), 0)
    doc = document(save=True)
    revision(document=doc, is_approved=True, save=True)
    self.refresh()
    eq_(Document.search().filter(tag=tag).count(), 0)
    doc.tags.add(tag)
    self.refresh()
    eq_(Document.search().filter(tag=tag).count(), 1)
    doc.tags.remove(tag)
    self.refresh()

    # Make sure the document itself is still there and that we didn't
    # accidentally delete it through screwed up signal handling:
    eq_(Document.search().filter().count(), 1)

    eq_(Document.search().filter(tag=tag).count(), 0)
def _es_documents_for(locale, topics, products=None):
    """ES implementation of documents_for."""
    s = (Document.search()
         .values_dict('id', 'document_title', 'url', 'document_parent_id')
         .filter(document_locale=locale,
                 document_is_archived=False,
                 document_category__in=settings.IA_DEFAULT_CATEGORIES))
    for topic in topics:
        s = s.filter(document_topic=topic.slug)
    for product in products or []:
        s = s.filter(document_product=product.slug)

    return list(s.order_by('-document_recent_helpful_votes')[:100])
def test_wiki_products(self):
    """Make sure that adding products to a Document causes it to
    refresh the index.

    """
    p = product(slug=u'desktop', save=True)
    eq_(Document.search().filter(product=p.slug).count(), 0)
    doc = document(save=True)
    revision(document=doc, is_approved=True, save=True)
    self.refresh()
    eq_(Document.search().filter(product=p.slug).count(), 0)
    doc.products.add(p)
    self.refresh()
    eq_(Document.search().filter(product=p.slug).count(), 1)
    doc.products.remove(p)
    self.refresh()

    # Make sure the document itself is still there and that we didn't
    # accidentally delete it through screwed up signal handling:
    eq_(Document.search().filter().count(), 1)

    eq_(Document.search().filter(product=p.slug).count(), 0)
def test_wiki_products(self):
    """Make sure that adding products to a Document causes it to
    refresh the index.

    """
    p = product(slug=u'desktop', save=True)
    eq_(Document.search().filter(document_product=p.slug).count(), 0)
    doc = document(save=True)
    revision(document=doc, is_approved=True, save=True)
    self.refresh()
    eq_(Document.search().filter(document_product=p.slug).count(), 0)
    doc.products.add(p)
    self.refresh()
    eq_(Document.search().filter(document_product=p.slug).count(), 1)
    doc.products.remove(p)
    self.refresh()

    # Make sure the document itself is still there and that we didn't
    # accidentally delete it through screwed up signal handling:
    eq_(Document.search().filter().count(), 1)

    eq_(Document.search().filter(document_product=p.slug).count(), 0)
def test_wiki_topics(self):
    """Make sure that adding topics to a Document causes it to
    refresh the index.

    """
    t = topic(slug=u'hiphop', save=True)
    eq_(Document.search().filter(document_topic=t.slug).count(), 0)
    doc = document(save=True)
    revision(document=doc, is_approved=True, save=True)
    self.refresh()
    eq_(Document.search().filter(document_topic=t.slug).count(), 0)
    doc.topics.add(t)
    self.refresh()
    eq_(Document.search().filter(document_topic=t.slug).count(), 1)
    doc.topics.clear()
    self.refresh()

    # Make sure the document itself is still there and that we didn't
    # accidentally delete it through screwed up signal handling:
    eq_(Document.search().filter().count(), 1)

    eq_(Document.search().filter(document_topic=t.slug).count(), 0)
def test_translations_get_parent_tags(self):
    doc1 = document(title=u'Audio too loud')
    doc1.save()
    revision(document=doc1, is_approved=True, save=True)
    doc1.topics.add(topic(slug='cookies', save=True))
    doc1.topics.add(topic(slug='general', save=True))
    doc1.products.add(product(slug='desktop', save=True))

    doc2 = document(title=u'Audio too loud bork bork', parent=doc1)
    doc2.save()
    revision(document=doc2, is_approved=True, save=True)
    doc2.tags.add(u'badtag')

    # Verify the parent has the right tags.
    doc_dict = Document.extract_document(doc1.id)
    eq_(doc_dict['document_topic'], [u'cookies', u'general'])
    eq_(doc_dict['document_product'], [u'desktop'])

    # Verify the translation has the parent's tags.
    doc_dict = Document.extract_document(doc2.id)
    eq_(doc_dict['document_topic'], [u'cookies', u'general'])
    eq_(doc_dict['document_product'], [u'desktop'])
def test_translations_get_parent_tags(self):
    doc1 = document(title=u'Audio too loud')
    doc1.save()
    revision(document=doc1, is_approved=True, save=True)
    doc1.topics.add(topic(slug='cookies', save=True))
    doc1.topics.add(topic(slug='general', save=True))
    doc1.products.add(product(slug='desktop', save=True))

    doc2 = document(title=u'Audio too loud bork bork', parent=doc1)
    doc2.save()
    revision(document=doc2, is_approved=True, save=True)
    doc2.tags.add(u'badtag')

    # Verify the parent has the right tags.
    doc_dict = Document.extract_document(doc1.id)
    eq_(doc_dict['topic'], [u'cookies', u'general'])
    eq_(doc_dict['product'], [u'desktop'])

    # Verify the translation has the parent's tags.
    doc_dict = Document.extract_document(doc2.id)
    eq_(doc_dict['topic'], [u'cookies', u'general'])
    eq_(doc_dict['product'], [u'desktop'])
class UntranslatedReadout(Readout):
    title = _lazy(u'Untranslated')
    short_title = _lazy(u'Untranslated')
    details_link_text = _lazy(u'All untranslated articles...')
    slug = 'untranslated'
    column4_label = _lazy(u'Updated')

    def _query_and_params(self, max):
        # Incidentally, we tried this both as a left join and as a search
        # against an inner query returning translated docs, and the left join
        # yielded a faster-looking plan (on a production corpus).
        #
        # Find non-archived, localizable documents in categories 10,
        # 20 and 60 having at least one ready-for-localization
        # revision. Of those, show the ones that have no translation.
        return ('SELECT parent.slug, parent.title, '
                'wiki_revision.reviewed, dashboards_wikidocumentvisits.visits '
                'FROM wiki_document parent '
                'INNER JOIN wiki_revision ON '
                'parent.latest_localizable_revision_id=wiki_revision.id '
                'LEFT JOIN wiki_document translated ON '
                'parent.id=translated.parent_id AND translated.locale=%s '
                'LEFT JOIN dashboards_wikidocumentvisits ON '
                'parent.id=dashboards_wikidocumentvisits.document_id AND '
                'dashboards_wikidocumentvisits.period=%s '
                'WHERE '
                'translated.id IS NULL AND parent.is_localizable AND '
                'parent.category in (10, 20, 60) AND '
                'parent.locale=%s AND NOT parent.is_archived '
                'AND wiki_revision.content NOT LIKE "REDIRECT%%" '
                + self._order_clause() + self._limit_clause(max),
                (self.locale, LAST_30_DAYS, settings.WIKI_DEFAULT_LANGUAGE))

    def _order_clause(self):
        return ('ORDER BY wiki_revision.reviewed DESC, parent.title ASC'
                if self.mode == MOST_RECENT
                else 'ORDER BY dashboards_wikidocumentvisits.visits DESC, '
                     'parent.title ASC')

    def _format_row(self, (slug, title, reviewed, visits)):
        # Run the data through the model to (potentially) format it and
        # take advantage of SPOTs (like for get_absolute_url()):
        d = Document(slug=slug, title=title,
                     locale=settings.WIKI_DEFAULT_LANGUAGE)
        return dict(title=d.title,
                    url=d.get_absolute_url(),
                    visits=visits,
                    updated=reviewed)
def reload_period_from_analytics(cls, period):
    """Replace the stats for the given period from Google Analytics."""
    counts = googleanalytics.pageviews_by_document(*period_dates(period))
    if counts:
        # Delete and remake the rows:
        # Horribly inefficient until
        # http://code.djangoproject.com/ticket/9519 is fixed.
        cls.objects.filter(period=period).delete()
        for doc_id, visits in counts.iteritems():
            cls.objects.create(document=Document(pk=doc_id), visits=visits,
                               period=period)
    else:
        # Don't erase interesting data if there's nothing to replace it:
        log.warning('Google Analytics returned no interesting data,'
                    ' so I kept what I had.')
def reload_period_from_json(cls, period, json_data):
    """Replace the stats for the given period with the given JSON."""
    counts = cls._visit_counts(json_data)
    if counts:
        # Delete and remake the rows:
        # Horribly inefficient until
        # http://code.djangoproject.com/ticket/9519 is fixed.
        cls.objects.filter(period=period).delete()
        for doc_id, visits in counts.iteritems():
            cls.objects.create(document=Document(pk=doc_id), visits=visits,
                               period=period)
    else:
        # Don't erase interesting data if there's nothing to replace it:
        log.warning('WebTrends returned no interesting data, so I kept '
                    'what I had.')
def _es_documents_for(locale, topics=None, products=None):
    """ES implementation of documents_for."""
    s = (Document.search().values_dict(
            'id', 'document_title', 'url', 'document_parent_id',
            'document_summary')
         .filter(document_locale=locale,
                 document_is_archived=False,
                 document_category__in=settings.IA_DEFAULT_CATEGORIES))

    for topic in topics or []:
        s = s.filter(topic=topic.slug)
    for product in products or []:
        s = s.filter(product=product.slug)

    return list(s.order_by('-document_recent_helpful_votes')[:100])
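# Usage sketch (assumed, not part of the original source): calling the
# ES-backed helper above with Topic/Product instances pulled from the ORM.
# Each hit is a values_dict() row, so fields are read as dict keys.
topics = list(Topic.objects.filter(slug__in=['cookies', 'general']))
products = list(Product.objects.filter(slug='desktop'))
docs = _es_documents_for(locale='en-US', topics=topics, products=products)
for d in docs:
    print d['document_title'], d['url']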
def pageviews_by_document(start_date, end_date):
    """Return the number of pageviews by document in a given date range.

    * Only returns en-US documents for now since that's what we did
      with webtrends.

    Returns a dict with pageviews for each document:
        {<document_id>: <pageviews>,
         1: 42,
         7: 1337,...}
    """
    counts = {}
    request = _build_request()
    start_index = 1
    max_results = 10000

    while True:  # To deal with pagination

        @retry_503
        def _make_request():
            return request.get(
                ids='ga:' + profile_id,
                start_date=str(start_date),
                end_date=str(end_date),
                metrics='ga:pageviews',
                dimensions='ga:pagePath',
                filters='ga:pagePathLevel2==/kb/;ga:pagePathLevel1==/en-US/',
                max_results=max_results,
                start_index=start_index).execute()

        results = _make_request()

        for result in results['rows']:
            path = result[0]
            pageviews = int(result[1])
            doc = Document.from_url(path, id_only=True, check_host=False)
            if not doc:
                continue

            # The same document can appear multiple times due to url params.
            counts[doc.pk] = counts.get(doc.pk, 0) + pageviews

        # Move to next page of results.
        start_index += max_results
        if start_index > results['totalResults']:
            break

    return counts
def troubleshooting_view(request):
    # Build a list of the most recently indexed 50 wiki documents.
    last_50_indexed = list(
        _fix_value_dicts(
            Document.search().values_dict().order_by('-indexed_on')[:50]))

    last_50_reviewed = list(
        Document.uncached.filter(current_revision__is_approved=True).order_by(
            '-current_revision__reviewed')[:50])

    diff_list = diff_it_for_realz(last_50_indexed, last_50_reviewed)

    return render_to_response('search/admin/troubleshooting.html', {
        'title': 'Index Troubleshooting',
        'diffs': diff_list,
    }, RequestContext(request, {}))
def _create_document(title='Test Document'):
    d = Document(title=title, html='<div>Lorem Ipsum</div>',
                 category=1, locale='en-US')
    d.save()
    r = Revision(document=d, keywords='key1, key2', summary='lipsum',
                 content='<div>Lorem Ipsum</div>', creator_id=118577,
                 significance=SIGNIFICANCES[0][0])
    r.save()
    d.current_revision = r
    d.save()
    return d
def products_for(topics):
    """Returns a list of products that apply to passed in topics.

    :arg topics: a list of Topic instances
    """
    product_field = 'document_product'

    s = Document.search().values_dict('id')
    for topic in topics:
        s = s.filter(document_topic=topic.slug)
    s = s.facet(product_field, filtered=True)
    facet_counts = s.facet_counts()[product_field]

    products = Product.objects.filter(
        slug__in=[f['term'] for f in facet_counts]).filter(visible=True)
    return products
def topics_for(products):
    """Returns a list of topics that apply to passed in products.

    :arg products: a list of Product instances
    """
    topic_field = 'document_topic'

    s = Document.search().values_dict('id')
    for product in products:
        s = s.filter(document_product=product.slug)
    s = s.facet(topic_field, filtered=True)
    facet_counts = s.facet_counts()[topic_field]

    topics = Topic.objects.filter(
        slug__in=[f['term'] for f in facet_counts]).filter(visible=True)
    return topics
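# Usage sketch (assumed, not from the original source): products_for() and
# topics_for() are complementary facet helpers, so a navigation view can
# narrow each axis by the current selection on the other one. Both return
# querysets, so further filtering chains normally.
selected_products = list(Product.objects.filter(slug='desktop'))
visible_topics = topics_for(products=selected_products)
selected_topics = list(visible_topics.filter(slug='cookies'))
visible_products = products_for(topics=selected_topics)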
def troubleshooting_view(request):
    # Build a list of the most recently indexed 50 wiki documents.
    last_50_indexed = _fix_value_dicts(Document.search()
                                       .values_dict()
                                       .order_by('-indexed_on')[:50])

    last_50_reviewed = (Document.uncached
                        .filter(current_revision__is_approved=True)
                        .order_by('-current_revision__reviewed')[:50])

    return render_to_response(
        'search/admin/troubleshooting.html',
        {'title': 'Index Troubleshooting',
         'last_50_indexed': last_50_indexed,
         'last_50_reviewed': last_50_reviewed},
        RequestContext(request, {}))
def _visit_counts(cls, json_data):
    """Given WebTrends JSON data, return a dict of doc IDs and visits:

        {document ID: number of visits, ...}

    If there is no interesting data in the given JSON, return {}.

    """
    # We're very defensive here, as WebTrends has been known to return
    # invalid garbage of various sorts.
    try:
        data = json.loads(json_data)['data']
    except (ValueError, KeyError, TypeError):
        raise StatsException('Error extracting data from WebTrends JSON')

    try:
        pages = (data[data.keys()[0]]['SubRows'] if data.keys()
                 else {}).iteritems()
    except (AttributeError, IndexError, KeyError, TypeError):
        raise StatsException('Error extracting pages from WebTrends data')

    counts = {}
    for url, page_info in pages:
        doc = Document.from_url(
            url,
            required_locale=settings.LANGUAGE_CODE,
            id_only=True,
            check_host=False)
        if not doc:
            continue

        # Get visit count:
        try:
            visits = int(page_info['measures']['Visits'])
        except (ValueError, KeyError, TypeError):
            continue

        # Sometimes WebTrends repeats a URL modulo a space, etc. These can
        # resolve to the same document. An arbitrary one wins.
        # TODO: Should we be summing these?
        if doc.pk in counts:
            log.info('WebTrends has the following duplicate URL for this '
                     'document: %s' % url)
        counts[doc.pk] = visits
    return counts
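# Illustrative payload (inferred from the parsing logic above, not a real
# WebTrends response): a top-level 'data' object whose first entry carries a
# 'SubRows' mapping of URL -> {'measures': {'Visits': ...}}.
sample_json = json.dumps({
    'data': {
        'report_key': {
            'SubRows': {
                '/en-US/kb/how-to-clear-cookies': {
                    'measures': {'Visits': '1234'},
                },
            },
        },
    },
})
# WikiDocumentVisits._visit_counts(sample_json) would then map the document
# resolved from that URL to 1234 visits (the class name is an assumption;
# the method above is written against `cls`).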
def troubleshooting_view(request):
    # Build a list of the most recently indexed 50 wiki documents.
    last_50_indexed = list(_fix_value_dicts(Document.search()
                                            .values_dict()
                                            .order_by('-indexed_on')[:50]))

    last_50_reviewed = list(Document.uncached
                            .filter(current_revision__is_approved=True)
                            .order_by('-current_revision__reviewed')[:50])

    diff_list = diff_it_for_realz(last_50_indexed, last_50_reviewed)

    return render(
        request,
        'admin/search_troubleshooting.html',
        {'title': 'Index Troubleshooting',
         'diffs': diff_list,
         })
def documents_for(locale, topics, products=None):
    """Returns a list of articles that apply to passed in topics and products.

    :arg locale: the locale
    :arg topics: a list of Topic instances
    :arg products: (optional) a list of Product instances

    The articles are returned as a list of dicts with the following keys:
        id
        document_title
        url
    """
    s = Document.search().values_dict('id', 'document_title', 'url')
    for topic in topics:
        s = s.filter(document_topic=topic.slug)
    for product in products or []:
        s = s.filter(document_product=product.slug)

    return list(s)
def _search_suggestions(request, text, locale, tags, product_slugs):
    """Return an iterable of the most relevant wiki pages and questions.

    :arg text: full text to search on
    :arg locale: locale to limit to
    :arg tags: list of tags to filter questions on
    :arg product_slugs: list of product slugs to filter articles on
        (["desktop", "mobile", ...])

    Items are dicts of::

        {
            'type':
            'search_summary':
            'title':
            'url':
            'object':
        }

    :returns: up to 3 wiki pages, then up to 3 questions.

    """
    # TODO: this can be reworked to pull data from ES rather than
    # hit the db.
    question_s = Question.search()
    wiki_s = Document.search()

    # Max number of search results per type.
    WIKI_RESULTS = QUESTIONS_RESULTS = 3
    default_categories = settings.SEARCH_DEFAULT_CATEGORIES

    # Apply product filters
    if product_slugs:
        wiki_s = wiki_s.filter(document_product__in=product_slugs)
    if tags:
        question_s = question_s.filter(question_tag__in=tags)

    results = []
    try:
        query = dict(('%s__text' % field, text)
                     for field in Document.get_query_fields())
        raw_results = (
            wiki_s.filter(document_locale=locale,
                          document_category__in=default_categories)
                  .query(or_=query)
                  .values_dict('id')[:WIKI_RESULTS])
        for r in raw_results:
            try:
                doc = (Document.objects.select_related('current_revision')
                       .get(pk=r['id']))
                results.append({
                    'search_summary': clean_excerpt(
                        doc.current_revision.summary),
                    'url': doc.get_absolute_url(),
                    'title': doc.title,
                    'type': 'document',
                    'object': doc,
                })
            except Document.DoesNotExist:
                pass

        # Note: Questions app is en-US only.
        query = dict(('%s__text' % field, text)
                     for field in Question.get_query_fields())
        raw_results = (question_s.query(or_=query)
                       .values_dict('id')[:QUESTIONS_RESULTS])
        for r in raw_results:
            try:
                q = Question.objects.get(pk=r['id'])
                results.append({
                    'search_summary': clean_excerpt(q.content[0:500]),
                    'url': q.get_absolute_url(),
                    'title': q.title,
                    'type': 'question',
                    'object': q,
                    'is_solved': q.is_solved,
                    'num_answers': q.num_answers,
                    'num_votes': q.num_votes,
                    'num_votes_past_week': q.num_votes_past_week
                })
            except Question.DoesNotExist:
                pass

    except (ESTimeoutError, ESMaxRetryError, ESException) as exc:
        if isinstance(exc, ESTimeoutError):
            statsd.incr('questions.suggestions.timeouterror')
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('questions.suggestions.maxretryerror')
        elif isinstance(exc, ESException):
            statsd.incr('questions.suggestions.elasticsearchexception')
        log.debug(exc)

    return results
def _search_suggestions(request, query, locale, tags, product_slugs):
    """Return an iterable of the most relevant wiki pages and questions.

    query -- full text to search on
    locale -- locale to limit to
    tags -- list of tags to filter questions on
    product_slugs -- list of product slugs to filter articles on
        (["desktop", "mobile", ...])

    Items are dicts of:
        {
            'type':
            'search_summary':
            'title':
            'url':
            'object':
        }

    Returns up to 3 wiki pages, then up to 3 questions.

    """
    # TODO: this can be reworked to pull data from ES rather than
    # hit the db.
    question_s = Question.search()
    wiki_s = Document.search()

    # Max number of search results per type.
    WIKI_RESULTS = QUESTIONS_RESULTS = 3
    default_categories = settings.SEARCH_DEFAULT_CATEGORIES

    # Apply product filters
    if product_slugs:
        wiki_s = wiki_s.filter(document_product__in=product_slugs)
    if tags:
        question_s = question_s.filter(question_tag__in=tags)

    try:
        raw_results = (wiki_s.filter(
                document_locale=locale,
                document_category__in=default_categories)
            .query(query)
            .values_dict('id')[:WIKI_RESULTS])

        results = []
        for r in raw_results:
            try:
                doc = (Document.objects.select_related('current_revision')
                       .get(pk=r['id']))
                results.append({
                    'search_summary': clean_excerpt(
                        doc.current_revision.summary),
                    'url': doc.get_absolute_url(),
                    'title': doc.title,
                    'type': 'document',
                    'object': doc,
                })
            except Document.DoesNotExist:
                pass

        # Note: Questions app is en-US only.
        raw_results = (
            question_s.query(query).values_dict('id')[:QUESTIONS_RESULTS])
        for r in raw_results:
            try:
                q = Question.objects.get(pk=r['id'])
                results.append({
                    'search_summary': clean_excerpt(q.content[0:500]),
                    'url': q.get_absolute_url(),
                    'title': q.title,
                    'type': 'question',
                    'object': q,
                    'is_solved': q.is_solved,
                    'num_answers': q.num_answers,
                    'num_votes': q.num_votes,
                    'num_votes_past_week': q.num_votes_past_week
                })
            except Question.DoesNotExist:
                pass

    except (ESTimeoutError, ESMaxRetryError, ESException), exc:
        if isinstance(exc, ESTimeoutError):
            statsd.incr('questions.suggestions.timeouterror')
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('questions.suggestions.maxretryerror')
        elif isinstance(exc, ESException):
            statsd.incr('questions.suggestions.elasticsearchexception')
        return []

    # Success path: hand back the collected wiki and question results.
    return results
def __iter__(self):
    from wiki.models import Document

    input = html5lib_Filter.__iter__(self)

    # Pass #1: Gather all the link URLs and prepare annotations
    links = dict()
    buffer = []
    for token in input:
        buffer.append(token)
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if not 'href' in attrs:
                continue
            href = attrs['href']
            if href.startswith(self.base_url):
                # Squash site-absolute URLs to site-relative paths.
                href = '/%s' % href[len(self.base_url):]
            # Prepare annotations record for this path.
            links[href] = dict(classes=[])

    # Run through all the links and check for annotatable conditions.
    for href in links.keys():

        # Is this an external URL?
        is_external = False
        for prefix in self.EXTERNAL_PREFIXES:
            if href.startswith(prefix):
                is_external = True
                break
        if is_external:
            links[href]['classes'].append('external')
            continue

        # TODO: Should this also check for old-school mindtouch URLs? Or
        # should we encourage editors to convert to new-style URLs to take
        # advantage of link annotation? (I'd say the latter)

        # Is this a kuma doc URL?
        if '/docs/' in href:

            # Check if this is a special docs path that's exempt from "new"
            skip = False
            for path in DOC_SPECIAL_PATHS:
                if '/docs/%s' % path in href:
                    skip = True
            if skip:
                continue

            href_locale, href_path = href.split(u'/docs/', 1)
            if href_locale.startswith(u'/'):
                href_locale = href_locale[1:]

            if '#' in href_path:
                # If present, discard the hash anchor
                href_path, _, _ = href_path.partition('#')

            # Handle any URL-encoded UTF-8 characters in the path
            href_path = href_path.encode('utf-8', 'ignore')
            href_path = urllib.unquote(href_path)
            href_path = href_path.decode('utf-8', 'ignore')

            # Try to sort out the locale and slug through some of our
            # redirection logic.
            locale, slug, needs_redirect = (
                Document.locale_and_slug_from_path(
                    href_path, path_locale=href_locale))

            # Does this locale and slug correspond to an existing document?
            # If not, mark it as a "new" link.
            #
            # TODO: Should these DB queries be batched up into one big
            # query? A page with hundreds of links will fire off hundreds
            # of queries
            ct = Document.objects.filter(locale=locale, slug=slug).count()
            if ct == 0:
                links[href]['classes'].append('new')

    # Pass #2: Filter the content, annotating links
    for token in buffer:
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if 'href' in attrs:
                href = attrs['href']
                if href.startswith(self.base_url):
                    # Squash site-absolute URLs to site-relative paths.
                    href = '/%s' % href[len(self.base_url):]
                if href in links:
                    # Update class names on this link element.
                    if 'class' in attrs:
                        classes = set(attrs['class'].split(u' '))
                    else:
                        classes = set()
                    classes.update(links[href]['classes'])
                    if classes:
                        attrs['class'] = u' '.join(classes)
                        token['data'] = attrs.items()
        yield token
def __iter__(self):
    from wiki.models import Document

    input = html5lib_Filter.__iter__(self)

    # Pass #1: Gather all the link URLs and prepare annotations
    links = dict()
    buffer = []
    for token in input:
        buffer.append(token)
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if not 'href' in attrs:
                continue
            href = attrs['href']
            if href.startswith(self.base_url):
                # Squash site-absolute URLs to site-relative paths.
                href = '/%s' % href[len(self.base_url):]
            # Prepare annotations record for this path.
            links[href] = dict(classes=[])

    needs_existence_check = defaultdict(lambda: defaultdict(set))

    # Run through all the links and check for annotatable conditions.
    for href in links.keys():

        # Is this an external URL?
        is_external = False
        for prefix in self.EXTERNAL_PREFIXES:
            if href.startswith(prefix):
                is_external = True
                break
        if is_external:
            links[href]['classes'].append('external')
            continue

        # TODO: Should this also check for old-school mindtouch URLs? Or
        # should we encourage editors to convert to new-style URLs to take
        # advantage of link annotation? (I'd say the latter)

        # Is this a kuma doc URL?
        if '/docs/' in href:

            # Check if this is a special docs path that's exempt from "new"
            skip = False
            for path in DOC_SPECIAL_PATHS:
                if '/docs/%s' % path in href:
                    skip = True
            if skip:
                continue

            href_locale, href_path = href.split(u'/docs/', 1)
            if href_locale.startswith(u'/'):
                href_locale = href_locale[1:]

            if '#' in href_path:
                # If present, discard the hash anchor
                href_path, _, _ = href_path.partition('#')

            # Handle any URL-encoded UTF-8 characters in the path
            href_path = href_path.encode('utf-8', 'ignore')
            href_path = urllib.unquote(href_path)
            href_path = href_path.decode('utf-8', 'ignore')

            # Try to sort out the locale and slug through some of our
            # redirection logic.
            locale, slug, needs_redirect = (
                Document.locale_and_slug_from_path(
                    href_path, path_locale=href_locale))

            # Gather up this link for existence check
            needs_existence_check[locale.lower()][slug.lower()].add(href)

    # Perform existence checks for all the links, using one DB query per
    # locale for all the candidate slugs.
    for locale, slug_hrefs in needs_existence_check.items():

        existing_slugs = (Document.objects.filter(
                locale=locale, slug__in=slug_hrefs.keys())
            .values_list('slug', flat=True))

        # Remove the slugs that pass existence check.
        for slug in existing_slugs:
            lslug = slug.lower()
            if lslug in slug_hrefs:
                del slug_hrefs[lslug]

        # Mark all the links whose slugs did not come back from the DB
        # query as "new"
        for slug, hrefs in slug_hrefs.items():
            for href in hrefs:
                links[href]['classes'].append('new')

    # Pass #2: Filter the content, annotating links
    for token in buffer:
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if 'href' in attrs:
                href = attrs['href']
                if href.startswith(self.base_url):
                    # Squash site-absolute URLs to site-relative paths.
                    href = '/%s' % href[len(self.base_url):]
                if href in links:
                    # Update class names on this link element.
                    if 'class' in attrs:
                        classes = set(attrs['class'].split(u' '))
                    else:
                        classes = set()
                    classes.update(links[href]['classes'])
                    if classes:
                        attrs['class'] = u' '.join(classes)
                        token['data'] = attrs.items()
        yield token
def reindex_kb():
    """Reindex wiki_document."""
    index_task.delay(Document, Document.get_indexable())
class UntranslatedReadout(Readout):
    title = _lazy(u'Untranslated')
    description = _lazy(
        u'This indicates there are no approved translations of these articles. '
        'Some of the articles may have proposed translations waiting to be '
        'reviewed and will appear in the Unreviewed Changes section as well.')
    short_title = _lazy(u'Untranslated')
    details_link_text = _lazy(u'All untranslated articles...')
    slug = 'untranslated'
    column4_label = _lazy(u'Updated')

    def _query_and_params(self, max):
        # Filter by product if specified.
        if self.product:
            extra_joins = PRODUCT_FILTER
            params = (self.locale, LAST_30_DAYS, self.product.id,
                      settings.WIKI_DEFAULT_LANGUAGE)
        else:
            extra_joins = ''
            params = (self.locale, LAST_30_DAYS,
                      settings.WIKI_DEFAULT_LANGUAGE)

        # Incidentally, we tried this both as a left join and as a search
        # against an inner query returning translated docs, and the left join
        # yielded a faster-looking plan (on a production corpus).
        #
        # Find non-archived, localizable documents in categories 10,
        # 20 and 60 having at least one ready-for-localization
        # revision. Of those, show the ones that have no translation.
        query = (
            'SELECT engdoc.slug, engdoc.title, '
            'wiki_revision.reviewed, dashboards_wikidocumentvisits.visits '
            'FROM wiki_document engdoc '
            'INNER JOIN wiki_revision ON '
            'engdoc.latest_localizable_revision_id=wiki_revision.id '
            'LEFT JOIN wiki_document translated ON '
            'engdoc.id=translated.parent_id AND translated.locale=%s '
            'LEFT JOIN dashboards_wikidocumentvisits ON '
            'engdoc.id=dashboards_wikidocumentvisits.document_id AND '
            'dashboards_wikidocumentvisits.period=%s '
            + extra_joins +
            'WHERE '
            '(translated.id IS NULL OR translated.current_revision_id IS NULL) '
            'AND engdoc.is_localizable AND '
            'engdoc.category in (10, 20, 60) AND '
            'engdoc.locale=%s AND NOT engdoc.is_archived '
            'AND wiki_revision.content NOT LIKE "REDIRECT%%" '
            + self._order_clause() + self._limit_clause(max))
        return query, params

    def _order_clause(self):
        return ('ORDER BY wiki_revision.reviewed DESC, engdoc.title ASC'
                if self.mode == MOST_RECENT
                else 'ORDER BY dashboards_wikidocumentvisits.visits DESC, '
                     'engdoc.title ASC')

    def _format_row(self, (slug, title, reviewed, visits)):
        # Run the data through the model to (potentially) format it and
        # take advantage of SPOTs (like for get_absolute_url()):
        d = Document(slug=slug, title=title,
                     locale=settings.WIKI_DEFAULT_LANGUAGE)
        return dict(title=d.title,
                    url=d.get_absolute_url(),
                    visits=visits,
                    updated=reviewed)
def search(request, template=None):
    """ES-specific search view"""

    if (waffle.flag_is_active(request, 'esunified') or
            request.GET.get('esunified')):
        return search_with_es_unified(request, template)

    start = time.time()

    # JSON-specific variables
    is_json = (request.GET.get('format') == 'json')
    callback = request.GET.get('callback', '').strip()
    mimetype = 'application/x-javascript' if callback else 'application/json'

    # Search "Expires" header format
    expires_fmt = '%A, %d %B %Y %H:%M:%S GMT'

    # Check callback is valid
    if is_json and callback and not jsonp_is_valid(callback):
        return HttpResponse(
            json.dumps({'error': _('Invalid callback function.')}),
            mimetype=mimetype, status=400)

    language = locale_or_default(request.GET.get('language', request.locale))
    r = request.GET.copy()
    a = request.GET.get('a', '0')

    # Search default values
    try:
        category = (map(int, r.getlist('category')) or
                    settings.SEARCH_DEFAULT_CATEGORIES)
    except ValueError:
        category = settings.SEARCH_DEFAULT_CATEGORIES
    r.setlist('category', category)

    # Basic form
    if a == '0':
        r['w'] = r.get('w', constants.WHERE_BASIC)
    # Advanced form
    if a == '2':
        r['language'] = language
        r['a'] = '1'

    # TODO: Rewrite so SearchForm is unbound initially and we can use
    # `initial` on the form fields.
    if 'include_archived' not in r:
        r['include_archived'] = False

    search_form = SearchForm(r)

    if not search_form.is_valid() or a == '2':
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Invalid search data.')}),
                mimetype=mimetype,
                status=400)

        t = template if request.MOBILE else 'search/form.html'
        search_ = jingo.render(request, t,
                               {'advanced': a, 'request': request,
                                'search_form': search_form})
        search_['Cache-Control'] = 'max-age=%s' % \
                                   (settings.SEARCH_CACHE_PERIOD * 60)
        search_['Expires'] = (datetime.utcnow() +
                              timedelta(
                                  minutes=settings.SEARCH_CACHE_PERIOD)) \
                              .strftime(expires_fmt)
        return search_

    cleaned = search_form.cleaned_data

    page = max(smart_int(request.GET.get('page')), 1)
    offset = (page - 1) * settings.SEARCH_RESULTS_PER_PAGE

    lang = language.lower()
    if settings.LANGUAGES.get(lang):
        lang_name = settings.LANGUAGES[lang]
    else:
        lang_name = ''

    wiki_s = Document.search()
    question_s = Question.search()
    discussion_s = Thread.search()

    # wiki filters
    # Category filter
    if cleaned['category']:
        wiki_s = wiki_s.filter(document_category__in=cleaned['category'])

    # Locale filter
    wiki_s = wiki_s.filter(document_locale=language)

    # Product filter
    products = cleaned['product']
    for p in products:
        wiki_s = wiki_s.filter(document_tag=p)

    # Tags filter
    tags = [t.strip() for t in cleaned['tags'].split()]
    for t in tags:
        wiki_s = wiki_s.filter(document_tag=t)

    # Archived bit
    if a == '0' and not cleaned['include_archived']:
        # Default to NO for basic search:
        cleaned['include_archived'] = False
    if not cleaned['include_archived']:
        wiki_s = wiki_s.filter(document_is_archived=False)
    # End of wiki filters

    # Support questions specific filters
    if cleaned['w'] & constants.WHERE_SUPPORT:
        # Solved is set by default if using basic search
        if a == '0' and not cleaned['has_helpful']:
            cleaned['has_helpful'] = constants.TERNARY_YES

        # These filters are ternary, they can be either YES, NO, or OFF
        ternary_filters = ('is_locked', 'is_solved', 'has_answers',
                           'has_helpful')
        d = dict(('question_%s' % filter_name,
                  _ternary_filter(cleaned[filter_name]))
                 for filter_name in ternary_filters
                 if cleaned[filter_name])
        if d:
            question_s = question_s.filter(**d)

        if cleaned['asked_by']:
            question_s = question_s.filter(
                question_creator=cleaned['asked_by'])

        if cleaned['answered_by']:
            question_s = question_s.filter(
                question_answer_creator=cleaned['answered_by'])

        q_tags = [t.strip() for t in cleaned['q_tags'].split(',')]
        for t in q_tags:
            if t:
                question_s = question_s.filter(question_tag=t)

    # Discussion forum specific filters
    if cleaned['w'] & constants.WHERE_DISCUSSION:
        if cleaned['author']:
            discussion_s = discussion_s.filter(
                post_author_ord=cleaned['author'])

        if cleaned['thread_type']:
            if constants.DISCUSSION_STICKY in cleaned['thread_type']:
                discussion_s = discussion_s.filter(post_is_sticky=1)

            if constants.DISCUSSION_LOCKED in cleaned['thread_type']:
                discussion_s = discussion_s.filter(post_is_locked=1)

        if cleaned['forum']:
            discussion_s = discussion_s.filter(
                post_forum_id__in=cleaned['forum'])

    # Filters common to support and discussion forums
    # Created filter
    unix_now = int(time.time())
    interval_filters = (
        ('created', cleaned['created'], cleaned['created_date']),
        ('updated', cleaned['updated'], cleaned['updated_date']))
    for filter_name, filter_option, filter_date in interval_filters:
        if filter_option == constants.INTERVAL_BEFORE:
            before = {filter_name + '__gte': 0,
                      filter_name + '__lte': max(filter_date, 0)}
            discussion_s = discussion_s.filter(**before)
            question_s = question_s.filter(**before)
        elif filter_option == constants.INTERVAL_AFTER:
            after = {filter_name + '__gte': min(filter_date, unix_now),
                     filter_name + '__lte': unix_now}
            discussion_s = discussion_s.filter(**after)
            question_s = question_s.filter(**after)

    # Note: num_voted (with a d) is a different field than num_votes
    # (with an s). The former is a dropdown and the latter is an
    # integer value.
    if cleaned['num_voted'] == constants.INTERVAL_BEFORE:
        question_s = question_s.filter(
            question_num_votes__lte=max(cleaned['num_votes'], 0))
    elif cleaned['num_voted'] == constants.INTERVAL_AFTER:
        question_s = question_s.filter(
            question_num_votes__gte=cleaned['num_votes'])

    # Done with all the filtery stuff--time to generate results

    documents = ComposedList()
    sortby = smart_int(request.GET.get('sortby'))
    try:
        max_results = settings.SEARCH_MAX_RESULTS
        cleaned_q = cleaned['q']

        if cleaned['w'] & constants.WHERE_WIKI:
            if cleaned_q:
                wiki_s = wiki_s.query(cleaned_q)

            # For a front-page non-advanced search, we want to cap the kb
            # at 10 results.
            if a == '0':
                wiki_max_results = 10
            else:
                wiki_max_results = max_results
            documents.set_count(('wiki', wiki_s),
                                min(wiki_s.count(), wiki_max_results))

        if cleaned['w'] & constants.WHERE_SUPPORT:
            # Sort results by
            try:
                question_s = question_s.order_by(
                    *constants.SORT_QUESTIONS[sortby])
            except IndexError:
                pass

            question_s = question_s.highlight(
                'question_title',
                'question_content',
                'question_answer_content',
                before_match='<b>',
                after_match='</b>',
                limit=settings.SEARCH_SUMMARY_LENGTH)

            if cleaned_q:
                question_s = question_s.query(cleaned_q)
            documents.set_count(('question', question_s),
                                min(question_s.count(), max_results))

        if cleaned['w'] & constants.WHERE_DISCUSSION:
            discussion_s = discussion_s.highlight(
                'discussion_content',
                before_match='<b>',
                after_match='</b>',
                limit=settings.SEARCH_SUMMARY_LENGTH)

            if cleaned_q:
                discussion_s = discussion_s.query(cleaned_q)
            documents.set_count(('forum', discussion_s),
                                min(discussion_s.count(), max_results))

        results_per_page = settings.SEARCH_RESULTS_PER_PAGE
        pages = paginate(request, documents, results_per_page)
        num_results = len(documents)

        # Get the documents we want to show and add them to
        # docs_for_page.
        documents = documents[offset:offset + results_per_page]
        docs_for_page = []
        for (kind, search_s), bounds in documents:
            search_s = search_s.values_dict()[bounds[0]:bounds[1]]
            docs_for_page += [(kind, doc) for doc in search_s]

        results = []
        for i, docinfo in enumerate(docs_for_page):
            rank = i + offset
            # Type here is something like 'wiki', ... while doc here
            # is an ES result document.
            type_, doc = docinfo

            if type_ == 'wiki':
                summary = doc['document_summary']
                result = {
                    'url': doc['url'],
                    'title': doc['document_title'],
                    'type': 'document',
                    'object': ObjectDict(doc)}
            elif type_ == 'question':
                summary = _build_es_excerpt(doc)
                result = {
                    'url': doc['url'],
                    'title': doc['question_title'],
                    'type': 'question',
                    'object': ObjectDict(doc),
                    'is_solved': doc['question_is_solved'],
                    'num_answers': doc['question_num_answers'],
                    'num_votes': doc['question_num_votes'],
                    'num_votes_past_week': doc['question_num_votes_past_week']}
            else:
                summary = _build_es_excerpt(doc)
                result = {
                    'url': doc['url'],
                    'title': doc['post_title'],
                    'type': 'thread',
                    'object': ObjectDict(doc)}

            result['search_summary'] = summary
            result['rank'] = rank
            result['score'] = doc._score
            results.append(result)

    except (ESTimeoutError, ESMaxRetryError, ESException), exc:
        # Handle timeout and all those other transient errors with a
        # "Search Unavailable" rather than a Django error page.
        if is_json:
            return HttpResponse(
                json.dumps({'error': _('Search Unavailable')}),
                mimetype=mimetype, status=503)

        if isinstance(exc, ESTimeoutError):
            statsd.incr('search.es.timeouterror')
        elif isinstance(exc, ESMaxRetryError):
            statsd.incr('search.es.maxretryerror')
        elif isinstance(exc, ESException):
            statsd.incr('search.es.elasticsearchexception')

        t = 'search/mobile/down.html' if request.MOBILE else 'search/down.html'
        return jingo.render(request, t, {'q': cleaned['q']}, status=503)