def build_sitemaps():
    """Write one sitemap file per locale plus a sitemap index.

    For each locale in ``settings.MDN_LANGUAGES`` whose document queryset is
    non-empty, the ``wiki/sitemap.xml`` template is rendered and written to
    ``<MEDIA_ROOT>/sitemaps/<locale>/sitemap.xml``. A sitemap index with one
    entry per written file is then saved to ``<MEDIA_ROOT>/sitemap.xml``.
    """
    sitemap_element = "<sitemap><loc>%s</loc><lastmod>%s</lastmod></sitemap>"
    # Collect index fragments and join once instead of re-concatenating.
    index_parts = [
        "<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
    ]

    for locale in settings.MDN_LANGUAGES:
        # NOTE(review): the text between 'User:' and 'Talk:' was scrubbed in
        # the source ("'User:'******'Talk:'"), leaving invalid syntax. The
        # second exclude() below is a reconstruction -- confirm the lookup
        # (slug__icontains) against version control.
        queryset = (Document.objects
                    .filter(is_template=False,
                            locale=locale,
                            is_redirect=False)
                    .exclude(title__startswith='User:')
                    .exclude(slug__icontains='Talk:'))
        if len(queryset) == 0:
            # Nothing to publish for this locale; no file, no index entry.
            continue

        info = {'queryset': queryset, 'date_field': 'modified'}
        sitemap = GenericSitemap(info, priority=0.5)
        urls = sitemap.get_urls(page=1)
        xml = smart_str(
            loader.render_to_string('wiki/sitemap.xml', {'urlset': urls}))
        # Force the production host onto https in the generated URLs.
        xml = xml.replace('http://developer.mozilla.org',
                          'https://developer.mozilla.org')

        directory = '%s/sitemaps/%s' % (settings.MEDIA_ROOT, locale)
        if not os.path.exists(directory):
            os.makedirs(directory)
        # The context manager closes the file even if the write fails.
        with open('%s/sitemap.xml' % directory, 'w') as sitemap_file:
            sitemap_file.write(xml)

        sitemap_url = ("https://%s/sitemaps/%s/sitemap.xml"
                       % (Site.objects.get_current().domain, locale))
        # lastmod is the build time in UTC with an explicit offset.
        lastmod = time.strftime('%Y-%m-%dT%H:%M:%S+00:00', time.gmtime())
        index_parts.append(sitemap_element % (sitemap_url, lastmod))

    index_parts.append("</sitemapindex>")
    sitemap_index = "".join(index_parts)

    with open('%s/sitemap.xml' % settings.MEDIA_ROOT, 'w') as index_file:
        # Round-trip through the XML parser so a malformed index fails here.
        index_file.write(parseString(sitemap_index).toxml())
def build_sitemaps():
    """Write one sitemap file per locale plus a sitemap index.

    For each locale in ``settings.MDN_LANGUAGES`` whose document queryset is
    non-empty, the ``sitemap.xml`` template is rendered and written to
    ``<MEDIA_ROOT>/sitemaps/<locale>/sitemap.xml``. A sitemap index with one
    entry per written file is then saved to ``<MEDIA_ROOT>/sitemap.xml``.
    """
    sitemap_element = "<sitemap><loc>%s</loc><lastmod>%s</lastmod></sitemap>"
    # Collect index fragments and join once instead of re-concatenating.
    index_parts = [
        "<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
    ]

    for locale in settings.MDN_LANGUAGES:
        # NOTE(review): the text between 'User:' and 'Redirect [0-9]+$' was
        # scrubbed in the source ("'User:'******'Redirect [0-9]+$'"), leaving
        # invalid syntax. The title__iregex exclude() below is a
        # reconstruction -- confirm the lookup against version control.
        queryset = (Document.objects
                    .filter(is_template=False, locale=locale)
                    .exclude(title__startswith='User:')
                    .exclude(title__iregex=r'Redirect [0-9]+$')
                    .exclude(html__iregex=r'^(<p>)?(#)?REDIRECT')
                    .exclude(slug__icontains='Talk:'))
        if len(queryset) == 0:
            # Nothing to publish for this locale; no file, no index entry.
            continue

        info = {'queryset': queryset, 'date_field': 'modified'}
        sitemap = GenericSitemap(info, priority=0.5)
        urls = sitemap.get_urls(page=1)
        xml = smart_str(
            loader.render_to_string('sitemap.xml', {'urlset': urls}))
        # NOTE(review): this rewrites every 'http://' in the rendered output,
        # not only the page URLs; if the template emits an http:// namespace
        # URI it is rewritten too. Consider narrowing to the site's host.
        xml = xml.replace('http://', 'https://')

        directory = '%s/sitemaps/%s' % (settings.MEDIA_ROOT, locale)
        if not os.path.exists(directory):
            os.makedirs(directory)
        # The context manager closes the file even if the write fails.
        with open('%s/sitemap.xml' % directory, 'w') as sitemap_file:
            sitemap_file.write(xml)

        sitemap_url = ("https://%s/sitemaps/%s/sitemap.xml"
                       % (Site.objects.get_current().domain, locale))
        # A lastmod that includes a time of day needs a timezone designator;
        # emit UTC with an explicit offset rather than bare local time.
        lastmod = time.strftime('%Y-%m-%dT%H:%M:%S+00:00', time.gmtime())
        index_parts.append(sitemap_element % (sitemap_url, lastmod))

    index_parts.append("</sitemapindex>")
    sitemap_index = "".join(index_parts)

    with open('%s/sitemap.xml' % settings.MEDIA_ROOT, 'w') as index_file:
        # Round-trip through the XML parser so a malformed index fails here.
        index_file.write(parseString(sitemap_index).toxml())
def test_sitemap_item(self):
    """
    Every url dict returned by GenericSitemap.get_urls() must carry the
    underlying model instance under its 'item' key.
    """
    sitemap = GenericSitemap({'queryset': User.objects.all()})
    every_item_is_user = all(
        isinstance(entry['item'], User) for entry in sitemap.get_urls()
    )
    self.assertTrue(every_item_is_user)
def test_sitemap_item(self):
    """
    Check to make sure that the raw item is included with each
    Sitemap.get_url() url result.
    """
    # Give the queryset an explicit order: the sitemap paginates it, and
    # paginating an unordered queryset may yield inconsistent pages.
    test_sitemap = GenericSitemap(
        {'queryset': TestModel.objects.order_by('pk').all()})

    def is_testmodel(url):
        """Return True when the url entry's 'item' is a TestModel."""
        return isinstance(url['item'], TestModel)

    item_in_url_info = all(map(is_testmodel, test_sitemap.get_urls()))
    self.assertTrue(item_in_url_info)
def test_sitemap_item(self):
    """
    Every url dict returned by GenericSitemap.get_urls() must carry the
    underlying model instance under its 'item' key.
    """
    ordered_models = TestModel.objects.order_by('pk').all()
    sitemap = GenericSitemap({'queryset': ordered_models})
    every_item_is_testmodel = all(
        isinstance(entry['item'], TestModel) for entry in sitemap.get_urls()
    )
    self.assertTrue(every_item_is_testmodel)
def generate():
    """Render the post sitemap and write it to the export directory.

    The sitemap covers posts whose ``type`` is in ``const.POST_TOPLEVEL``,
    excluding ``const.POST_BLOG``. The rendered ``sitemap.xml`` template is
    written to the path built from ``settings.EXPORT_DIR`` and
    ``'sitemap.xml'``; progress is printed to stdout.
    """
    posts = (models.Post.objects
             .filter(type__in=const.POST_TOPLEVEL)
             .exclude(type=const.POST_BLOG))
    sitemap = GenericSitemap({'queryset': posts})
    urlset = sitemap.get_urls()
    text = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urlset}))

    site = Site.objects.get_current()
    fname = path(settings.EXPORT_DIR, 'sitemap.xml')
    # A parenthesised single argument prints the same on Python 2 and 3.
    print('*** writing sitemap for %s to %s' % (site, fname))
    # The context manager closes the file even if the write fails.
    with open(fname, 'wt') as fp:
        fp.write(text)
    print('*** done')
def generate_sitemap():
    """Render the post sitemap and write it to the static root.

    The sitemap covers posts whose ``type`` is in ``Post.TOP_LEVEL``,
    excluding ``Post.BLOG``. The rendered ``sitemap.xml`` template is written
    to the path built from ``settings.STATIC_ROOT`` and ``'sitemap.xml'``;
    progress is reported through the module logger.
    """
    sitemap = GenericSitemap({
        'queryset': Post.objects.filter(type__in=Post.TOP_LEVEL).exclude(type=Post.BLOG),
    })
    urlset = sitemap.get_urls()
    text = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urlset}))

    site = Site.objects.get_current()
    fname = path(settings.STATIC_ROOT, 'sitemap.xml')
    # Pass arguments to the logger so formatting only happens if emitted.
    logger.info('*** writing sitemap for %s to %s', site, fname)
    # The context manager closes the file even if the write fails.
    with open(fname, 'wt') as fp:
        fp.write(text)
    logger.info('*** done')
def generate_sitemap():
    """Render the post sitemap and write it to the static root.

    The sitemap covers posts whose ``type`` is in ``Post.TOP_LEVEL``,
    excluding ``Post.BLOG``. The rendered ``sitemap.xml`` template is written
    to the path built from ``settings.STATIC_ROOT`` and ``'sitemap.xml'``;
    progress is reported through the module logger.
    """
    sitemap = GenericSitemap({
        'queryset': Post.objects.filter(type__in=Post.TOP_LEVEL).exclude(type=Post.BLOG),
    })
    urlset = sitemap.get_urls()
    text = smart_str(loader.render_to_string('sitemap.xml', {'urlset': urlset}))

    site = Site.objects.get_current()
    fname = path(settings.STATIC_ROOT, 'sitemap.xml')
    # Pass arguments to the logger so formatting only happens if emitted.
    logger.info('*** writing sitemap for %s to %s', site, fname)
    # The context manager closes the file even if the write fails.
    with open(fname, 'wt') as fp:
        fp.write(text)
    logger.info('*** done')
def build_locale_sitemap(locale):
    """
    Write the sitemap files for a single locale.

    Returns a 3-tuple: the locale, the list of file names written into the
    locale's sitemap directory, and a UTC timestamp (second precision, with
    a "+00:00" suffix) taken at the start of the build.
    """
    timestamp = "%s+00:00" % datetime.utcnow().replace(microsecond=0).isoformat()

    directory = os.path.join(settings.MEDIA_ROOT, "sitemaps", locale)
    if not os.path.isdir(directory):
        os.makedirs(directory)

    # Non-document URLs go in their own file; the home page is always there.
    home_entry = {
        "location": absolutify(reverse("home", locale=locale)),
        "lastmod": None,
        "changefreq": None,
        "priority": None,
    }
    pending = [("sitemap_other.xml", [home_entry])]

    # Slug prefixes that must not be listed: experiments, the legacy
    # MindTouch namespaces ("<namespace>:") and the configured noindex ones.
    skipped_prefixes = ["{}:".format(ns) for ns in LEGACY_MINDTOUCH_NAMESPACES]
    skipped_prefixes.extend(NOINDEX_SLUG_PREFIXES)
    excluded_slugs = Q(slug__startswith=EXPERIMENT_TITLE_PREFIX)
    for prefix in skipped_prefixes:
        excluded_slugs |= Q(slug__startswith=prefix)

    # Built directly rather than via `Document.objects.filter_for_list()`:
    # only the columns needed here are loaded. Full robots-exclusion logic
    # depends on the request, so only the cheap filters are applied
    # (non-redirect, non-empty html, has a current revision, slug prefixes).
    # The final order_by() is required because GenericSitemap paginates the
    # queryset and an unordered one triggers UnorderedObjectListWarning;
    # any indexed column would do.
    documents = (
        Document.objects.filter(locale=locale, is_redirect=False)
        .exclude(html="")
        .only("id", "locale", "slug", "modified")
        .exclude(current_revision__isnull=True)
        .exclude(excluded_slugs)
        .order_by("id")
    )

    sitemap = GenericSitemap(
        {"queryset": documents, "date_field": "modified"},
        protocol="https",
        priority=0.5,
    )
    for page in range(1, sitemap.paginator.num_pages + 1):
        urls = sitemap.get_urls(page=page)
        if not urls:
            # Empty pages produce no file.
            continue
        name = "sitemap.xml" if page == 1 else "sitemap_%s.xml" % page
        pending.append((name, urls))

    # Render and write every collected file, remembering the names.
    written = []
    for name, urls in pending:
        rendered = smart_str(render_to_string("wiki/sitemap.xml", {"urls": urls}))
        with open(os.path.join(directory, name), "w") as sitemap_file:
            sitemap_file.write(rendered)
        written.append(name)

    return locale, written, timestamp