def __iter__(self):
    """Annotate wiki links in the html5lib token stream with CSS classes.

    Two-pass filter. Pass 1 buffers every token and collects the href
    of each ``<a>`` start tag, then classifies each link:

    * ``external`` -- href starts with one of ``self.EXTERNAL_PREFIXES``.
    * ``new`` -- href is a ``/docs/`` path whose (locale, slug), as
      resolved by ``Document.locale_and_slug_from_path``, matches no
      existing ``Document`` row.

    Pass 2 replays the buffered tokens, merging the computed class
    names into each matching link's ``class`` attribute.

    Yields html5lib tokens (start-tag tokens may have modified data).
    """
    # Imported here (not at module top) to avoid a circular import.
    from wiki.models import Document

    # Renamed from `input`/`buffer` to avoid shadowing the builtins.
    stream = html5lib_Filter.__iter__(self)

    # Pass #1: Gather all the link URLs and prepare annotations
    links = dict()
    buffered = []
    for token in stream:
        buffered.append(token)
        if "StartTag" == token["type"] and "a" == token["name"]:
            attrs = dict(token["data"])
            if "href" not in attrs:
                continue
            href = attrs["href"]
            if href.startswith(self.base_url):
                # Squash site-absolute URLs to site-relative paths.
                href = "/%s" % href[len(self.base_url):]
            # Prepare annotations record for this path.
            links[href] = dict(classes=[])

    # Run through all the links and check for annotatable conditions.
    for href in links:
        # Is this an external URL?
        if any(href.startswith(prefix)
               for prefix in self.EXTERNAL_PREFIXES):
            links[href]["classes"].append("external")
            continue

        # TODO: Should this also check for old-school mindtouch URLs? Or
        # should we encourage editors to convert to new-style URLs to take
        # advantage of link annotation? (I'd say the latter)

        # Is this a kuma doc URL?
        if "/docs/" in href:
            # Check if this is a special docs path that's exempt from "new"
            if any("/docs/%s" % path in href
                   for path in DOC_SPECIAL_PATHS):
                continue

            href_locale, href_path = href.split(u"/docs/", 1)
            if href_locale.startswith(u"/"):
                href_locale = href_locale[1:]

            if "#" in href_path:
                # If present, discard the hash anchor
                href_path, _, _ = href_path.partition("#")

            # Handle any URL-encoded UTF-8 characters in the path
            # (unquote works on bytes in Python 2, so round-trip the
            # encoding).
            href_path = href_path.encode("utf-8", "ignore")
            href_path = urllib.unquote(href_path)
            href_path = href_path.decode("utf-8", "ignore")

            # Try to sort out the locale and slug through some of our
            # redirection logic.
            locale, slug, needs_redirect = (
                Document.locale_and_slug_from_path(
                    href_path, path_locale=href_locale))

            # Does this locale and slug correspond to an existing
            # document? If not, mark it as a "new" link.
            #
            # TODO: Should these DB queries be batched up into one big
            # query? A page with hundreds of links will fire off hundreds
            # of queries
            ct = Document.objects.filter(locale=locale,
                                         slug=slug).count()
            if ct == 0:
                links[href]["classes"].append("new")

    # Pass #2: Filter the content, annotating links
    for token in buffered:
        if "StartTag" == token["type"] and "a" == token["name"]:
            attrs = dict(token["data"])
            if "href" in attrs:
                href = attrs["href"]
                if href.startswith(self.base_url):
                    # Squash site-absolute URLs to site-relative paths.
                    href = "/%s" % href[len(self.base_url):]
                if href in links:
                    # Update class names on this link element, merging
                    # any classes already present in the markup.
                    if "class" in attrs:
                        classes = set(attrs["class"].split(u" "))
                    else:
                        classes = set()
                    classes.update(links[href]["classes"])
                    if classes:
                        attrs["class"] = u" ".join(classes)
                    token["data"] = attrs.items()
        yield token
def __iter__(self):
    """Annotate wiki links in the html5lib token stream with CSS classes.

    Two-pass filter. Pass 1 buffers every token, collecting the href of
    each ``<a>`` start tag; each collected link is then classified as
    ``external`` (href starts with one of ``self.EXTERNAL_PREFIXES``) or
    ``new`` (a ``/docs/`` path whose resolved locale/slug matches no
    existing ``Document``). Pass 2 replays the buffered tokens, merging
    the computed class names into each link's ``class`` attribute.

    Yields html5lib tokens.
    """
    # Imported here (not at module top), presumably to avoid a circular
    # import -- confirm against module layout.
    from wiki.models import Document
    input = html5lib_Filter.__iter__(self)

    # Pass #1: Gather all the link URLs and prepare annotations
    links = dict()
    buffer = []
    for token in input:
        buffer.append(token)
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if not 'href' in attrs:
                continue
            href = attrs['href']
            if href.startswith(self.base_url):
                # Squash site-absolute URLs to site-relative paths.
                href = '/%s' % href[len(self.base_url):]
            # Prepare annotations record for this path.
            links[href] = dict(classes=[])

    # Run through all the links and check for annotatable conditions.
    for href in links.keys():

        # Is this an external URL?
        is_external = False
        for prefix in self.EXTERNAL_PREFIXES:
            if href.startswith(prefix):
                is_external = True
                break
        if is_external:
            links[href]['classes'].append('external')
            continue

        # TODO: Should this also check for old-school mindtouch URLs? Or
        # should we encourage editors to convert to new-style URLs to take
        # advantage of link annotation? (I'd say the latter)

        # Is this a kuma doc URL?
        if '/docs/' in href:

            # Check if this is a special docs path that's exempt from "new"
            skip = False
            for path in DOC_SPECIAL_PATHS:
                if '/docs/%s' % path in href:
                    skip = True
            if skip:
                continue

            # Split the href into the locale prefix and the doc path.
            href_locale, href_path = href.split(u'/docs/', 1)
            if href_locale.startswith(u'/'):
                href_locale = href_locale[1:]

            if '#' in href_path:
                # If present, discard the hash anchor
                href_path, _, _ = href_path.partition('#')

            # Handle any URL-encoded UTF-8 characters in the path
            # (Python 2 unquote operates on bytes, hence the round-trip).
            href_path = href_path.encode('utf-8', 'ignore')
            href_path = urllib.unquote(href_path)
            href_path = href_path.decode('utf-8', 'ignore')

            # Try to sort out the locale and slug through some of our
            # redirection logic.
            locale, slug, needs_redirect = (
                Document.locale_and_slug_from_path(
                    href_path, path_locale=href_locale))

            # Does this locale and slug correspond to an existing document?
            # If not, mark it as a "new" link.
            #
            # TODO: Should these DB queries be batched up into one big
            # query? A page with hundreds of links will fire off hundreds
            # of queries
            ct = Document.objects.filter(locale=locale, slug=slug).count()
            if ct == 0:
                links[href]['classes'].append('new')

    # Pass #2: Filter the content, annotating links
    for token in buffer:
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if 'href' in attrs:
                href = attrs['href']
                if href.startswith(self.base_url):
                    # Squash site-absolute URLs to site-relative paths.
                    href = '/%s' % href[len(self.base_url):]
                if href in links:
                    # Update class names on this link element, merging
                    # with any classes already present in the markup.
                    if 'class' in attrs:
                        classes = set(attrs['class'].split(u' '))
                    else:
                        classes = set()
                    classes.update(links[href]['classes'])
                    if classes:
                        attrs['class'] = u' '.join(classes)
                    token['data'] = attrs.items()
        yield token
def __iter__(self):
    """Annotate wiki links in the html5lib token stream with CSS classes.

    Two-pass filter. Pass 1 buffers every token and collects the href
    of each ``<a>`` start tag; external links (matching one of
    ``self.EXTERNAL_PREFIXES``) are classified immediately, while
    candidate ``/docs/`` links are gathered into
    ``needs_existence_check`` so document existence can be checked with
    one DB query per locale rather than one query per link. Links whose
    locale/slug match no existing ``Document`` get the ``new`` class.
    Pass 2 replays the buffered tokens, merging the computed class
    names into each link's ``class`` attribute.

    Yields html5lib tokens.
    """
    # Imported here (not at module top) to avoid a circular import.
    from wiki.models import Document

    # Renamed from `input`/`buffer` to avoid shadowing the builtins.
    stream = html5lib_Filter.__iter__(self)

    # Pass #1: Gather all the link URLs and prepare annotations
    links = dict()
    buffered = []
    for token in stream:
        buffered.append(token)
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if 'href' not in attrs:
                continue
            href = attrs['href']
            if href.startswith(self.base_url):
                # Squash site-absolute URLs to site-relative paths.
                href = '/%s' % href[len(self.base_url):]
            # Prepare annotations record for this path.
            links[href] = dict(classes=[])

    # needs_existence_check[locale][slug] -> set of hrefs that resolve
    # to that (locale, slug); keys are lower-cased for batching.
    needs_existence_check = defaultdict(lambda: defaultdict(set))

    # Run through all the links and check for annotatable conditions.
    for href in links:
        # Is this an external URL?
        if any(href.startswith(prefix)
               for prefix in self.EXTERNAL_PREFIXES):
            links[href]['classes'].append('external')
            continue

        # TODO: Should this also check for old-school mindtouch URLs? Or
        # should we encourage editors to convert to new-style URLs to take
        # advantage of link annotation? (I'd say the latter)

        # Is this a kuma doc URL?
        if '/docs/' in href:
            # Check if this is a special docs path that's exempt from "new"
            if any('/docs/%s' % path in href
                   for path in DOC_SPECIAL_PATHS):
                continue

            href_locale, href_path = href.split(u'/docs/', 1)
            if href_locale.startswith(u'/'):
                href_locale = href_locale[1:]

            if '#' in href_path:
                # If present, discard the hash anchor
                href_path, _, _ = href_path.partition('#')

            # Handle any URL-encoded UTF-8 characters in the path
            # (unquote works on bytes in Python 2, hence the round-trip).
            href_path = href_path.encode('utf-8', 'ignore')
            href_path = urllib.unquote(href_path)
            href_path = href_path.decode('utf-8', 'ignore')

            # Try to sort out the locale and slug through some of our
            # redirection logic.
            locale, slug, needs_redirect = (
                Document.locale_and_slug_from_path(
                    href_path, path_locale=href_locale))

            # Gather up this link for existence check
            needs_existence_check[locale.lower()][slug.lower()].add(href)

    # Perform existence checks for all the links, using one DB query per
    # locale for all the candidate slugs.
    #
    # NOTE(review): the locale/slug keys were lower()ed above but are
    # passed straight into the filter, so this assumes case-insensitive
    # matching in the DB collation -- confirm against the DB settings.
    for locale, slug_hrefs in needs_existence_check.items():

        existing_slugs = (Document.objects
                          .filter(locale=locale,
                                  slug__in=slug_hrefs.keys())
                          .values_list('slug', flat=True))

        # Remove the slugs that pass existence check.
        for slug in existing_slugs:
            lslug = slug.lower()
            if lslug in slug_hrefs:
                del slug_hrefs[lslug]

        # Mark all the links whose slugs did not come back from the DB
        # query as "new"
        for slug, hrefs in slug_hrefs.items():
            for href in hrefs:
                links[href]['classes'].append('new')

    # Pass #2: Filter the content, annotating links
    for token in buffered:
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if 'href' in attrs:
                href = attrs['href']
                if href.startswith(self.base_url):
                    # Squash site-absolute URLs to site-relative paths.
                    href = '/%s' % href[len(self.base_url):]
                if href in links:
                    # Update class names on this link element, merging
                    # with any classes already present in the markup.
                    if 'class' in attrs:
                        classes = set(attrs['class'].split(u' '))
                    else:
                        classes = set()
                    classes.update(links[href]['classes'])
                    if classes:
                        attrs['class'] = u' '.join(classes)
                    token['data'] = attrs.items()
        yield token