def _early_process_entry(self, page_generator, entry):
    """Perform early in-place processing of an entry."""
    entryDocument = minidom.parseString(entry['atom:entry'])
    entryElement = entryDocument.documentElement
    page_content_type = self._framework.plugins['vars'].vars['page_content_type']

    # Extract the 'id' of the entry
    (idElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'id')
    entry['id'] = getChildText(idElement).strip()

    # Extract and normalize the 'published' date of the entry
    (publishedElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'published')
    entry['published'] = atom_datetime_to_utc(getChildText(publishedElement).strip())

    # Extract and normalize the 'updated' date of the entry; create it if it doesn't exist.
    ee = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'updated'))
    if ee:
        (updatedElement,) = ee  # there should be only one
    else:
        # Create an <updated> element using the 'published' date
        updatedElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'updated')
        replaceChildText(updatedElement, entry['published'])
        entryElement.appendChild(updatedElement)
    entry['updated'] = atom_datetime_to_utc(getChildText(updatedElement).strip())

    # Create a <title> element if one does not already exist.
    ee = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'title'))
    if not ee:
        titleElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'title')
        titleElement.setAttribute('type', 'text')
        titleElement.appendChild(entryDocument.createTextNode(entry['title']))
        entryElement.appendChild(titleElement)

    # Create a <link rel="alternate"> element if one does not already exist.
    ee = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'link')
    linkElement = None
    for e in ee:
        rel = e.getAttribute('rel')
        type = e.getAttribute('type')
        hreflang = e.getAttribute('hreflang')
        if rel == "alternate" and type == page_content_type and not hreflang:
            if linkElement is not None:
                raise FGValueError('Conflicting <link rel="alternate" type=%r hreflang=%r> entries in %s' % (
                    page_content_type, hreflang, page_generator.path_info.source_filename,))
            linkElement = e
    if linkElement is None:
        linkElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'link')
        linkElement.setAttribute('rel', 'alternate')
        linkElement.setAttribute('href', page_generator.path_info.target_url)
        linkElement.setAttribute('type', page_content_type)
        entryElement.appendChild(linkElement)

    # Rewrite URLs in the atom:entry element
    rewrite_links(entryElement, ATOM_CRITERIA, page_generator.path_info.target_url,
                  page_generator.path_info.base_url, always_absolute=True)

    # Add a <summary> element, if applicable
    if entry['summary']:
        summaryDocument = minidom.parseString(entry['summary'])

        # Rewrite URLs in the summary
        rewrite_links(summaryDocument.documentElement, HTML_CRITERIA, entry['path_info'].target_url,
                      entry['path_info'].base_url, always_absolute=True)

        # Create Atom <summary> element
        summaryElement = entryElement.ownerDocument.createElementNS(ATOM_NAMESPACE, 'summary')
        summaryElement.setAttribute('type', 'xhtml')
        entryElement.appendChild(summaryElement)

        # Create XHTML <div> element
        divElement = entryElement.ownerDocument.createElementNS(XHTML_NAMESPACE, 'div')
        divElement.setAttributeNS(XMLNS_NAMESPACE, 'xmlns', XHTML_NAMESPACE)
        summaryElement.appendChild(divElement)

        # Add data
        for n in summaryDocument.documentElement.childNodes:
            divElement.appendChild(divElement.ownerDocument.importNode(n, True))

        # Elements with no namespace become XHTML elements
        substitute_namespaces(divElement, {EMPTY_NAMESPACE: XHTML_NAMESPACE})

        # Clean up
        data = None
        summaryDocument.unlink()
        summaryDocument = None
        del entry['summary']

    # Add a <content> element
    if True:
        bodyDocument = minidom.parseString(entry['body'])

        # Rewrite URLs in the body
        rewrite_links(bodyDocument.documentElement, HTML_CRITERIA, entry['path_info'].target_url,
                      entry['path_info'].base_url, always_absolute=True)

        # Create Atom <content> element
        contentElement = entryElement.ownerDocument.createElementNS(ATOM_NAMESPACE, 'content')
        contentElement.setAttribute('type', 'xhtml')
        entryElement.appendChild(contentElement)

        # Create XHTML <div> element
        divElement = entryElement.ownerDocument.createElementNS(XHTML_NAMESPACE, 'div')
        divElement.setAttributeNS(XMLNS_NAMESPACE, 'xmlns', XHTML_NAMESPACE)
        contentElement.appendChild(divElement)

        # Add data
        for n in bodyDocument.documentElement.childNodes:
            divElement.appendChild(divElement.ownerDocument.importNode(n, True))

        # Elements with no namespace become XHTML elements
        substitute_namespaces(divElement, {EMPTY_NAMESPACE: XHTML_NAMESPACE})

        # Clean up
        data = None
        bodyDocument.unlink()
        bodyDocument = None
        del entry['body']

    # Perform xmlns normalization
    normalize_namespaces(entryDocument.documentElement, strip_dups=True)

    # Update the new atom:entry document
    entry['atom:entry'] = entryDocument.toxml()
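
# The summary/body handling in _early_process_entry() imports a parsed XHTML
# fragment into the entry document underneath a type="xhtml" Atom element,
# wrapped in a namespaced <div>.  Below is a minimal, self-contained sketch of
# that minidom pattern.  It is illustrative only: the function name and the
# namespace constants are assumptions for the example, and it omits the
# substitute_namespaces() step that the real code applies afterwards.

def _sketch_wrap_xhtml_fragment(targetDocument, parentElement, html_text):
    """Illustrative sketch: wrap an XHTML fragment in a <div> under parentElement."""
    from xml.dom import minidom  # stdlib; normally imported at module level

    xhtml_ns = "http://www.w3.org/1999/xhtml"   # standard XHTML namespace URI
    xmlns_ns = "http://www.w3.org/2000/xmlns/"  # namespace of xmlns attributes

    fragment = minidom.parseString(html_text)
    div = targetDocument.createElementNS(xhtml_ns, 'div')
    div.setAttributeNS(xmlns_ns, 'xmlns', xhtml_ns)
    parentElement.appendChild(div)
    # importNode() copies nodes into targetDocument, so iterating the source
    # fragment while appending the copies is safe.
    for node in fragment.documentElement.childNodes:
        div.appendChild(targetDocument.importNode(node, True))
    fragment.unlink()
    return div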
def _filter_rewrite_links(pg):
    """Page filter: rewrite links in a generated page against its target/base URLs."""
    rewrite_links(pg.page, HTML_CRITERIA,
                  target_url=pg.path_info.target_url,
                  base_url=pg.path_info.base_url)
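
# rewrite_links() and HTML_CRITERIA are StillWeb helpers defined elsewhere in
# the project.  As a rough sketch of the kind of rebasing such a helper
# performs (an assumption about its behaviour, not the actual implementation),
# one could walk the DOM and resolve relative link attributes against a base
# URL:

def _sketch_rewrite_links(rootElement, base_url, attribute_names=('href', 'src')):
    """Illustrative sketch: rebase relative href/src attributes against base_url."""
    from urllib.parse import urljoin  # stdlib

    elements = [rootElement] + list(rootElement.getElementsByTagName('*'))
    for element in elements:
        for name in attribute_names:
            if element.hasAttribute(name):
                element.setAttribute(name, urljoin(base_url, element.getAttribute(name)))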
def handle_make_atom_feed(self, target_url):
    """Write the data we've collected so far as an Atom feed.

    Usage: make-atom-feed TARGET_RELATIVE_URL
    """
    tp = TypicalPaths(self._framework, target_url)
    data_dir = self._get_feed_data_dir()

    def is_update_needed():
        # Check if the feed needs to be updated
        try:
            output_mtime = os.lstat(tp.output_filename).st_mtime
        except EnvironmentError as exc:
            if exc.errno != errno.ENOENT:
                raise
            # The output file doesn't exist, so an update is needed
            return True

        # Output file exists.  Check timestamps.
        source_mtime = os.lstat(tp.source_filename).st_mtime
        if output_mtime < source_mtime:
            # The source file was modified, so an update is needed.
            return True
        for basename in fnmatch.filter(os.listdir(data_dir), "entry-*-stamp"):
            entry_mtime = os.lstat(os.path.join(data_dir, basename)).st_mtime
            if output_mtime < entry_mtime:
                # One of the entries is newer than the output file, so an update is needed.
                return True
        return False

    if not is_update_needed():
        # No update needed
        print("skipping %s" % (tp.output_filename,))
        return

    # Make sure the output directory exists
    self._framework.plugins['StillWeb.BasicCommands'].ensure_path(tp.output_dir, tp.pathtuple[:-1])

    print("making %s (using %s)" % (tp.output_filename, tp.source_filename))

    # Load the entries
    entries = []
    regex = re.compile(r"^entry-([^-.]*)-stamp$")
    for basename in os.listdir(data_dir):
        m = regex.search(basename)
        if not m:
            continue
        rootword = m.group(1)
        filename = os.path.join(data_dir, "entry-%s-data" % (rootword,))
        try:
            f = open(filename, "rb")
            entry = pickle.load(f)
            f.close()
        except EnvironmentError as exc:
            if exc.errno == errno.ENOENT:
                continue
            else:
                raise
        entries.append(entry)

    # Check for duplicate ids
    ids = {}
    for entry in entries:
        if entry['id'] in ids:
            raise FGValueError("Duplicate id %r in %s (already defined in %s)" % (
                entry['id'], entry['path_info'].source_filename, ids[entry['id']]))
        ids[entry['id']] = entry['path_info'].source_filename

    # Skip entries whose publication dates are in the future
    unpublished_entries = []
    now = atom_datetime_to_utc(datetime.datetime.utcnow().isoformat() + "Z")
    for i, entry in enumerate(entries):
        if entry['published'] > now:
            unpublished_entries.append(i)
    for i in reversed(unpublished_entries):
        print("%s: skipping %s ('published' in the future)" % (
            tp.output_filename, entries[i]['path_info'].output_filename,))
        del entries[i]

    # Sort the entries by their publication date, newest first.
    entries.sort(key=lambda entry: atom_datetime_to_sort_key(entry['published']), reverse=True)

    # Find the most recent update
    if not entries:
        raise FGValueError("Refusing to make empty feed")
    most_recent_update = entries[0]['updated']

    # Load and parse the template file
    feedDocument = minidom.parseString(open(tp.source_filename, "rb").read())
    feedElement = feedDocument.documentElement
    assert (feedElement.namespaceURI, feedElement.localName) == (ATOM_NAMESPACE, "feed")

    # Set <updated> to the newest entry's <updated> (or <published>) field
    if tuple(getChildElementsNS(feedElement, ATOM_NAMESPACE, "updated")):
        raise FGValueError("Template contains auto-generated <updated> field")
    updatedElement = feedDocument.createElementNS(ATOM_NAMESPACE, 'updated')
    feedElement.appendChild(updatedElement)
    replaceChildText(updatedElement, most_recent_update)

    # Create a <link rel="self"> element if one does not already exist.
    for linkElement in getChildElementsNS(feedElement, ATOM_NAMESPACE, "link"):
        if linkElement.getAttribute('rel') == 'self':
            break
    else:
        linkElement = feedDocument.createElementNS(ATOM_NAMESPACE, 'link')
        linkElement.setAttribute('rel', 'self')
        linkElement.setAttribute('type', ATOM_CONTENT_TYPE)
        linkElement.setAttribute('href', tp.target_url)
        feedElement.appendChild(linkElement)

    # Do URL path substitution
    rewrite_links(feedElement, ATOM_CRITERIA, tp.target_url, tp.base_url, always_absolute=True)

    # Add the entries
    for entry in entries:
        # Create an <entry> element
        entryElement = feedDocument.importNode(
            minidom.parseString(entry['atom:entry']).documentElement, True)
        assert (entryElement.namespaceURI, entryElement.localName) == (ATOM_NAMESPACE, 'entry')
        feedElement.appendChild(entryElement)

    # Write the feed to the output file
    if os.path.exists(tp.output_filename):
        os.unlink(tp.output_filename)
    output_file = open(tp.output_filename, "wb")
    try:
        output_file.write(feedDocument.toxml("UTF-8"))
    except:
        os.unlink(tp.output_filename)
        raise
    finally:
        output_file.close()
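
# The writer above removes any existing output file first and deletes the
# output again if writing fails.  A common alternative design (sketched here
# under assumed semantics, not as StillWeb's actual approach) is to write to a
# temporary file in the same directory and rename it into place, so readers
# never see a partially written feed:

def _sketch_write_feed_atomically(output_filename, feed_bytes):
    """Illustrative sketch: write feed_bytes to output_filename via temp file + rename."""
    import os        # stdlib; normally imported at module level
    import tempfile  # stdlib

    dirname = os.path.dirname(output_filename) or "."
    fd, tmp_name = tempfile.mkstemp(dir=dirname, prefix=".atom-feed-")
    try:
        with os.fdopen(fd, "wb") as tmp_file:
            tmp_file.write(feed_bytes)
        os.replace(tmp_name, output_filename)  # atomic rename on the same filesystem
    except BaseException:
        os.unlink(tmp_name)
        raise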