def create_html_index(index_content):
    """
    Creates an HTML index (mainly to navigate through the exported pages).

    :param index_content: Dictionary which contains file paths, page titles and their children recursively
        (keys: 'file_path', 'page_title', 'child_pages').
    :returns: Content index as HTML.
    """
    file_path = utils.encode_url(index_content['file_path'])
    page_title = index_content['page_title']
    page_children = index_content['child_pages']

    # NOTE(review): file_path is URL-encoded above and sanitized again here — presumably
    # sanitize_for_filename is harmless on already-encoded paths; confirm in utils.
    # NOTE(review): page_title is inserted without HTML escaping; titles containing '<' or '&'
    # would break the index markup — confirm whether titles are pre-sanitized upstream.
    html_content = '<a href="%s">%s</a>' % (utils.sanitize_for_filename(file_path), page_title)

    # Recurse into child pages and render them as a nested list.
    if page_children:  # idiomatic truthiness instead of len(...) > 0
        html_content += '<ul>\n'
        for child in page_children:
            html_content += '\t<li>%s</li>\n' % create_html_index(child)
        html_content += '</ul>\n'

    return html_content
def provide_unique_file_name(duplicate_file_names, file_matching, file_title, is_folder=False,
                             explicit_file_extension=None):
    """
    Provides an unique AND sanitized file name for a given page title.

    Confluence does not allow the same page title in one particular space but collisions
    are possible after filesystem sanitization. A title that was already resolved once
    returns its cached file name unchanged; only first-time titles go through
    sanitization and duplicate counting.

    :param duplicate_file_names: A dict in the structure {'<sanitized filename>': amount of duplicates};
        mutated in place to track collisions across calls.
    :param file_matching: A dict in the structure {'<file title>': '<used offline filename>'};
        mutated in place to cache the title -> file name mapping.
    :param file_title: File title which is used to generate the unique file name.
    :param is_folder: (optional) Flag which states whether the file is a folder (folders get no extension).
    :param explicit_file_extension: (optional) Explicitly set file extension (e.g. 'html').
    :returns: The unique, sanitized file name.
    """
    if file_title in file_matching:
        # Already resolved before: reuse the cached name so repeated lookups stay stable.
        file_name = file_matching[file_title]
    else:
        file_name = utils.sanitize_for_filename(file_title)

        # Determine the file extension (folders never get one; an explicit one wins;
        # otherwise split it off the sanitized name so the duplicate counter keys on the stem).
        if is_folder:
            file_extension = None
        elif explicit_file_extension:
            file_extension = explicit_file_extension
        elif '.' in file_name:
            file_name, file_extension = file_name.rsplit('.', 1)
        else:
            file_extension = None

        # Disambiguate sanitized-name collisions with a numeric suffix.
        if file_name in duplicate_file_names:
            duplicate_file_names[file_name] += 1
            file_name = '%s_%d' % (file_name, duplicate_file_names[file_name])
        else:
            duplicate_file_names[file_name] = 0

        if file_extension:
            file_name += '.%s' % file_extension
        file_matching[file_title] = file_name
    return file_name
def fetch_page_recursively(page_id, folder_path, download_folder, html_template, depth=0,
                           page_duplicate_file_names=None, page_file_matching=None,
                           attachment_duplicate_file_names=None, attachment_file_matching=None):
    """
    Fetches a Confluence page and its child pages (with referenced downloads).

    :param page_id: Confluence page id.
    :param folder_path: Folder to place downloaded pages in.
    :param download_folder: Folder to place downloaded files in.
    :param html_template: HTML template used to export Confluence pages.
    :param depth: (optional) Hierarchy depth of the handled Confluence page (used for log indentation).
    :param page_duplicate_file_names: A dict in the structure {'<sanitized page filename>': amount of duplicates}
    :param page_file_matching: A dict in the structure {'<page title>': '<used offline filename>'}
    :param attachment_duplicate_file_names: A dict in the structure {'<sanitized attachment filename>': amount of
        duplicates}
    :param attachment_file_matching: A dict in the structure {'<attachment title>': '<used offline filename>'}
    :returns: Information about downloaded files (pages, attachments, images, ...) as a dict
        (None when a ConfluenceException occurred).
    """
    # Initialize fresh dicts per top-level call (avoids the mutable-default-argument trap);
    # the same dict instances are passed down the recursion so duplicate counting and
    # title->filename matching span the whole page tree.
    if not page_duplicate_file_names:
        page_duplicate_file_names = {}
    if not page_file_matching:
        page_file_matching = {}
    if not attachment_duplicate_file_names:
        attachment_duplicate_file_names = {}
    if not attachment_file_matching:
        attachment_file_matching = {}

    # Fetch the page itself, expanding children and the rendered body in a single request.
    page_url = '%s/rest/api/content/%s?expand=children.page,children.attachment,body.view.value' \
        % (settings.CONFLUENCE_BASE_URL, page_id)
    try:
        response = utils.http_get(
            page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
            verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE, proxies=settings.HTTP_PROXIES)
        page_content = response['body']['view']['value']
        page_title = response['title']
        print('%sPAGE: %s (%s)' % ('\t' * (depth + 1), page_title, page_id))

        # Construct unique file name (keyed by the page id, exported as .html)
        file_name = provide_unique_file_name(page_duplicate_file_names, page_file_matching, str(page_id),
                                             explicit_file_extension='html')

        # Remember this file and all children
        path_collection = {'file_path': file_name, 'page_title': page_title, 'child_pages': [],
                           'child_attachments': []}

        # Download attachments of this page, following REST pagination 25 items at a time.
        # TODO: Outsource/Abstract the following two while loops because of much duplicate code.
        page_url = '%s/rest/api/content/%s/child/attachment?limit=25' % (settings.CONFLUENCE_BASE_URL, page_id)
        counter = 0  # NOTE(review): accumulated but never read afterwards — possibly leftover debug state
        while page_url:
            response = utils.http_get(
                page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
                verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE, proxies=settings.HTTP_PROXIES)
            counter += len(response['results'])
            for attachment in response['results']:
                download_url = attachment['_links']['download']
                # Strips the first 3 characters of the attachment id — presumably ids are
                # prefixed (e.g. 'att12345'); confirm against the Confluence REST response.
                attachment_id = attachment['id'][3:]
                attachment_info = download_attachment(download_url, download_folder, attachment_id,
                                                      attachment_duplicate_file_names, attachment_file_matching,
                                                      depth=depth + 1)
                path_collection['child_attachments'].append(attachment_info)

            # Follow the pagination cursor; '_links.next' is relative to the base URL.
            if 'next' in response['_links'].keys():
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None

        # Export HTML file: rewrite Confluence links to local ones, append the attachment index,
        # then render the page through the template.
        page_content = handle_html_references(page_content, page_duplicate_file_names, page_file_matching,
                                              depth=depth + 1)
        file_path = '%s/%s' % (folder_path, file_name)
        page_content += create_html_attachment_index(path_collection['child_attachments'])
        utils.write_html_2_file(file_path, page_title, page_content, html_template)

        # Save another file with page id which forwards to the original one.
        # NOTE(review): the actual forward-file write below is commented out, so these three
        # variables are currently dead — confirm whether the feature is intentionally disabled.
        id_file_path = '%s/%s.html' % (folder_path, page_id)
        id_file_page_title = 'Forward to page %s' % page_title
        original_file_link = utils.encode_url(utils.sanitize_for_filename(file_name))
        # id_file_page_content = settings.HTML_FORWARD_MESSAGE % (original_file_link, page_title)
        # id_file_forward_header = '<meta http-equiv="refresh" content="0; url=%s" />' % original_file_link
        # utils.write_html_2_file(id_file_path, id_file_page_title, id_file_page_content, html_template,
        #                         additional_headers=[id_file_forward_header])

        # Iterate through all child pages (same paginated pattern as the attachment loop above)
        # and recurse, sharing the page-level dedup dicts; attachment dicts restart per subtree.
        page_url = '%s/rest/api/content/%s/child/page?limit=25' % (settings.CONFLUENCE_BASE_URL, page_id)
        counter = 0  # NOTE(review): unused here as well
        while page_url:
            response = utils.http_get(
                page_url, auth=settings.HTTP_AUTHENTICATION, headers=settings.HTTP_CUSTOM_HEADERS,
                verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE, proxies=settings.HTTP_PROXIES)
            counter += len(response['results'])
            for child_page in response['results']:
                paths = fetch_page_recursively(child_page['id'], folder_path, download_folder, html_template,
                                               depth=depth + 1,
                                               page_duplicate_file_names=page_duplicate_file_names,
                                               page_file_matching=page_file_matching)
                if paths:
                    path_collection['child_pages'].append(paths)

            if 'next' in response['_links'].keys():
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None
        return path_collection

    except utils.ConfluenceException as e:
        # Best-effort export: log and skip this subtree instead of aborting the whole run.
        error_print('%sERROR: %s' % ('\t' * (depth + 1), e))
        return None
def handle_html_references(html_content, page_duplicate_file_names, page_file_matching, depth=0):
    """
    Repairs links in the page contents with local links.

    :param html_content: Confluence HTML content.
    :param page_duplicate_file_names: A dict in the structure {'<sanitized filename>': amount of duplicates}
    :param page_file_matching: A dict in the structure {'<page title>': '<used offline filename>'}
    :param depth: (optional) Hierarchy depth of the handled Confluence page (used for log indentation).
    :returns: Fixed HTML content (unchanged input if it could not be parsed).
    """
    try:
        html_tree = html.fromstring(html_content)
    except ParserError:
        # Empty documents raise ParserError; return the content untouched.
        print('page is empty')
        return html_content
    except XMLSyntaxError:
        print('%sWARNING: Could not parse HTML content of last page. Original content will be downloaded as it is.'
              % ('\t' * (depth + 1)))
        return html_content

    # Fix links to other Confluence pages
    # Example: /display/TES/pictest1
    #       => pictest1.html
    # TODO: This code does not work for "Recent space activity" areas in space pages because of a different url format.
    xpath_expr = '//a[contains(@href, "/display/")]'
    for link_element in html_tree.xpath(xpath_expr):
        # Skip styled/special links (e.g. Confluence-generated ones carrying a class attribute).
        if not link_element.get('class'):
            page_title = link_element.attrib['href'].split('/')[3]
            page_title = page_title.replace('+', ' ')
            decoded_page_title = utils.decode_url(page_title)
            offline_link = provide_unique_file_name(page_duplicate_file_names, page_file_matching,
                                                    decoded_page_title, explicit_file_extension='html')
            link_element.attrib['href'] = utils.encode_url(offline_link)

    # Fix links to other Confluence pages when page ids are used
    xpath_expr = '//a[contains(@href, "/pages/viewpage.action?pageId=")]'
    for link_element in html_tree.xpath(xpath_expr):
        if not link_element.get('class'):
            page_id = link_element.attrib['href'].split('/pages/viewpage.action?pageId=')[1]
            offline_link = '%s.html' % utils.sanitize_for_filename(page_id)
            link_element.attrib['href'] = utils.encode_url(offline_link)

    # Fix attachment links
    xpath_expr = '//a[contains(@class, "confluence-embedded-file")]'
    for link_element in html_tree.xpath(xpath_expr):
        file_url = link_element.attrib['href']
        file_name = derive_downloaded_file_name(file_url)
        relative_file_path = '%s/%s' % (settings.DOWNLOAD_SUB_FOLDER, file_name)
        # link_element.attrib['href'] = utils.encode_url(relative_file_path)
        link_element.attrib['href'] = relative_file_path

    # Fix file paths for img tags
    # TODO: Handle non-<img> tags as well if necessary.
    # TODO: Support files with different versions as well if necessary.
    possible_image_xpaths = ['//img[contains(@src, "/download/")]',
                             '//img[contains(@src, "/rest/documentConversion/latest/conversion/thumbnail/")]']
    xpath_expr = '|'.join(possible_image_xpaths)
    for img_element in html_tree.xpath(xpath_expr):
        # Replace file path
        file_url = img_element.attrib['src']
        file_name = derive_downloaded_file_name(file_url)
        relative_file_path = '%s/%s' % (settings.DOWNLOAD_SUB_FOLDER, file_name)
        img_element.attrib['src'] = relative_file_path

        # Add alt attribute if it does not exist yet
        # (idiomatic membership test instead of `not 'alt' in ....keys()`)
        if 'alt' not in img_element.attrib:
            img_element.attrib['alt'] = relative_file_path

    return html.tostring(html_tree)