def main():
    """ Main function to start the confluence-dumper. """

    # Configure console for unicode output via stdout/stderr (only needed on
    # Python 2; the Python 3 standard streams already handle unicode and
    # wrapping them with a byte-oriented writer would break print())
    if sys.version_info[0] < 3:
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
        sys.stderr = codecs.getwriter('utf-8')(sys.stderr)

    # Welcome output
    print_welcome_output()

    # Delete old export
    if os.path.exists(settings.EXPORT_FOLDER):
        shutil.rmtree(settings.EXPORT_FOLDER)
    os.makedirs(settings.EXPORT_FOLDER)

    # Read HTML template
    with open(settings.TEMPLATE_FILE) as template_file:
        html_template = template_file.read()

    # Fetch all spaces if none were configured via the settings
    if settings.SPACES_TO_EXPORT:
        spaces_to_export = settings.SPACES_TO_EXPORT
    else:
        spaces_to_export = []
        page_url = '%s/rest/api/space?limit=25' % settings.CONFLUENCE_BASE_URL
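        # Page through the space list 25 results at a time, following the
        # REST API's '_links.next' URL until no further page exists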
        while page_url:
            response = utils.http_get(
                page_url,
                auth=settings.HTTP_AUTHENTICATION,
                headers=settings.HTTP_CUSTOM_HEADERS,
                verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                proxies=settings.HTTP_PROXIES)
            for space in response['results']:
                spaces_to_export.append(space['key'])

            if 'next' in response['_links']:
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None

    print('Exporting %d space(s): %s\n' %
          (len(spaces_to_export), ', '.join(spaces_to_export)))

    # Export spaces
    space_counter = 0
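    # Track how often each sanitized folder name occurs and remember which
    # folder name was actually used on disk for each space key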
    duplicate_space_names = {}
    space_matching = {}
    for space in spaces_to_export:
        space_counter += 1

        # Create folders for this space
        space_folder_name = provide_unique_file_name(duplicate_space_names,
                                                     space_matching,
                                                     space,
                                                     is_folder=True)
        space_folder = '%s/%s' % (settings.EXPORT_FOLDER, space_folder_name)
        try:
            os.makedirs(space_folder)
            download_folder = '%s/%s' % (space_folder,
                                         settings.DOWNLOAD_SUB_FOLDER)
            os.makedirs(download_folder)

            space_url = '%s/rest/api/space/%s?expand=homepage' % (
                settings.CONFLUENCE_BASE_URL, space)
            response = utils.http_get(
                space_url,
                auth=settings.HTTP_AUTHENTICATION,
                headers=settings.HTTP_CUSTOM_HEADERS,
                verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                proxies=settings.HTTP_PROXIES)
            space_name = response['name']

            print('SPACE (%d/%d): %s (%s)' %
                  (space_counter, len(spaces_to_export), space_name, space))

            # Start the export from the configured page if one is set,
            # otherwise from the space's homepage
            space_page_id = settings.from_page_id or response['homepage']['id']
            path_collection = fetch_page_recursively(space_page_id,
                                                     space_folder,
                                                     download_folder,
                                                     html_template)

            if path_collection:
                # Create index file for this space
                space_index_path = '%s/index.html' % space_folder
                space_index_title = 'Index of Space %s (%s)' % (space_name,
                                                                space)
                space_index_content = create_html_index(path_collection)
                utils.write_html_2_file(space_index_path, space_index_title,
                                        space_index_content, html_template)
        except utils.ConfluenceException as e:
            error_print('ERROR: %s' % e)
        except OSError:
            print('WARNING: The space %s has already been exported. '
                  'Maybe it is listed twice in the settings.' % space)

    # Finished output
    print_finished_output()


def fetch_page_recursively(page_id,
                           folder_path,
                           download_folder,
                           html_template,
                           depth=0,
                           page_duplicate_file_names=None,
                           page_file_matching=None,
                           attachment_duplicate_file_names=None,
                           attachment_file_matching=None):
    """ Fetches a Confluence page and its child pages (with referenced downloads).

    :param page_id: Confluence page id.
    :param folder_path: Folder to place downloaded pages in.
    :param download_folder: Folder to place downloaded files in.
    :param html_template: HTML template used to export Confluence pages.
    :param depth: (optional) Hierarchy depth of the handled Confluence page.
    :param page_duplicate_file_names: A dict in the structure {'<sanitized page filename>': amount of duplicates}
    :param page_file_matching: A dict in the structure {'<page title>': '<used offline filename>'}
    :param attachment_duplicate_file_names: A dict in the structure {'<sanitized attachment filename>': amount of \
                                            duplicates}
    :param attachment_file_matching: A dict in the structure {'<attachment title>': '<used offline filename>'}
    :returns: Information about downloaded files (pages, attachments, images, ...) as a dict, or None if an \
              error occurred.
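
    Example of the returned dict (illustrative values)::

        {'file_path': '123456.html',
         'page_title': 'Example Page',
         'child_pages': [...],          # dicts of the same structure
         'child_attachments': [...]}    # dicts returned by download_attachment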
    """
    if page_duplicate_file_names is None:
        page_duplicate_file_names = {}
    if page_file_matching is None:
        page_file_matching = {}
    if attachment_duplicate_file_names is None:
        attachment_duplicate_file_names = {}
    if attachment_file_matching is None:
        attachment_file_matching = {}
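    # The dicts above are deliberately created per top-level call (never as
    # mutable default arguments) and are shared across the whole recursion so
    # that file names stay unique within a space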

    page_url = '%s/rest/api/content/%s?expand=children.page,children.attachment,body.view.value' \
               % (settings.CONFLUENCE_BASE_URL, page_id)
    try:
        response = utils.http_get(
            page_url,
            auth=settings.HTTP_AUTHENTICATION,
            headers=settings.HTTP_CUSTOM_HEADERS,
            verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
            proxies=settings.HTTP_PROXIES)
        page_content = response['body']['view']['value']

        page_title = response['title']
        print('%sPAGE: %s (%s)' % ('\t' * (depth + 1), page_title, page_id))

        # Construct unique file name
        file_name = provide_unique_file_name(page_duplicate_file_names,
                                             page_file_matching,
                                             str(page_id),
                                             explicit_file_extension='html')

        # Remember this file and all children
        path_collection = {
            'file_path': file_name,
            'page_title': page_title,
            'child_pages': [],
            'child_attachments': []
        }

        # Download attachments of this page
        # TODO: Factor the following two pagination loops into a shared
        # helper; they contain a lot of duplicated code.
        page_url = '%s/rest/api/content/%s/child/attachment?limit=25' % (
            settings.CONFLUENCE_BASE_URL, page_id)
        while page_url:
            response = utils.http_get(
                page_url,
                auth=settings.HTTP_AUTHENTICATION,
                headers=settings.HTTP_CUSTOM_HEADERS,
                verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                proxies=settings.HTTP_PROXIES)
            for attachment in response['results']:
                download_url = attachment['_links']['download']
                # Attachment ids are prefixed with 'att' (e.g. 'att123456');
                # strip the prefix to get the plain id
                attachment_id = attachment['id'][3:]
                attachment_info = download_attachment(
                    download_url,
                    download_folder,
                    attachment_id,
                    attachment_duplicate_file_names,
                    attachment_file_matching,
                    depth=depth + 1)
                path_collection['child_attachments'].append(attachment_info)

            if 'next' in response['_links']:
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None

        # Export HTML file
        page_content = handle_html_references(page_content,
                                              page_duplicate_file_names,
                                              page_file_matching,
                                              depth=depth + 1)
        file_path = '%s/%s' % (folder_path, file_name)
        page_content += create_html_attachment_index(
            path_collection['child_attachments'])
        utils.write_html_2_file(file_path, page_title, page_content,
                                html_template)

        # Note: no separate "forward" file named after the page id is needed
        # here, because the exported file itself is already named after the
        # page id (see provide_unique_file_name above); creating one would
        # overwrite the page that was just written.

        # Iterate through all child pages
        page_url = '%s/rest/api/content/%s/child/page?limit=25' % (
            settings.CONFLUENCE_BASE_URL, page_id)
        while page_url:
            response = utils.http_get(
                page_url,
                auth=settings.HTTP_AUTHENTICATION,
                headers=settings.HTTP_CUSTOM_HEADERS,
                verify_peer_certificate=settings.VERIFY_PEER_CERTIFICATE,
                proxies=settings.HTTP_PROXIES)
            for child_page in response['results']:
                # Thread all four deduplication dicts through the recursion;
                # pages and attachments of the whole space share one folder
                # each, so the state must not be reset per child page
                paths = fetch_page_recursively(
                    child_page['id'],
                    folder_path,
                    download_folder,
                    html_template,
                    depth=depth + 1,
                    page_duplicate_file_names=page_duplicate_file_names,
                    page_file_matching=page_file_matching,
                    attachment_duplicate_file_names=attachment_duplicate_file_names,
                    attachment_file_matching=attachment_file_matching)
                if paths:
                    path_collection['child_pages'].append(paths)

            if 'next' in response['_links']:
                page_url = response['_links']['next']
                page_url = '%s%s' % (settings.CONFLUENCE_BASE_URL, page_url)
            else:
                page_url = None
        return path_collection

    except utils.ConfluenceException as e:
        error_print('%sERROR: %s' % ('\t' * (depth + 1), e))
        return None