Example #1
import logging
import os
import re
import uuid

import archive   # project-local module
import editline  # project-local module


def process_text(original_text, zim_archive_path):
    """
    Core function: process the whole text.
    """
    # Matches http(s) URLs; the $-_ class range covers most URL-safe ASCII characters
    link = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#~]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # Get a copy with the noarchive sections stripped out
    # (strip_noarchive is defined elsewhere in the same module)
    copy_text = strip_noarchive(original_text)

    # Find all URLs in the stripped copy
    urls = link.findall(copy_text)

    for url in urls:
        # Is it already archived?
        logging.debug('url: ' + str(url))
        if not editline.link_archive_status(url, copy_text):
            file_uuid = uuid.uuid4()
            html_file_path = os.path.join(str(zim_archive_path), str(file_uuid) + '.html')
            try:
                # Old version: archive.make_archive(html_file_path, url)
                archive.make_archive_thread(zim_archive_path, file_uuid, url)
            except archive.URLError:
                logging.error('URLError: ' + str(url))
                # TODO: report the failure to the caller
            else:
                # We successfully got the page; rewrite the line with an internal link
                logging.debug('Add label')
                original_text = editline.add_label(html_file_path, url, original_text)
        else:
            logging.debug('Already archived')
    return original_text
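
A minimal usage sketch for Example #1. The file name, archive path, and read/write flow below are assumptions for illustration, not part of the original project:

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    # Hypothetical input file and archive directory
    with open('notes.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    # Archive each new URL and rewrite the text with internal links
    text = process_text(text, '/path/to/zim_archive')
    with open('notes.txt', 'w', encoding='utf-8') as f:
        f.write(text)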
Example #2
import logging
import os
import uuid

import archive   # project-local module
import editline  # project-local module

logger = logging.getLogger(__name__)  # module-level logger used below


def process_text(original_text, zim_archive_path):
    """
    Core function: process the whole text.

    * Look for the URLs in the text
    * Download the content
    * Add internal links in the text

    Returns a status (bool) that is True if something went wrong,
    together with the replacement text.

    :returns: Tuple (bool, str)
    """

    urls = _get_unarchived_urls(original_text)
    errors = False

    for url in urls:
        # Is it already archived?
        logger.debug('url: ' + str(url))
        if not editline.link_archive_status(url, original_text):  # 'copy_text' was undefined in this version
            file_uuid = uuid.uuid4()
            try:
                # Old version: archive.make_archive(html_file_path, url)
                extension = archive.make_archive_thread(zim_archive_path, file_uuid, url)
            except archive.URLError:
                logger.error('URLError: ' + str(url) + '. Not archived.')
                errors = True
            else:
                # We successfully got the page; rewrite the line with an internal link
                logger.debug('Add label')
                # make_archive_thread returns the file extension (e.g. '.html')
                file_path = os.path.join(str(zim_archive_path), str(file_uuid) + str(extension))
                original_text = editline.add_label(file_path, url, original_text)
        else:
            logger.debug('Already archived')
    return (errors, original_text)
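
Example #2 factors URL discovery out into _get_unarchived_urls, which is not shown on this page. Below is a hypothetical reconstruction, assuming it simply extracts candidate URLs from the noarchive-stripped text the way Example #1 does inline (it relies on import re and the strip_noarchive() helper from Example #1); the real project helper may differ:

def _get_unarchived_urls(original_text):
    """Hypothetical sketch: collect the URLs found in the text.

    Reuses the regex and the strip_noarchive() call from Example #1;
    the loop in process_text still checks the archive status itself.
    """
    link = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#~]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    copy_text = strip_noarchive(original_text)  # drop noarchive sections first
    return link.findall(copy_text)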