def process_text(original_text, zim_archive_path): """ Core function, process the whole text """ link = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#~]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') #get a copy without noarchive stuffs copy_text = strip_noarchive(original_text) #get all URLs urls = link.findall(copy_text) for url in urls: #Is it already archived? logging.debug('url: ' + str(url)) if not editline.link_archive_status(url, copy_text): file_uuid = uuid.uuid4() html_file_path = os.path.join(str(zim_archive_path), str(file_uuid) + ".html" ) try: #archive.make_archive(html_file_path, url) #new version: archive.make_archive_thread(zim_archive_path, file_uuid, url) except archive.URLError: logging.error('URLError: ' + str(url)) #TODO pass else: #We successfully get the page #We change the line logging.debug('Add label') original_text = editline.add_label(html_file_path, url, original_text) else: logging.debug('Already archived') return original_text
def process_text(original_text, zim_archive_path): """ Core function, process the whole text: * Look for the URLs in the text * Download the content * Add internal links in the text The function returns a status (bool) indicating if true that something goes wrong; and the remplacing text. :returns: Tuple (boolean, string) """ urls = _get_unarchived_urls(original_text) errors = False for url in urls: #Is it already archived? logger.debug('url: ' + str(url)) if not editline.link_archive_status(url, copy_text): file_uuid = uuid.uuid4() try: #archive.make_archive(html_file_path, url) #new version: extension = archive.make_archive_thread(zim_archive_path, file_uuid, url) except archive.URLError: logger.error('URLError: ' + str(url) + ' Not archived.') errors = True else: #We successfully get the page #We change the line logger.debug('Add label') file_path = os.path.join(str(zim_archive_path), str(file_uuid) + str(extension) ) original_text = editline.add_label(file_path, url, original_text) else: logger.debug('Already archived') return (errors, original_text)