def traverse(self, header, element):
    """Flatten the children of ``element`` into a list of download contents.

    The direct children are visited in order and converted as follows:

    * plain ``NavigableString`` -> ``TextBlogDownloadContent`` with the string
    * ``<p>``, ``<div>``, ``<span>`` -> recursed into, results appended in place
    * ``<b>`` -> ``TextBlogDownloadContent`` with the tag's text
    * ``<br>`` -> ``TextBlogDownloadContent`` with an empty string
    * ``<a>`` -> ``SessionBasedImageBlogDownloadContent`` when the href
      contains ``http://dcimg.awalker.jp``, otherwise a text entry of the
      form ``"<text> (<href>)"``; an anchor without an ``href`` attribute
      contributes only its text
    * ``<img>`` -> ``ImageBlogDownloadContent`` when ``get_generated_link``
      yields a non-empty link for the ``src`` attribute

    Any other node is ignored.

    :param header: header object passed through to every created content.
    :param element: node whose ``children`` are traversed.
    :return: list of created content objects, in document order.
    """
    contents = []
    for child in element.children:
        # Exact type checks are intentional: subclasses of NavigableString
        # (presumably bs4 comments and similar) must not be emitted as text.
        if type(child) is NavigableString:
            contents.append(TextBlogDownloadContent(header, child))
            continue
        if type(child) is not Tag:
            continue

        name = child.name
        if name in ('p', 'div', 'span'):
            contents.extend(self.traverse(header, child))
        elif name == 'b':
            contents.append(
                TextBlogDownloadContent(header, child.get_text()))
        elif name == 'br':
            contents.append(TextBlogDownloadContent(header, ''))
        elif name == 'a':
            href = child.get('href')
            if href is None:
                # Anchors without an href (e.g. named anchors) used to raise
                # TypeError on the substring test below; keep their text.
                contents.append(
                    TextBlogDownloadContent(header, child.get_text()))
            elif 'http://dcimg.awalker.jp' in href:
                contents.append(
                    SessionBasedImageBlogDownloadContent(
                        header, (href, 'original_image'), child))
            else:
                contents.append(
                    TextBlogDownloadContent(
                        header, f"{child.get_text()} ({href})"))
        elif name == 'img':
            generated = self.get_generated_link(child.get('src'))
            # Truthiness covers both an empty result and None, so a missing
            # link is skipped instead of failing inside len().
            if generated:
                contents.append(
                    ImageBlogDownloadContent(header, generated))
    return contents
def traverse(self, header, element):
    """Collect image contents for every ``<img>`` and ``<a>`` under ``element``.

    ``element.find_all(['img', 'a'])`` supplies the candidates. For each one
    the ``src`` (for ``img``) or ``href`` (for ``a``) attribute is passed to
    ``self.get_generated_link``; when that call returns something other than
    ``None`` an ``ImageBlogDownloadContent`` is created from it.

    :param header: header object passed through to every created content.
    :param element: node that is searched for ``img`` and ``a`` tags.
    :return: list of ``ImageBlogDownloadContent`` objects in search order.
    """
    # Attribute carrying the candidate link for each tag of interest.
    link_attribute = {'img': 'src', 'a': 'href'}

    collected = []
    for node in element.find_all(['img', 'a']):
        if type(node) is not Tag:
            continue
        attribute = link_attribute.get(node.name)
        if attribute is None:
            continue
        link = self.get_generated_link(node.get(attribute))
        if link is not None:
            collected.append(ImageBlogDownloadContent(header, link))
    return collected
def download_to_file(self, directory, index, on_save, on_except):
    """Save this content's image into ``directory``.

    When ``self.get_bit_content()`` returns data, the target path is built
    with ``self.format_download_url`` (using ``self.header.title``) and the
    data is handed to ``self.save_to_file``. When it returns ``None``, the
    first ``<img>`` inside ``self.element`` is downloaded instead through a
    plain ``ImageBlogDownloadContent``. Nothing happens when the content
    holds no image URL.

    :param directory: destination directory, forwarded unchanged.
    :param index: index of this content, forwarded unchanged.
    :param on_save: callback forwarded to the save/download call.
    :param on_except: callback forwarded to the save/download call.
    :return: ``None``.
    """
    # ``( image_url ) = self.content`` is not tuple unpacking: it bound the
    # whole content object. Sibling code reads the URL as ``self.content[0]``,
    # so take the first item when the content is a sequence.
    content = self.content
    image_url = content[0] if isinstance(content, (tuple, list)) and content else content
    if not image_url:
        return

    self.logger.debug(f'Image url is not empty. Building download path from {image_url}.')
    bit_content = self.get_bit_content()
    if bit_content is not None:
        download_url = self.format_download_url(directory, self.header.title, index)
        self.save_to_file(directory, download_url, bit_content, index, on_save, on_except)
        return

    # No data for the primary URL: fall back to the image embedded in the
    # element, if there is one.
    smaller_image = self.element.find('img')
    if smaller_image is not None:
        ImageBlogDownloadContent(
            self.header, smaller_image.get('src')
        ).download_to_file(directory, index, on_save, on_except)
def traverse(self, header, element):
    """Flatten the children of ``element`` into a list of download contents.

    The direct children are visited in order and converted as follows:

    * plain ``NavigableString`` -> ``TextBlogDownloadContent`` with the string
    * ``<p>``, ``<span>`` -> recursed into, results appended in place
    * ``<div>`` -> recursed into, followed by one empty text entry
    * ``<b>`` -> ``TextBlogDownloadContent`` with the tag's text
    * ``<img>`` -> ``ImageBlogDownloadContent`` with the ``src`` attribute
    * ``<br>`` -> ``TextBlogDownloadContent`` with an empty string
    * ``<a>`` -> the link target is fetched; when ``imghdr`` recognises the
      response body as one of ``VALID_PHOTO_EXTENSIONS`` an
      ``ImageBlogDownloadContent`` is created for the href, otherwise a
      text entry of the form ``"<text> (<href>)"``

    Any other node is ignored.

    :param header: header object passed through to every created content.
    :param element: node whose ``children`` are traversed.
    :return: list of created content objects, in document order.
    """
    # Upper bound, in seconds, for probing a single link. Without a timeout
    # one unresponsive host blocks the whole traversal indefinitely.
    probe_timeout = 30

    contents = []
    for child in element.children:
        if type(child) is NavigableString:
            contents.append(TextBlogDownloadContent(header, child))
            continue
        if type(child) is not Tag:
            continue

        name = child.name
        if name in ('p', 'span'):
            contents.extend(self.traverse(header, child))
        elif name == 'div':
            contents.extend(self.traverse(header, child))
            contents.append(TextBlogDownloadContent(header, ''))
        elif name == 'b':
            contents.append(
                TextBlogDownloadContent(header, child.get_text()))
        elif name == 'img':
            contents.append(
                ImageBlogDownloadContent(header, child.get('src')))
        elif name == 'br':
            contents.append(TextBlogDownloadContent(header, ''))
        elif name == 'a':
            href = child.get('href')
            links_to_image = False
            # An anchor without href has nothing to probe.
            if href is not None and check_valid_url_format(href):
                try:
                    response = requests.get(
                        href, allow_redirects=True, timeout=probe_timeout)
                except requests.RequestException:
                    # An unreachable link is not fatal: render it as text.
                    response = None
                if response is not None:
                    detected = imghdr.what(None, response.content)
                    links_to_image = detected in VALID_PHOTO_EXTENSIONS
            if links_to_image:
                contents.append(ImageBlogDownloadContent(header, href))
            else:
                contents.append(
                    TextBlogDownloadContent(
                        header, f"{child.get_text()} ({href})"))
    return contents
def format_download_url(self, directory, title, index):
    """Build the file path this content's image should be saved to.

    When ``self.get_bit_content()`` returns data, the file name is
    ``"<date>_<index> (<title>)<ext>"`` where ``<date>`` comes from
    ``self.header.date_to_string()``, ``<title>`` is passed through
    ``clean_file_separators`` and ``<ext>`` is derived from the data by
    ``get_extension_for_bit_content``; the name is joined onto ``directory``.

    When there is no data, the request is delegated to an
    ``ImageBlogDownloadContent`` built from the first ``<img>`` inside
    ``self.element``.

    :param directory: directory the file name is joined onto.
    :param title: title embedded in the file name.
    :param index: index embedded in the file name.
    :return: the download path, or ``None`` when there is neither data nor
        an ``<img>`` to fall back to.
    """
    date_prefix = self.header.date_to_string()
    raw_bytes = self.get_bit_content()

    if raw_bytes is None:
        fallback_tag = self.element.find('img')
        if fallback_tag is None:
            return None
        fallback = ImageBlogDownloadContent(self.header, fallback_tag.get('src'))
        return fallback.format_download_url(directory, title, index)

    extension = get_extension_for_bit_content(raw_bytes)
    self.logger.debug(f'Extension for image URL ({self.content[0]}): {extension}')
    file_name = '%s_%s (%s)%s' % (date_prefix, index, clean_file_separators(title), extension)
    target_path = join(directory, file_name)
    self.logger.debug(f'Download path for image URL {self.content[0]} created: {target_path}')
    return target_path
def download_to_document(self, document):
    """Insert this content's image into ``document``.

    When ``self.get_bit_content()`` returns data it is wrapped in a
    ``BytesIO`` and added with ``document.add_picture`` at a width of
    ``Inches(4)``. When it returns ``None``, the first ``<img>`` inside
    ``self.element`` is inserted instead through a plain
    ``ImageBlogDownloadContent``. If anything in that process raises, the
    image URL is added as a paragraph so the document still records it.
    Nothing happens when the content holds no image URL.

    :param document: target document; must provide ``add_picture`` and
        ``add_paragraph`` (presumably a python-docx ``Document``).
    :return: ``None``.
    """
    # ``( image_url ) = self.content`` is not tuple unpacking: it bound the
    # whole content object, so the fallback paragraph below received a tuple
    # instead of the URL. Sibling code reads the URL as ``self.content[0]``.
    content = self.content
    image_url = content[0] if isinstance(content, (tuple, list)) and content else content
    if not image_url:
        return

    try:
        bit_content = self.get_bit_content()
        if bit_content is not None:
            image = io.BytesIO(bit_content)
            document.add_picture(image, width=Inches(4))
        else:
            smaller_image = self.element.find('img')
            if smaller_image is not None:
                ImageBlogDownloadContent(
                    self.header, smaller_image.get('src')
                ).download_to_document(document)
    except Exception:
        # Deliberate best-effort: a failed fetch or insert must not abort the
        # document, so the URL is written in place of the picture.
        document.add_paragraph(image_url)
        self.logger.debug(f'Unable to fetch {image_url}. The URL was added instead.')