예제 #1
0
 def traverse(self, header, element):
     """Convert the direct children of ``element`` into download contents.

     Text nodes, ``b`` and ``br`` tags become ``TextBlogDownloadContent``
     entries, ``p``/``div``/``span`` tags are traversed recursively, ``img``
     tags become ``ImageBlogDownloadContent`` entries when a generated link
     is available, and ``a`` tags become either a
     ``SessionBasedImageBlogDownloadContent`` (links to the dcimg host) or a
     text entry. Any other tag is ignored.

     Args:
         header: Passed unchanged to every content object that is created.
         element: Node whose ``children`` are inspected.

     Returns:
         A list of content objects in document order.
     """
     contents = []
     for child in element.children:
         # Exact type checks (not isinstance) are kept from the original so
         # that NavigableString subclasses are not emitted as plain text.
         if type(child) is NavigableString:
             contents.append(TextBlogDownloadContent(header, child))
             continue
         if type(child) is not Tag:
             continue

         name = child.name
         if name in ('p', 'div', 'span'):
             # Pure containers: flatten their children into this list.
             contents.extend(self.traverse(header, child))
         elif name == 'b':
             contents.append(
                 TextBlogDownloadContent(header, child.get_text()))
         elif name == 'br':
             contents.append(TextBlogDownloadContent(header, ''))
         elif name == 'a':
             href = child.get('href')
             if href is None:
                 # An anchor without href used to raise TypeError on the
                 # substring test below; keep its visible text instead.
                 contents.append(
                     TextBlogDownloadContent(header, child.get_text()))
             elif 'http://dcimg.awalker.jp' in href:
                 contents.append(
                     SessionBasedImageBlogDownloadContent(
                         header, (href, 'original_image'), child))
             else:
                 contents.append(
                     TextBlogDownloadContent(
                         header, f"{child.get_text()} ({href})"))
         elif name == 'img':
             generated = self.get_generated_link(child.get('src'))
             # Truthiness also covers a None result, which len() would
             # reject. NOTE(review): confirm get_generated_link's "no link"
             # value (None vs. empty string).
             if generated:
                 contents.append(
                     ImageBlogDownloadContent(header, generated))
     return contents
예제 #2
0
 def traverse(self, header, element):
     """Collect image contents for every ``img`` and ``a`` under ``element``.

     For each matching tag the link attribute (``src`` for ``img``, ``href``
     for ``a``) is passed to ``self.get_generated_link``; every result that
     is not ``None`` is wrapped in an ``ImageBlogDownloadContent``.

     Args:
         header: Passed unchanged to every content object that is created.
         element: Node searched with ``find_all(['img', 'a'])``.

     Returns:
         A list of ``ImageBlogDownloadContent`` objects in match order.
     """
     # Attribute holding the candidate link, keyed by tag name.
     link_attribute = {'img': 'src', 'a': 'href'}
     collected = []
     for node in element.find_all(['img', 'a']):
         if type(node) is not Tag:
             continue
         attribute = link_attribute.get(node.name)
         if attribute is None:
             continue
         link = self.get_generated_link(node.get(attribute))
         if link is not None:
             collected.append(ImageBlogDownloadContent(header, link))
     return collected
예제 #3
0
 def download_to_file(self, directory, index, on_save, on_except):
     """Save this image to ``directory``, falling back to the inline ``img``.

     When ``self.get_bit_content()`` yields data, the target path is built
     with ``self.format_download_url`` and the data is handed to
     ``self.save_to_file``. Otherwise the first ``img`` inside
     ``self.element`` (if any) is downloaded through a plain
     ``ImageBlogDownloadContent``. Nothing happens when the URL is empty.

     Args:
         directory: Forwarded to ``format_download_url`` and ``save_to_file``.
         index: Forwarded to ``format_download_url`` and ``save_to_file``.
         on_save: Forwarded to ``save_to_file``.
         on_except: Forwarded to ``save_to_file``.
     """
     # ``( image_url ) = self.content`` was a plain assignment, not an
     # unpacking, so a ``(url, kind)`` content was tested and logged as a
     # whole. Pull out the URL, still accepting a bare string.
     content = self.content
     if isinstance(content, str):
         image_url = content
     else:
         image_url = content[0] if content else None
     if not image_url:
         return

     self.logger.debug(f'Image url is not empty. Building download path from {image_url}.')
     bit_content = self.get_bit_content()
     if bit_content is not None:
         download_url = self.format_download_url(directory, self.header.title, index)
         self.save_to_file(directory, download_url, bit_content, index, on_save, on_except)
         return

     # No data for the linked image: use the image embedded in the link.
     smaller_image = self.element.find('img')
     if smaller_image is not None:
         ImageBlogDownloadContent(
             self.header, smaller_image.get('src')
         ).download_to_file(directory, index, on_save, on_except)
예제 #4
0
 def traverse(self, header, element):
     """Convert the direct children of ``element`` into download contents.

     Text nodes, ``b`` and ``br`` tags become ``TextBlogDownloadContent``
     entries, ``img`` tags become ``ImageBlogDownloadContent`` entries, and
     ``p``/``div``/``span`` tags are traversed recursively (a ``div`` is
     followed by an empty text entry). An ``a`` tag is fetched when its
     ``href`` passes ``check_valid_url_format``; if the response body is a
     recognised image it becomes an image entry, otherwise a text entry of
     the form ``"<text> (<href>)"``. Any other tag is ignored.

     Args:
         header: Passed unchanged to every content object that is created.
         element: Node whose ``children`` are inspected.

     Returns:
         A list of content objects in document order.
     """
     # Upper bound in seconds for probing a link; without it a single
     # unresponsive host blocks the whole traversal.
     probe_timeout = 10

     contents = []
     for child in element.children:
         # Exact type checks (not isinstance) are kept from the original so
         # that NavigableString subclasses are not emitted as plain text.
         if type(child) is NavigableString:
             contents.append(TextBlogDownloadContent(header, child))
             continue
         if type(child) is not Tag:
             continue

         name = child.name
         if name in ('p', 'span'):
             contents.extend(self.traverse(header, child))
         elif name == 'div':
             contents.extend(self.traverse(header, child))
             contents.append(TextBlogDownloadContent(header, ''))
         elif name == 'b':
             contents.append(
                 TextBlogDownloadContent(header, child.get_text()))
         elif name == 'img':
             contents.append(
                 ImageBlogDownloadContent(header, child.get('src')))
         elif name == 'br':
             contents.append(TextBlogDownloadContent(header, ''))
         elif name == 'a':
             href = child.get('href')
             is_image = False
             if check_valid_url_format(href):
                 try:
                     response = requests.get(
                         href, allow_redirects=True, timeout=probe_timeout)
                 except requests.RequestException:
                     # An unreachable link is kept as text rather than
                     # aborting the traversal of the whole element.
                     is_image = False
                 else:
                     is_image = (imghdr.what(None, response.content)
                                 in VALID_PHOTO_EXTENSIONS)
             if is_image:
                 contents.append(ImageBlogDownloadContent(header, href))
             else:
                 contents.append(
                     TextBlogDownloadContent(
                         header, f"{child.get_text()} ({href})"))
     return contents
예제 #5
0
 def format_download_url(self, directory, title, index):
     """Build the path under ``directory`` that this image is saved to.

     The file name is ``"<date>_<index> (<cleaned title>)<extension>"``,
     where the date comes from ``self.header.date_to_string()`` and the
     extension is guessed from the data returned by
     ``self.get_bit_content()``.

     Args:
         directory: Directory the file name is joined onto.
         title: Title embedded in the file name after
             ``clean_file_separators`` is applied.
         index: Value embedded in the file name after the date.

     Returns:
         The joined path. When no data is available, the path computed by
         an ``ImageBlogDownloadContent`` for the first ``img`` inside
         ``self.element``, or ``None`` if there is no such ``img``.
     """
     date_prefix = self.header.date_to_string()
     payload = self.get_bit_content()

     if payload is None:
         # No data for the linked image: use the image embedded in the link.
         fallback = self.element.find('img')
         if fallback is None:
             return None
         substitute = ImageBlogDownloadContent(self.header, fallback.get('src'))
         return substitute.format_download_url(directory, title, index)

     guessed_ext = get_extension_for_bit_content(payload)
     self.logger.debug(f'Extension for image URL ({self.content[0]}): {guessed_ext}')
     name_parts = (date_prefix, index, clean_file_separators(title), guessed_ext)
     download_url = join(directory, '%s_%s (%s)%s' % name_parts)
     self.logger.debug(f'Download path for image URL {self.content[0]} created: {download_url}')
     return download_url
예제 #6
0
 def download_to_document(self, document):
     """Insert this image into ``document``, or its URL if that fails.

     When ``self.get_bit_content()`` yields data it is added as a picture
     four inches wide. Otherwise the first ``img`` inside ``self.element``
     (if any) is inserted through a plain ``ImageBlogDownloadContent``. If
     anything in that process raises, the image URL is added as a paragraph
     instead. Nothing happens when the URL is empty.

     Args:
         document: Object providing ``add_picture`` and ``add_paragraph``.
     """
     # ``( image_url ) = self.content`` was a plain assignment, not an
     # unpacking, so a ``(url, kind)`` content reached ``add_paragraph`` as a
     # whole tuple. Pull out the URL, still accepting a bare string.
     content = self.content
     if isinstance(content, str):
         image_url = content
     else:
         image_url = content[0] if content else None
     if not image_url:
         return

     try:
         bit_content = self.get_bit_content()
         if bit_content is not None:
             image = io.BytesIO(bit_content)
             document.add_picture(image, width=Inches(4))
         else:
             # No data for the linked image: use the embedded image.
             smaller_image = self.element.find('img')
             if smaller_image is not None:
                 ImageBlogDownloadContent(
                     self.header, smaller_image.get('src')
                 ).download_to_document(document)
     except Exception:
         # Deliberate best effort: a failed image must not abort the
         # document, so the URL is written in its place.
         document.add_paragraph(image_url)
         self.logger.debug(f'Unable to fetch {image_url}. The URL was added instead.')