Example #1
class BemihoResetProcessor(BemihoProcessor):
    def __init__(self, user_input):
        super().__init__(user_input)
        self.group = user_input.group
        self.member = user_input.member
        self.output = user_input.output
        self.output_path = self.format_path()
        self.logger = BemihoLogger(self.__class__).get_logger()

    def format_path(self):
        group = self.group.kanji
        group_romaji = self.group.romaji
        member = self.member.kanji
        member_romaji = self.member.romaji
        return join(self.output, f"{group} ({group_romaji})", f"{member} ({member_romaji})")

    def start(self):
        self.logger.debug(f'Starting reset for member {self.member.kanji} ({self.member.romaji}) from {self.group.kanji} ({self.group.romaji}) located on {self.output_path}')
        if exists(self.output_path):
            self.logger.debug('Output path located. Resetting.')
            try:
                for entry_name in os.listdir(self.output_path):
                    joined_file_path = join(self.output_path, entry_name)
                    if os.path.isfile(joined_file_path):
                        os.unlink(joined_file_path)
                    elif os.path.isdir(joined_file_path):
                        shutil.rmtree(joined_file_path)
                self.logger.debug(f'Reset successful for {self.output_path}')
            except Exception:
                self.logger.error('Unable to reset due to an unexpected error.', exc_info=True)
        else:
            self.logger.debug("Output path doesn't exist. Terminating.")
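A minimal usage sketch, assuming BemihoProcessor.__init__ only stores the input and that user_input exposes the group, member, and output attributes read above (SimpleNamespace stands in for the real input classes):

from types import SimpleNamespace

# Hypothetical stand-ins for Bemiho's real input objects.
group = SimpleNamespace(kanji='グループ', romaji='Group')
member = SimpleNamespace(kanji='メンバー', romaji='Member')
user_input = SimpleNamespace(group=group, member=member, output='output')

# Deletes everything under <output>/<group>/<member> if the path exists.
BemihoResetProcessor(user_input).start()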
Example #2
class NoHTMLTextOutputProcessor(ScrapperOutputProcessor):
    content = 'no_html'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return NoHTMLTextMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Blog data count: {len(blog_datas)}.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = blog_data.contents
            self.logger.debug(
                f'Saving text only contents from {header.title} with content count {len(contents)}.'
            )
            for download_content in contents:
                self.save_to_file_and_metadata(header, download_content)
            self.metadata_handler.save_metadata()

    def on_save(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = True
        self.metadata_handler.add_to_metadata(header, content_data)

    def on_except(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = False
        self.metadata_handler.add_to_metadata(header, content_data)

    def save_to_file_and_metadata(self, header, download_content):
        download_url = download_content.get_text_file_path(self.member_path)
        try:
            content_data = self.metadata_handler.build_content_object_from_data(
                download_url=download_url, successful=False)
            if not self.metadata_handler.check_duplicates(header, content_data):
                download_content.download_to_text_file(
                    self.member_path,
                    lambda file_path: self.on_save(header, content_data, file_path),
                    lambda file_path: self.on_except(header, content_data, file_path))
            else:
                self.logger.debug(
                    f'Duplicate found for {header.title}. Cancelling download')
        except Exception:
            self.logger.error(
                f'Download of no_html from {header.link} to {download_url} was unsuccessful.',
                exc_info=True)
Example #3
class PhotosOutputProcessor(ScrapperOutputProcessor):
    content = 'photos'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return PhotosMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(f'Starting saving photos content to {self.member_path}.')
        self.logger.debug(f'Blog data count: {len(blog_datas)}.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = [
                content for content in blog_data.contents
                if type(content) in (ImageBlogDownloadContent, SessionBasedImageBlogDownloadContent)
            ]
            self.logger.debug(f'Saving contents from {header.title} with content count {len(contents)}.')
            for (index, download_content) in enumerate(contents):
                self.download_file(header, index, download_content)
        self.metadata_handler.save_metadata()

    def on_save(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = True
        self.metadata_handler.add_to_metadata(header, content_data)

    def on_except(self, header, content_data, file_path):
        content_data.download_url = file_path
        content_data.successful = False
        self.metadata_handler.add_to_metadata(header, content_data)

    def download_file(self, header, index, download_content):
        image_url = download_content.content
        download_url = download_content.format_download_url(self.member_path, header.title, index)
        metadata_content = self.metadata_handler.build_content_object_from_data(image_url=image_url, download_url=download_url, successful=True)
        try:
            if self.metadata_handler.check_duplicates(header, metadata_content):
                self.logger.debug(f'Duplicate found. Download from {image_url} to {download_url} is cancelled.')
            else:
                metadata_content.download_url = download_content.format_download_url(self.member_path, clean_file_name(header.title), index)
                if self.metadata_handler.check_duplicates(header, metadata_content):
                    self.logger.debug(f'Duplicate found for the cleaned file name. Download from {image_url} to {download_url} is cancelled.')
                else:
                    download_content.download_to_file(
                        self.member_path, index,
                        lambda file_path: self.on_save(header, metadata_content, file_path),
                        lambda file_path: self.on_except(header, metadata_content, file_path))
                    self.metadata_handler.add_to_metadata(header, metadata_content)
        except Exception:
            self.logger.error(f'Download from {image_url} to {download_url} was unsuccessful.', exc_info=True)
Example #4
class AllOutputProcessor(ScrapperOutputProcessor):
    content = 'all'
    
    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()
        other_processors = []
        self.logger.debug('Collecting the other output processors for the "all" content type.')
        self.other_processors_blog_datas = {}
        for output_p in get_output_processor_classes_for_content_except(self.content):
            other_processors.append(output_p(user_input))
            self.other_processors_blog_datas[output_p.content] = []
        self.other_processors = other_processors
        self.logger.debug(f'Found the following other output processor classes: {other_processors}')

    def get_metadata_handler_class(self, user_input, member_path):
        pass

    def create_output_directory(self):
        for processor in self.other_processors:
            processor.create_output_directory()

    def do_blog_datas_remapping(self, blog_datas):
        self.logger.debug('Remapping blog data per content type for the "all" output processor.')
        for blog_data in blog_datas:
            header = blog_data.header
            contents = blog_data.contents
            for content in contents:
                for content_key in content.keys():
                    self.other_processors_blog_datas[content_key].append(BlogData(header, content[content_key]))

    def process_blog_data(self, blog_datas):
        self.logger.debug('Starting blog data processing for the "all" processor. One thread is created per output processor.')
        self.do_blog_datas_remapping(blog_datas)
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = []
            for processor in self.other_processors:
                self.logger.debug(f'Starting thread execution for processing {processor.content} content.')
                futures.append(executor.submit(processor.process_blog_data, self.other_processors_blog_datas[processor.content]))
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception:
                    self.logger.error("Exception occurred on thread", exc_info=True)
Example #5
class BemihoScrapProcessor(BemihoProcessor):
    def __init__(self, user_input, output_processor_class):
        self.user_input = user_input
        self.traversal = get_traversal_based_on_content_request(user_input)
        self.scrapper_class = get_scrapper_class_based_on_input(user_input)

        self.output_processor = output_processor_class(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def execute_single_scraper(self, page_number):
        content = self.user_input.content
        self.logger.debug(f'Starting fetch of {content} for page {page_number}')
        scrapper = self.scrapper_class(self.user_input, page_number,
                                       self.traversal)
        blog_data = scrapper.start_web_scrape()
        self.output_processor.process_blog_data(blog_data)
        return page_number

    def start(self):
        group = self.user_input.group
        member = self.user_input.member
        firstpage = self.user_input.firstpage
        number_of_pages = self.user_input.number_of_pages
        content = self.user_input.content
        self.logger.debug(
            f'Starting scrape process for {member.kanji} ({member.romaji}) from {group.kanji} ({group.romaji}) with content {content}, fetching {number_of_pages} page(s) starting from page {firstpage}'
        )
        self.output_processor.create_output_directory()
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            page_index = self.scrapper_class.get_proper_page_index(firstpage)
            for page_number in range(page_index, page_index + number_of_pages):
                futures.append(
                    executor.submit(self.execute_single_scraper, page_number))
            for future in as_completed(futures):
                try:
                    data = future.result()
                    self.logger.debug(
                        f"Successfully fetched {content} data for page {data}")
                except Exception:
                    self.logger.error("Exception occurred on thread",
                                      exc_info=True)
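A minimal usage sketch; in the real flow the factory create_bemiho_processor (see Example #8) picks the output processor, but wiring one by hand under that assumption looks like this:

# Hypothetical manual wiring; normally done by create_bemiho_processor.
processor = BemihoScrapProcessor(user_input, PhotosOutputProcessor)
processor.start()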
Example #6
class LineBlogGroupService(LineBlogService):
    def __init__(self, url, page_number, author, traversal):
        self.url = url
        self.page_number = page_number
        self.author = author
        self.logger = BemihoLogger(self.__class__).get_logger()
        self.traversal = traversal

    def scrape_single_url(self, header):
        contents = []
        self.logger.debug(
            f'Extracting data from {header.link} by {header.author}')
        response = requests.get(header.link)
        soup = BeautifulSoup(response.text, 'lxml')
        for article in soup.find_all('article', class_='first-article'):
            article_body = article.find('div', class_='article-body')
            article_body_inner = article_body.find('div',
                                                   class_='article-body-inner')
            contents = self.traversal.traverse(header, article_body_inner)
            self.logger.debug(
                f'Contents extracted from {header.link} with size {len(contents)}'
            )
        return BlogData(header, contents)

    def serve_contents(self):
        contents = []
        futures = []
        headers = LineBlogApiCrawler(self.url, self.page_number,
                                     self.author).crawl_api_for_headers()
        self.logger.debug(
            f'Headers extracted from api url {self.url} with size {len(headers)}. Proceeding to fetch data.'
        )
        with ThreadPoolExecutor(max_workers=5) as executor:
            for header in headers:
                futures.append(executor.submit(self.scrape_single_url, header))
            for future in as_completed(futures):
                try:
                    contents.append(future.result())
                except Exception:
                    self.logger.error("Exception occurred on thread",
                                      exc_info=True)
        return contents
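A sketch of driving the group service directly, assuming the constructor arguments shown above; url and author are placeholders, and traversal would come from get_traversal_based_on_content_request:

# Hypothetical invocation; url and author are placeholders.
service = LineBlogGroupService(url, page_number=1, author=author, traversal=traversal)
blog_datas = service.serve_contents()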
Example #7
class BlogEntryOutputProcessor(ScrapperOutputProcessor):
    content = 'blog'

    def __init__(self, user_input):
        super().__init__(user_input)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def get_metadata_handler_class(self, user_input, member_path):
        return BlogMetadataHandler(user_input, member_path)

    def process_blog_data(self, blog_datas):
        self.logger.debug(
            f'Starting saving blog content to {self.member_path}.')
        directory = self.member_path
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for blog_data in blog_datas:
                self.logger.debug(
                    'Starting thread execution for building document.')
                futures.append(
                    executor.submit(self.build_document, directory, blog_data))
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception:
                    self.logger.error("Exception occurred on thread",
                                      exc_info=True)
        self.metadata_handler.save_metadata()

    def build_document(self, directory, blog_data):
        content_data = None
        header = blog_data.header
        contents = blog_data.contents
        date_string = header.date.strftime("%Y.%m.%d")
        document_path = join(
            directory,
            f"{date_string} ({clean_file_separators(header.title)}).docx")

        try:
            content_data = self.metadata_handler.build_content_object_from_data(
                download_url=document_path, successful=False)
            self.save_to_document(header, contents, content_data,
                                  document_path)
        except OSError as os_error:
            if os_error.errno == errno.EILSEQ:
                document_path = join(
                    directory,
                    f"{date_string} ({clean_file_name(header.title)}).docx")
                content_data = self.metadata_handler.build_content_object_from_data(
                    download_url=document_path, successful=False)
                self.save_to_document(header, contents, content_data,
                                      document_path)
            else:
                raise
        except Exception:
            content_data = self.metadata_handler.build_content_object_from_data(
                download_url=document_path, successful=False)
            self.metadata_handler.add_to_metadata(header, content_data)
            self.logger.error(
                f'Download from {header.link} to {document_path} was unsuccessful.',
                exc_info=True)

    def save_to_document(self, header, contents, content_data, document_path):
        if not self.metadata_handler.check_duplicates(header, content_data):
            document = Document()
            paragraph_format = document.styles['Normal'].paragraph_format
            paragraph_format.line_spacing = 1

            HeaderDocumentModifier(header.title,
                                   level=1).change_document(document)
            HeaderDocumentModifier(header.date.strftime("%Y-%m-%d %H:%M:%S"),
                                   level=4).change_document(document)
            HeaderDocumentModifier(header.link,
                                   level=4).change_document(document)

            for content in contents:
                content.download_to_document(document)
            document.save(document_path)
            content_data.successful = True
            self.metadata_handler.add_to_metadata(header, content_data)
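HeaderDocumentModifier is not shown in this example; a plausible minimal version consistent with its use above and with python-docx's add_heading, offered as an assumption rather than the project's actual class:

class HeaderDocumentModifier:
    # Hypothetical sketch; the real class lives elsewhere in the codebase.
    def __init__(self, text, level=1):
        self.text = text
        self.level = level

    def change_document(self, document):
        # python-docx: append a heading paragraph at the given outline level.
        document.add_heading(self.text, level=self.level)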
Example #8
import time

from output_processor.exceptions import OutputProcessorNotFound
from scrapper.traversal.exceptions import TraversalClassNotFound

from processor import create_bemiho_processor
from utilities.text import seconds_to_minutes_format

if __name__ == '__main__':
    logger = BemihoLogger('bemiho').get_logger()
    start = time.time()
    try:
        logger.info('Starting Bemiho.')
        user_input = get_user_input()
        processor = create_bemiho_processor(user_input)
        processor.start()
    except (JSONDataNotFound, PageNumberNotDigits, NumberOfPageShouldBeAtLeastOne, InvalidContentInput):
        logger.error("There were exceptions in acquiring data", exc_info=True)
    except OutputProcessorNotFound as oe:
        logger.error(oe.message, exc_info=True)
    except TraversalClassNotFound as te:
        logger.error(te.message, exc_info=True)
    except KeyboardInterrupt:
        logger.debug("User stopped the application.")
    except Exception:
        logger.error('Uncaught exception occurred', exc_info=True)
    finally:
        end = time.time()
        total_seconds = end - start
        logger.debug('Stopped Bemiho.')
        logger.info(f'Duration: {seconds_to_minutes_format(total_seconds)}')
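seconds_to_minutes_format comes from utilities.text; a plausible sketch of such a helper (the real implementation may differ):

def seconds_to_minutes_format(total_seconds):
    # Hypothetical helper: render elapsed seconds as "M minute(s) S second(s)".
    minutes, seconds = divmod(int(total_seconds), 60)
    return f'{minutes} minute(s) {seconds} second(s)'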
Example #9
class SessionBasedImageBlogDownloadContent(BlogDownloadContent):
    def __init__(self, header, content, element):
        super().__init__(header, content)
        self.element = element
        self.session_img_service = SessionImageService()
        self.session_img_service.start()
        self.bit_content = None
        self.logger = BemihoLogger(self.__class__).get_logger()

    def download_to_file(self, directory, index, on_save, on_except):
        image_url, _ = self.content  # content is an (image_url, image_selector) tuple
        if image_url:
            self.logger.debug(f'Image url is not empty. Building download path from {image_url}.')
            bit_content = self.get_bit_content()
            if bit_content is not None:
                download_url = self.format_download_url(directory, self.header.title, index)
                self.save_to_file(directory, download_url, bit_content, index, on_save, on_except)
            else:
                smaller_image = self.element.find('img')
                if smaller_image is not None:
                    ImageBlogDownloadContent(self.header, smaller_image.get('src')).download_to_file(
                        directory, index, on_save, on_except)
    
    def format_download_url(self, directory, title, index):
        header_date_string = self.header.date_to_string()
        bit_content = self.get_bit_content()
        if bit_content is not None:
            guessed_ext = get_extension_for_bit_content(bit_content)
            self.logger.debug(f'Extension for image URL ({self.content[0]}): {guessed_ext}')
            download_url = join(directory, '%s_%s (%s)%s' % (header_date_string, index, clean_file_separators(title), guessed_ext))
            self.logger.debug(f'Download path for image URL {self.content[0]} created: {download_url}')
            return download_url
        else:
            smaller_image = self.element.find('img')
            if smaller_image is not None:
                return ImageBlogDownloadContent(self.header, smaller_image.get('src')).format_download_url(
                    directory, title, index)

    def save_to_file(self, directory, download_url, bit_content, index, on_save, on_except):
        try:
            with open(download_url, 'wb') as download_file:
                download_file.write(bit_content)
            on_save(download_url)
        except OSError as os_err:
            if os_err.errno == errno.EILSEQ:  # illegal byte sequence in the file name
                rollback_save_url = self.format_download_url(directory, clean_file_name(self.header.title), index)
                self.logger.error(f'Download from {self.content} to {download_url} was unsuccessful due to an illegal byte sequence in the file name. Retrying with a cleaned name ({rollback_save_url}).', exc_info=True)
                self.save_to_file(directory, rollback_save_url, bit_content, index, on_save, on_except)
            else:
                on_except(download_url)
                raise os_err
        except Exception:
            on_except(download_url)
            raise
    
    def download_to_document(self, document):
        image_url, _ = self.content  # content is an (image_url, image_selector) tuple
        if image_url:
            try:
                bit_content = self.get_bit_content()
                if bit_content is not None:
                    image = io.BytesIO(bit_content)
                    document.add_picture(image, width=Inches(4))
                else:
                    smaller_image = self.element.find('img')
                    if smaller_image is not None:
                        ImageBlogDownloadContent(self.header, smaller_image.get('src')).download_to_document(document)
            except Exception:
                document.add_paragraph(image_url)
                self.logger.debug(f'Unable to fetch {image_url}. The URL was added instead.')

    def get_bit_content(self):
        # Cache the downloaded bytes so repeated calls don't re-fetch the image.
        if self.bit_content is None:
            image_url, image_selector = self.content
            self.bit_content = self.session_img_service.get_image_content(image_url, image_selector)
        return self.bit_content

    def clear(self):
        self.session_img_service.stop()
Example #10
class ImageBlogDownloadContent(BlogDownloadContent):
    def __init__(self, header, content):
        super().__init__(header, content)
        self.logger = BemihoLogger(self.__class__).get_logger()

    def download_to_file(self, directory, index, on_save, on_except):
        image_url = self.content
        if image_url:
            self.logger.debug(
                f'Image url is not empty. Building download path from {image_url}.'
            )
            download_url = self.format_download_url(directory,
                                                    self.header.title, index)
            self.save_to_file(directory, download_url, index, on_save,
                              on_except)

    def format_download_url(self, directory, title, index):
        image_url = self.content
        header_date_string = self.header.date_to_string()
        guessed_ext = get_extension_for_image(image_url)
        self.logger.debug(
            f'Extension for image URL ({image_url}): {guessed_ext}')
        save_url = join(
            directory,
            '%s_%s (%s)%s' % (header_date_string, index,
                              clean_file_separators(title), guessed_ext))
        self.logger.debug(
            f'Download path for image URL {image_url} created: {save_url}')
        return save_url

    def save_to_file(self, directory, download_url, index, on_save, on_except):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        try:
            request = requests.get(self.content,
                                   allow_redirects=True,
                                   headers=headers)
            with open(download_url, 'wb') as download_file:
                download_file.write(request.content)
            on_save(download_url)
        except OSError as os_err:
            if os_err.errno == errno.EILSEQ:
                rollback_save_url = self.format_download_url(
                    directory, clean_file_name(self.header.title), index)
                self.logger.error(
                    f'Download from {self.content} to {download_url} was unsuccessful due to an illegal byte sequence in the file name. Retrying with a cleaned name ({rollback_save_url}).'
                )
                self.save_to_file(directory, rollback_save_url, index, on_save,
                                  on_except)
            else:
                on_except(download_url)
                raise os_err
        except Exception:
            on_except(download_url)
            raise

    def download_to_document(self, document):
        image_content = self.content
        if image_content:
            try:
                response = requests.get(image_content, stream=True)
                image = io.BytesIO(response.content)
                document.add_picture(image, width=Inches(4))
            except Exception:
                document.add_paragraph(image_content)
                self.logger.debug(
                    f'Unable to fetch {image_content}. The URL was added instead.'
                )
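A minimal sketch of driving ImageBlogDownloadContent directly, assuming a header object with the title and date_to_string() used above; the callbacks only report the outcome:

# Hypothetical direct use; header is a placeholder blog-header object.
content = ImageBlogDownloadContent(header, 'https://example.com/photo.jpg')
content.download_to_file(
    'output/member',  # target directory
    0,                # index used in the generated file name
    lambda path: print(f'saved to {path}'),
    lambda path: print(f'failed at {path}'))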