Пример #1
0
 def __init__(self, state, base_output_path, current_time, file_format,
              content=None):
     """Set up the StateParser for a single state.

     The state page is downloaded from ``STATE_URL % state`` unless
     ``content`` is supplied; any falsy ``content`` counts as not supplied.

     :param state: value interpolated into STATE_URL; kept on the instance.
     :param base_output_path: forwarded to the parent initialiser.
     :param current_time: forwarded to the parent initialiser.
     :param file_format: forwarded to the parent initialiser.
     :param content: optional pre-fetched content of the state page.
     """
     self.state = state
     self.regex = STATION_REGEX
     # The URL is built up front, even when it ends up unused.
     state_url = STATE_URL % state
     if content:
         self.content = content
     else:
         self.content = utils.download_content(state_url)
     super(StateParser, self).__init__(
         base_output_path, current_time, file_format)
 def download_chapters_list(self):
     """Fetch the index page and collect one Chapter per chapter link.

     The page at ``self.index_iri`` is downloaded and scanned for anchor
     tags of the form ``<a style="" href="...">...</a>``. Each match is
     wrapped in a ``Chapter`` and appended to ``self.chapter_list``. An
     error is logged when the page contains no such anchor.

     :return: list of the matched anchor HTML strings (empty if none).
     """
     html = download_content(self.index_iri)
     # Prefix handed to every Chapter: the index address with "all.html"
     # stripped out.
     page_prefix = self.index_iri.replace("all.html", "")
     search = re.findall(r'<a style="" href=".*?<\/a>', html)
     if search:
         self.chapter_list.extend(
             Chapter(title_html, page_prefix) for title_html in search
         )
     else:
         logging.error(
             f'Failed to fetch this books chapter list from {self.index_iri}'
         )
     return search
 def download_chapters_content(self):
     """Download, process and save every chapter in ``self.chapter_list``.

     Chapters are saved into a directory named after ``self.title``, which
     is created if it does not exist yet. After each chapter the method
     sleeps for a random whole number of seconds between 0 and 4.
     """
     # To avoid being detected as a script(?)
     random.seed()
     # Create the directory where chapters are saved, and derive the
     # absolute path from that same Path object. Building it from a
     # hard-coded ".\\" prefix only works on Windows; elsewhere the
     # backslash becomes part of the name and points at a different path.
     output_dir = Path(f"{self.title}")
     output_dir.mkdir(parents=True, exist_ok=True)
     abs_file_path = os.path.abspath(output_dir)
     for chapter in self.chapter_list:
         chapter.process_raw_title()
         logging.info(f'Downloading {chapter.title}')
         print(f'Downloading {chapter.title}')
         chapter.raw_content = download_content(chapter.link)
         chapter.extract_chapter()
         chapter.clean_chapter()
         chapter.save_chapter(abs_file_path)
         # Random pause between requests.
         time.sleep(random.randrange(5))
Пример #4
0
    def __init__(self, base_output_path, current_time, file_format,
                 content=None):
        """ Set up the MainParser.

        The main page is downloaded from MAIN_URL unless ``content`` is
        supplied; any falsy ``content`` counts as not supplied.

        :param base_output_path: base path to store files.
        :param current_time: current time to use for file names.
        :param file_format: format to store the files.
        :param content: content of the main page.
        """
        self.regex = STATE_REGEX
        if content:
            self.content = content
        else:
            self.content = utils.download_content(MAIN_URL)
        super(MainParser, self).__init__(
            base_output_path, current_time, file_format)
Пример #5
0
 def download_data(self):
     """Download the data of every station yielded by ``get_match()``.

     For each station a URL is built from STATION_URL (station, state,
     station), and the content is downloaded to the file name returned by
     ``_get_station_filename``.
     """
     for station_id in self.get_match():
         logger.debug("Processing station %s", station_id)
         url_fields = (station_id, self.state, station_id)
         source_url = STATION_URL % url_fields
         target_file = self._get_station_filename(station_id)
         utils.download_content(source_url, target_file)