Пример #1
0
 def _ensure_clean_dl_dir(self):
     """Remove every file that sits directly in the download directory.

     Only entries for which ``os.path.isfile`` is true are unlinked, so
     sub-directories are left in place.  Each path is logged at debug
     level right before it is removed.
     """
     dl_dir = self._config.driver.download_directory
     candidates = (os.path.join(dl_dir, name) for name in os.listdir(dl_dir))
     for candidate in candidates:
         if not os.path.isfile(candidate):
             continue
         logger.debug('Unlink unknown file', dict(path=candidate))
         os.unlink(candidate)
Пример #2
0
    def _wait_and_click_by_xpath(self, xpath: str):
        """Wait until the element at ``xpath`` can be clicked, then click it.

        The element is first awaited to be present, visible and clickable;
        each of those waits is bounded by ``self._timeout``.  The click is
        then attempted up to 10 times, sleeping 4 seconds after every
        attempt that fails because the element is not interactable yet or
        because another element would receive the click.

        Args:
            xpath: XPath expression locating the element to click.

        Raises:
            WebDriverException: when all attempts are used up, or right
                away for any driver error other than the two retryable
                ones described above.  Errors raised by the
                ``WebDriverWait`` calls are propagated unchanged.
        """
        logger.debug(f'Wait and click on xpath "{xpath}"')
        locator = (By.XPATH, xpath)
        for condition in (EC.presence_of_element_located,
                          EC.visibility_of_element_located,
                          EC.element_to_be_clickable):
            WebDriverWait(self._driver,
                          self._timeout).until(condition(locator))

        for _ in range(10):
            try:
                self._driver.find_element(By.XPATH, xpath).click()
                return
            except ElementNotInteractableException:
                logger.error('Element is not interactable as of now')
            except WebDriverException as e:
                # ``msg`` may be None; normalise it so the substring tests
                # below cannot raise TypeError and hide the driver error.
                msg = e.msg or ''
                intercepted = 'is not clickable at point' in msg and (
                    ' Other element would receive the click' in msg  # chrome
                    or ' because another element ' in msg  # firefox
                )
                if not intercepted:
                    # Bare raise keeps the original traceback intact.
                    raise
                logger.error(
                    'Another element would receive the click, try again after a while'
                )

            # Only reached after a retryable failure (success returns above).
            time.sleep(4)

        raise WebDriverException(f"Unable to click on xpath \"{xpath}\"")
Пример #3
0
    def _update(self, url: str, data: dict):
        """Send ``data`` as a JSON body in an HTTP PATCH request to ``url``.

        The request is dispatched through ``self._urlopen``; whatever that
        call returns is discarded.

        Args:
            url: Target of the PATCH request.
            data: Object serialised with ``json.dumps`` and UTF-8 encoded.
        """
        logger.debug('Patching data', dict(url=url, data=data))

        payload = json.dumps(data).encode('UTF-8')
        json_headers = {'Content-Type': 'application/json; charset=UTF-8'}
        patch_request = urllib.request.Request(
            url, data=payload, headers=json_headers, method='PATCH')

        self._urlopen(patch_request)
Пример #4
0
    def download_urls(self):
        """Collect the download links of every listed book and save them.

        Pages 1 to 5 of the "my-ebooks" account listing are loaded in turn.
        Each book title (with the ``[eBook]`` marker removed) is mapped to
        the list of its ``/ebook_download/`` and ``/code_download/`` links.
        A page is retried until it is processed without an exception, and
        the mapping gathered so far is written to ``~/ebooks.yml`` after
        every page.
        """
        books = {}
        for i in range(1, 6):
            done = False
            while not done:
                try:
                    self._driver.get(
                        'https://www.packtpub.com/account/my-ebooks?page={}'.
                        format(i))
                    logger.info('Loaded site, wait 5 seconds', dict(page=i))
                    time.sleep(5)

                    for elem in self._driver.find_elements_by_xpath(
                            "//div[@class = 'product-line unseen']"):
                        title = elem.find_element_by_xpath(
                            ".//div[@class='title']").text.strip().replace(
                                '[eBook]', '').strip()

                        print(title)
                        # Reset the entry so a retried page does not
                        # accumulate duplicate links.
                        books[title] = []

                        for a_elem in elem.find_elements_by_xpath(".//a"):
                            # Read the attribute once.  It is None for an
                            # anchor without href; testing ``in None`` would
                            # raise TypeError and make the page retry forever.
                            href = a_elem.get_attribute('href')
                            if not href:
                                continue

                            if ('/ebook_download/' in href
                                    or '/code_download/' in href):
                                books[title].append(href)

                    done = True
                except KeyboardInterrupt:
                    raise

                except Exception as exc:
                    # Emit a compact traceback line by line at debug level
                    # and the bare message at error level.
                    tb_str = 'An exception occured:\n  Type: %s\n  Message: %s\n\n' % \
                             (type(exc).__name__, exc)
                    for t in traceback.extract_tb(exc.__traceback__):
                        tb_str += '  File %s:%s in %s\n    %s\n' % (
                            t.filename, t.lineno, t.name, t.line)

                    for line in tb_str.splitlines(keepends=False):
                        logger.debug(line)

                    logger.error('{}'.format(exc))
                    # Pause before retrying so a persistently failing page
                    # is not reloaded in a tight loop.
                    time.sleep(1)

            save_to_yaml(books, os.path.expanduser('~/ebooks.yml'))
Пример #5
0
    def run(self) -> int:
        """Create a web driver, run the job and always shut the driver down.

        Returns:
            The value returned by ``self._run(driver)`` on success, or ``1``
            when an ``Exception`` was raised (it is logged and, if a driver
            exists, a ``packt-fail`` screenshot is attempted).

        Non-``Exception`` interruptions such as ``KeyboardInterrupt``
        propagate to the caller after the driver has been quit.
        """
        driver = None
        try:
            driver = self._create_web_driver()
            return self._run(driver)
        except Exception as exc:
            # Emit a compact traceback line by line at debug level and the
            # bare message at error level.
            tb_str = 'An exception occured:\n  Type: %s\n  Message: %s\n\n' % \
                     (type(exc).__name__, exc)
            for t in traceback.extract_tb(exc.__traceback__):
                tb_str += '  File %s:%s in %s\n    %s\n' % (
                    t.filename, t.lineno, t.name, t.line)

            for line in tb_str.splitlines(keepends=False):
                logger.debug(line)

            logger.error('{}'.format(exc))

            if driver is not None:
                try:
                    save_screenshot(driver,
                                    self._config.driver.screenshot_directory,
                                    'packt-fail')
                except Exception as screenshot_exc:
                    # A failing screenshot must not replace the failure
                    # exit code with a secondary exception.
                    logger.error('{}'.format(screenshot_exc))

            # The failure code is returned here rather than from ``finally``:
            # a ``return`` inside ``finally`` would override the success
            # value above and swallow pending interrupts.
            return 1

        finally:
            if driver is not None:
                try:
                    if self._config.wait_before_close:
                        logger.info('Sleeping 60 seconds for checking content')
                        time.sleep(60)
                finally:
                    # Quit even if the grace period is interrupted.
                    driver.quit()
Пример #6
0
    def download(self):
        """Download every book of the account listing that is still missing.

        Pages 1 to 5 of the "my-ebooks" listing are processed in turn.  For
        each book not yet reported by ``self._is_downloaded`` the download
        directory is cleaned, all e-book and code download links are
        clicked, the finished files are moved via ``self._move_dl_file_to``
        and the price text and cover image are stored next to them.  When
        anything raises, the error is logged and the whole page is retried
        after one second.
        """

        for page in range(1, 6):
            done = False
            while not done:
                try:
                    self._driver.get(
                        'https://www.packtpub.com/account/my-ebooks?page={}'.
                        format(page))
                    logger.info('Loaded site, wait 5 seconds', dict(page=page))
                    time.sleep(5)

                    with TimeoutChanger(self, 5):
                        try:
                            self._driver.find_element_by_xpath(
                                "//a[@aria-label='dismiss cookie message']"
                            ).click()
                        except Exception:
                            # Best effort: the banner may be absent.  Only
                            # Exception is caught so KeyboardInterrupt and
                            # SystemExit are not swallowed here.
                            pass

                    for elem in self._driver.find_elements_by_xpath(
                            "//div[@class = 'product-line unseen']"):
                        # Clicked once before and once after the downloads;
                        # presumably expands/collapses the book's row.
                        close_elem = elem.find_element_by_xpath(
                            ".//div[contains(@class, 'toggle-product-line')]")
                        title = elem.find_element_by_xpath(
                            ".//div[@class='title']").text.strip().replace(
                                '[eBook]', '').strip()
                        price_text = elem.find_element_by_xpath(
                            ".//span[@class='uc-price']").get_attribute(
                                'innerHTML')

                        # Takes the second space-separated token of the
                        # <noscript> markup and strips 'src=' and quotes;
                        # assumes that token is the image's src attribute.
                        book_picture = elem.find_element_by_xpath(
                            ".//div[@class='float-left product-thumbnail toggle']//noscript"
                        ).get_attribute('innerHTML').split(' ', 3)[1].replace(
                            'src=', '').replace('"', '')

                        logger.info('Processing book',
                                    dict(title=title, page=page))

                        if self._is_downloaded(title):
                            logger.info('Book is already downloaded')
                            continue

                        self._ensure_clean_dl_dir()

                        close_elem.click()

                        a_elems = elem.find_elements_by_xpath(".//a")

                        for a_elem in a_elems:
                            # Read the attribute once.  It is None for an
                            # anchor without href; testing ``in None`` would
                            # raise TypeError and make the page retry forever.
                            href = a_elem.get_attribute('href')
                            if not href or (
                                    '/ebook_download/' not in href
                                    and '/code_download/' not in href):
                                continue

                            logger.info('Clicking on link', dict(href=href))

                            a_elem.click()
                            time.sleep(1.5)

                        logger.debug(
                            'Wait 2 seconds before checking if there is a download'
                        )
                        time.sleep(2)
                        self._wait_for_dl()
                        self._move_dl_file_to(title)

                        with open(
                                self._full_path_for_title(title, 'price.txt'),
                                'wt') as f:
                            f.write(price_text)

                        with open(
                                self._full_path_for_title(title, 'cover.png'),
                                'wb') as f:
                            with urllib.request.urlopen(
                                    book_picture) as response:
                                f.write(response.read())

                        close_elem.click()
                        logger.debug(
                            'Wait 2 seconds before loading next book to lower server load'
                        )
                        time.sleep(2)

                    done = True
                except KeyboardInterrupt:
                    raise
                except Exception as exc:
                    # Emit a compact traceback line by line at debug level
                    # and the bare message at error level.
                    tb_str = 'An exception occured:\n  Type: %s\n  Message: %s\n\n' % \
                             (type(exc).__name__, exc)
                    for t in traceback.extract_tb(exc.__traceback__):
                        tb_str += '  File %s:%s in %s\n    %s\n' % (
                            t.filename, t.lineno, t.name, t.line)

                    for line in tb_str.splitlines(keepends=False):
                        logger.debug(line)

                    logger.error('{}'.format(exc))
                    # Short pause before the page is retried.
                    time.sleep(1)
Пример #7
0
 def _wait_for_dl(self):
     """Block until ``self.has_unfinished_download()`` reports false.

     The check is repeated every 3 seconds, with one debug log entry per
     waiting round.
     """
     while True:
         if not self.has_unfinished_download():
             return
         logger.debug('Waiting to finish download')
         time.sleep(3)