Example #1
 def download(self):
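     """Downloads all pages of the chapter in parallel, using the base URL
     returned by the /at-home/server API endpoint, and packs them into a zip.
     """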
     if not self.available():
         raise exceptions.ScrapingError
     base_url = _make_api_request("/at-home/server/" +
                                  self.json["id"]).json()["baseUrl"]
     pages = [
         base_url + "/data/" + self.json["attributes"]["hash"] + "/" + x
         for x in self.json["attributes"]["data"]
     ]
     if len(pages) <= 0:
         output.error("{}: chapter is hosted externally".format(self.alias))
         raise exceptions.ScrapingError("external")
     files = [None] * len(pages)
     futures = []
     with self.progress_bar(pages) as bar:
         for i, page in enumerate(pages):
             r = self.req_session.get(page, stream=True)
             if not r or r.status_code == 404:
                 output.error("{}: failed request for page {}".format(
                     self.alias, i))
                 raise exceptions.ScrapingError
             fut = download_pool.submit(self.page_download_task,
                                        i,
                                        r,
                                        page_url=page)
             fut.add_done_callback(
                 partial(self.page_download_finish, bar, files))
             futures.append(fut)
         concurrent.futures.wait(futures)
         self.create_zip(files)
Example #2
def _make_api_request(url, session=None, extra_headers={}):
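    """Makes a request to the MangaDex API, waiting and retrying whenever the
    API responds with a 429 rate limit.
    """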
    while True:
        if debug:
            output.warning("Mangadex API: requesting -> " + url)
        try:
            if session:
                r = session.get('https://api.mangadex.org/' + url.strip('/'),
                                headers={
                                    **MangadexV5Series.headers,
                                    **extra_headers
                                })
            else:
                r = requests.get('https://api.mangadex.org/' + url.strip('/'),
                                 headers={
                                     **MangadexV5Series.headers,
                                     **extra_headers
                                 })
        except requests.exceptions.ConnectionError:
            output.error(
                "Mangadex API: request to endpoint failed: {}".format(url))
            raise exceptions.ScrapingError
        if r.status_code == 200:
            return r
        elif r.status_code == 429:
            retry_delay = int(r.headers["retry-after"])
            output.warning(
                "Mangadex API: wait {} seconds...".format(retry_delay))
            time.sleep(retry_delay)
        else:
            output.error("Mangadex API: got bad status code {}".format(
                r.status_code))
            return r
Example #3
def edit(alias, setting, value):
    """Modify settings for a follow.

    The following settings can be edited: alias, directory.
    """
    series = db.Series.alias_lookup(alias, unfollowed=True)
    alias = series.alias
    if value.lower() == 'none' or value.lower() == '-':
        value = None
    if setting == 'alias':
        series.alias = value
    elif setting == 'directory':
        series.directory = value
    else:
        setting = click.style(setting, bold=True)
        output.error('Invalid setting {}'.format(setting))
        exit(1)

    if not value:
        value = click.style('none', dim=True)
    else:
        value = click.style(value, bold=True)
    try:
        db.session.commit()
    except exceptions.DatabaseIntegrityError:
        db.session.rollback()
        output.error('Illegal value {}'.format(value))
        exit(1)
    else:
        output.chapter('Changed {} for {} to {}'.format(setting, alias, value))
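
A hedged usage sketch: assuming this function is exposed as the `edit` subcommand of the `cu2` executable mentioned in Example #5 (the alias and path below are illustrative), `cu2 edit some-alias directory /path/to/downloads` would change the follow's download directory, and passing `none` or `-` as the value (e.g. `cu2 edit some-alias directory none`) clears it, as handled at the top of the function.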
Example #4
 def test_chapter_download_latest(self):
     latest_releases = self.get_five_latest_releases()
     for release in latest_releases:
         try:
             chapter = mangahere.MangahereChapter.from_url(release)
         except exceptions.ScrapingError as e:
             output.error('Scraping error for {} - {}'.format(release, e))
             raise exceptions.ScrapingError
         else:
             chapter.get(use_db=False)
Example #5
def test_database():
    """Runs a database sanity test."""
    sanity_tester = sanity.DatabaseSanity(Base, engine)
    sanity_tester.test()
    if sanity_tester.errors:
        for error in sanity_tester.errors:
            err_target, err_msg = str(error).split(' ', 1)
            message = ' '.join([click.style(err_target, bold=True), err_msg])
            output.warning(message)
        output.error('Database has failed sanity check; '
                     'run `cu2 repair-db` to repair database')
        exit(1)
Example #6
 def alias_lookup(alias, unfollowed=False):
     """Returns a DB object for a series by alias name. Prints an error if
     an invalid alias is specified.
     """
     filters = {'alias': alias}
     if not unfollowed:
         filters['following'] = True
     try:
         s = session.query(Series).filter_by(**filters).one()
     except NoResultFound:
         output.error('Could not find alias "{}"'.format(alias))
         exit(1)
     else:
         return s
Example #7
 def page_download_task(self, page_num, r, page_url=None):
     """Saves the response body of a single request, returning the file
     handle and the passed-through page number to allow for non-sequential
     downloads in parallel.
     """
     ext = BaseChapter.guess_extension(r.headers.get('content-type'))
     f = NamedTemporaryFile(suffix=ext, delete=False)
     retries = 20
     while retries > 0:
         try:
             for chunk in r.iter_content(chunk_size=4096):
                 if chunk:
                     f.write(chunk)
             retries = 0
         # basically ignores this exception that requests throws.  my
         # understanding is that it is raised when you attempt to iter_content()
         # over the same content twice.  don't understand how that situation
         # arises with the current code but it did somehow.
         # https://stackoverflow.com/questions/45379903/
         except requests.exceptions.StreamConsumedError:
             pass
         # when under heavy load, Mangadex will often kill the connection in
         # the middle of an image download.  in the original architecture,
         # the requests are all opened in the scrapers in stream mode, then
         # the actual image payloads are downloaded in the asynchronous
         # callbacks.  when this occurs we have no choice but to re-request
         # the image from the beginning (easier than playing around with range
         # headers).  this means each thread may issue multiple new requests.
         # I have found the performance overhead to be mostly negligible.
         except requests.exceptions.ChunkedEncodingError:
             if not page_url:
                 output.error(
                     "Connection killed on page {} but scraper does not support retries"
                     .format(str(page_num)))
                 raise exceptions.ScrapingError
             output.warning(
                 "Connection killed on page {}, {} retries remaining".
                 format(str(page_num), str(retries)))
             retries = retries - 1
             if retries <= 0:
                 output.error(
                     "Connection killed on page {}, no retries remaining - aborting chapter"
                     .format(str(page_num)))
                 raise exceptions.ScrapingError
             r = self.req_session.get(page_url, stream=True)
             # discard any partially written bytes before re-downloading the page
             f.seek(0)
             f.truncate()
     f.flush()
     f.close()
     r.close()
     return (page_num, f)
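
Examples #1 and #11 register `self.page_download_finish` as the completion callback via `partial(self.page_download_finish, bar, files)`, but the helper itself is not shown above. The following is only a hedged sketch of what such a callback could look like, assuming it receives the bar, the shared files list, and the finished future, stores the `(page_num, file)` result from `page_download_task` at the page's index, and advances the progress bar; the actual cu2 implementation may differ.

def page_download_finish(self, bar, files, future):
    """Hypothetical sketch only: collect the result of a completed page download."""
    # future.result() re-raises any exception thrown inside page_download_task
    page_num, f = future.result()
    # store the temp file handle at its page index so create_zip() keeps page order
    files[page_num] = f
    # advance the progress bar by one completed page
    bar.update(1)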
Example #8
 def test_chapter_download_latest(self):
     latest_releases = self.get_five_latest_releases()
     for release in latest_releases:
         try:
             chapter = mangadex_v5.MangadexV5Chapter.from_url(release)
         except exceptions.ScrapingError as e:
             output.error('Scraping error for {} - {}'.format(release, e))
             raise exceptions.ScrapingError
         else:
             try:
                 chapter.get(use_db=False)
             except exceptions.ScrapingError as e:
                 if e.message == "external":
                     continue
                 raise e
Example #9
 def _get_page(self, url):
     if len(url.rstrip('/').split('/')) == 7:
         manga_id = MangadexV5Series._translate_manga_id(
             url.rstrip('/').split('/')[-3])
     elif len(url.rstrip('/').split('/')) == 6:
         manga_id = MangadexV5Series._translate_manga_id(
             url.rstrip('/').split('/')[-2])
     elif url.rstrip('/').split('/')[-1].isdigit():
         manga_id = MangadexV5Series._translate_manga_id(
             url.rstrip('/').split('/')[-1])
     else:
         manga_id = url.rstrip('/').split('/')[-1]
     r = _make_api_request('/manga/' + manga_id, session=self.req_session)
     # this bit is duplicated in _decode_json because at this point we don't have
     # enough data from the API to call self.alias
     try:
         self.json = json.loads(r.text)
     except json.decoder.JSONDecodeError:
         output.error("Mangadex API: failed to decode JSON response")
         raise exceptions.ScrapingError
Example #10
def _decode_json(string):
    try:
        try:
            return json.loads(string)["data"]
        except json.decoder.JSONDecodeError:
            output.error(self.alias +
                         ": Mangadex API: failed to decode JSON response")
            raise exceptions.ScrapingError
        except KeyError:
            output.error(self.alias +
                         ": Mangadex API: request returned status: " +
                         json.loads(string)["result"])
            raise exceptions.ScrapingError
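    # when this helper runs without a bound self (see the note in _get_page),
    # the self.alias lookups above raise NameError; fall back to reporting the
    # raw API status without the alias prefix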
    except NameError:
        output.error("Mangadex API: request returned status: " +
                     json.loads(string)["result"])
Example #11
    def download(self):
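        """Downloads all pages of the chapter, building the page list either by
        progressively requesting it or by extracting it from the chapter page's
        scripts, then packs the downloaded images into a zip.
        """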
        if not getattr(self, "cpage", None):
            self.cpage = self.req_session.get(self.url.replace("m.", "www."),
                                              headers=chrome_headers)
            if self.cpage.status_code == 404:
                raise exceptions.ScrapingError

        if not getattr(self, "soup", None):
            self.soup = BeautifulSoup(self.cpage.text,
                                      config.get().html_parser)

        pages = []
        (mid, cid) = (None, None)
        # index of script with ids may vary
        # it may also change as ads are added/removed from the site
        for f in range(0, len(self.soup.find_all("script"))):
            try:
                if len(self.soup.find_all("script")[f].contents):
                    mid = re.search(
                        "var comicid = ([0-9]+)",
                        self.soup.find_all("script")
                        [f].contents[0]).groups()[0]
                    cid = re.search(
                        "var chapterid =([0-9]+)",
                        self.soup.find_all("script")
                        [f].contents[0]).groups()[0]
            except AttributeError:
                pass
        if mid and cid:
            old_num_pages = -1
            while old_num_pages != len(pages):
                old_num_pages = len(pages)
                pages = self._request_pages(mid, cid, pages)
        else:
            # some titles (seems to be ones with low page counts like webtoons)
            # don't use progressively-loaded pages.  for these, the image list
            # can be extracted directly off the main page
            for g in range(0, len(self.soup.find_all("script"))):
                try:
                    pages = loads(
                        re.search(
                            "var newImgs = (.+);var newImginfos",
                            beautify(
                                self.soup.find_all("script")[g].text).replace(
                                    "\\", "").replace("'", "\"")).groups()[0])
                except AttributeError:
                    pass
            if not len(pages):
                raise exceptions.ScrapingError
            for i, page in enumerate(pages):
                pages[i] = "https:" + page

        futures = []
        files = [None] * len(pages)
        with self.progress_bar(pages) as bar:
            for i, page in enumerate(pages):
                retries = 0
                while retries < 10:
                    try:
                        r = self.req_session.get(page, stream=True)
                        break
                    except requests.exceptions.ConnectionError:
                        retries += 1
                else:
                    # every retry failed, so r was never assigned; abort the chapter
                    raise exceptions.ScrapingError
                # end of chapter detection in the web ui is done by issuing requests
                # for nonexistent pages which return 404s (who comes up with this)
                if r.status_code != 404:
                    if r.status_code != 200:
                        r.close()
                        output.error("Page download got status code {}".format(
                            str(r.status_code)))
                        raise exceptions.ScrapingError
                    fut = download_pool.submit(self.page_download_task, i, r)
                    fut.add_done_callback(
                        partial(self.page_download_finish, bar, files))
                    futures.append(fut)
                else:
                    try:
                        del files[i]
                    except IndexError:
                        raise exceptions.ScrapingError
            concurrent.futures.wait(futures)
            self.create_zip(files)
Example #12
def config_command(mode, setting, value):
    """Get or set configuration options.

    Mode can be either "get" or "set", depending on whether you want to read or
    write configuration values. If mode is "get", you can specify a setting to
    read that particular setting or omit it to list out all the settings. If
    mode is "set", you must specify the setting to change and assign it a new
    value.
    """
    if mode == 'get':
        if setting:
            parameters = setting.split('.')
            value = config.get()
            for parameter in parameters:
                try:
                    value = getattr(value, parameter)
                except AttributeError:
                    output.error('Setting not found')
                    exit(1)
            output.configuration({setting: value})
        else:
            configuration = config.get().serialize()
            output.configuration(configuration)
    elif mode == 'set':
        if setting is None:
            output.error('You must specify a setting')
            exit(1)
        if value is None:
            output.error('You must specify a value')
            exit(1)
        parameters = setting.split('.')
        preference = config.get()
        for parameter in parameters[0:-1]:
            try:
                preference = getattr(preference, parameter)
            except AttributeError:
                output.error('Setting not found')
                exit(1)
        try:
            current_value = getattr(preference, parameters[-1])
        except AttributeError:
            output.error('Setting not found')
            exit(1)
        if current_value is not None:
            if isinstance(current_value, bool):
                if value.lower() in ('false', '0'):
                    value = False
                else:
                    value = True
            else:
                try:
                    value = type(current_value)(value)
                except ValueError:
                    output.error('Type mismatch: value should be {}'
                                 .format(type(current_value).__name__))
                    exit(1)
        setattr(preference, parameters[-1], value)
        config.get().write()
    else:
        output.error('Mode must be either get or set')
        exit(1)
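
A hedged usage sketch: assuming this function is exposed as the `config` subcommand of the `cu2` executable mentioned in Example #5, `cu2 config get` lists every setting, `cu2 config get html_parser` reads a single one (the `html_parser` name appears in Example #11), and `cu2 config set html_parser lxml` writes a new value; dotted names would address nested settings per the `setting.split('.')` traversal, though the exact setting names and values here are illustrative.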