def getChapterUrls(self):
    feedback.debug("domain: " + str(self.domain))
    doc = self.getDomObject()
    obj_a = doc.cssselect("a")
    urls = []

    for item in obj_a:
        if "href" not in item.attrib:
            continue

        m = re.match(
            """(//%s/manga/%s/[^"]+)""" % (self.domain, self.name),
            item.attrib["href"])
        if not m:
            continue

        # Links are protocol-relative; prefix a scheme explicitly
        target_url = "http:" + m.group(1)
        if target_url not in urls:
            urls.append(target_url)

    if len(urls) < 1:
        raise ComicEngine.ComicError("No URLs returned from %s" % self.url)

    util.naturalSort(urls, ".+/c([0-9.]+)/")
    # I've seen one series which was a load of "chapter 1" in different
    # volumes... how to deal with that?

    feedback.debug(urls)
    return urls
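# The sort above delegates to util.naturalSort. A minimal sketch of what it
# is assumed to do here - order the list in place by the numeric value of
# the pattern's first capture group (the real util implementation may
# differ):
import re

def naturalSort(items, pattern):
    items.sort(key=lambda s: float(re.match(pattern, s).group(1)))

# e.g. naturalSort(urls, ".+/c([0-9.]+)/") puts ".../c2/" before ".../c10/",
# where a plain lexicographic sort would not.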
def load(self):
    """ Loads the page data if not already done.

    On 500-class errors, retries up to 3 times.

    Class implementors should not need to call this method.
    """
    if self.pagedata is not None:
        return

    global useragent
    retries = 3

    while retries > 0:
        feedback.debug(self.url)
        try:
            req = urllib.request.Request(
                self.url, data=None, headers={'User-Agent': useragent})
            self.response = urllib.request.urlopen(req)
            self.pagedata = self.response.read()
            self.response.close()

            if self.pagedata is not None:
                self.decompress()
                feedback.debug("Successfully downloaded %s" % self.url)
                return
            else:
                raise ComicEngine.ComicError("No data obtained!")

        except ConnectionResetError as e:
            if retries > 1:
                print("Peer reset connection - retrying ...")
                retries -= 1
                time.sleep(2)
                continue
            raise DownloadError(
                "Could not load %s\n%s" % (self.url, str(e)), self.url)

        except urllib.error.HTTPError as e:
            if httpCodeClass(e.code) == 500 and retries > 1:
                feedback.warn(
                    "# HTTP %i error - retrying %i times ..."
                    % (e.code, retries))
                retries -= 1
                time.sleep(2)
                continue
            if httpCodeClass(e.code) == 400:
                raise DownloadError(
                    "Request error: %i" % e.code, self.url, e.code)
            raise DownloadError(
                "Could not load %s\n%s" % (self.url, str(e)),
                self.url, e.code)
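# httpCodeClass is used above but not shown in this excerpt. A hypothetical
# sketch, assuming it reduces a status code to its class (404 -> 400,
# 503 -> 500):
def httpCodeClass(code):
    return (code // 100) * 100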
def getParentDir(path):
    """ Get the parent directory of a path.

    Returns an absolute path.
    """
    abspath = os.path.abspath(path)
    i = abspath.rfind(os.path.sep)

    if i == 0:
        return "/"
    elif i < 0:
        raise ValueError("Nothing above root")

    feedback.debug("Containing dir: %s" % abspath[:i])
    return abspath[:i]
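# Illustrative usage with hypothetical paths:
#   getParentDir("/home/user/comics/state.data")  ->  "/home/user/comics"
#   getParentDir("/state.data")                   ->  "/"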
def main():
    global step_delay
    global ch_start
    global ch_end
    global dlstate
    global cbzdl_version

    print("cbzdl v.%s" % cbzdl_version)

    args = parseArguments()
    feedback.debug_mode = args.verbose

    checkSpecialCases(args.url)

    dlstate = state.DownloaderState(args.url)
    checkState(args)

    ch_start = args.start
    ch_end = args.end
    initializeState()

    failed = []  # defined up-front so an aborted download cannot leave it unbound
    try:
        cengine = dlstate.cengine
        comic_url = dlstate.get("url")

        if args.delay >= 0:
            step_delay = args.delay
        elif hasattr(cengine, 'recommended_delay'):
            step_delay = cengine.recommended_delay
        else:
            step_delay = 1
        feedback.debug("Delay chosen: %i" % step_delay)

        failed = downloadComic(cengine, comic_url, args)

    except ComicEngine.ComicError as e:
        feedback.fail(str(e))

    if len(failed) > 0:
        feedback.error("Failed:")
        for chapter in failed:
            feedback.error("# %s" % chapter)
        dlstate.set("failed_chapters", failed)
def getPageUrls(self):
    base_chapter_url = util.regexGroup(
        "https://readms.net(/r/.+)/[0-9.]+$", self.url)
    feedback.debug("Base URL: " + base_chapter_url)

    dom = self.getDomObject()
    pageurls = []

    links = dom.cssselect("ul.dropdown-menu li a")
    for elem_a in links:
        href = elem_a.attrib['href']
        if re.match(base_chapter_url, href):
            pageurls.append("https://readms.net" + href)

    return pageurls
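# util.regexGroup appears throughout these engine modules. A hedged sketch
# of the assumed behaviour - return the first capture group of a match, or
# None when the pattern does not match (the real helper may differ):
import re

def regexGroup(pattern, string):
    m = re.match(pattern, string)
    return m.group(1) if m else None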
def __init__(self, stated_source):
    state_file_name = "state.data"
    feedback.debug("Source: %s" % stated_source)
    self.__state_data = None

    if os.path.isdir(stated_source):
        # Resuming from an existing download directory
        self.__state_file = os.path.sep.join(
            [stated_source, state_file_name])
        self.load()

        if not self.has("url"):
            self.set("url", getOldSourceUrl(stated_source))

        self.cengine = ComicEngine.determineFrom(self.get("url"))
    else:
        # Starting fresh from a comic URL
        self.cengine = ComicEngine.determineFrom(stated_source)
        feedback.debug("Comic engine: %s" % self.cengine.__name__)

        comic_dir = self.cengine.Comic(stated_source).getComicLowerName()
        feedback.debug('Comic dir: %s' % comic_dir)

        self.__state_file = os.path.sep.join([comic_dir, state_file_name])
        self.set("url", stated_source)
def getPageUrls(self):
    doc = self.getDomObject()
    image_nodes = doc.cssselect("img.fullsizable")
    page_urls = []

    # All pages are on one page - encode each image URL and stuff it into
    # a bogus query string
    i = 1
    for img in image_nodes:
        imgurl = img.attrib['src']
        feedback.debug(imgurl)

        pagenum = i
        i += 1

        if re.match(".+/nextchap.png", imgurl):
            continue

        page_urls.append(
            "%s?u=%s&n=%s" % (
                self.url,
                base64.urlsafe_b64encode(
                    imgurl.encode("utf-8")).decode("utf-8"),
                pagenum))

    return page_urls
def getPageUrls(self):
    document = self.getDomObject()
    child_nodes = document.get_element_by_id("vungdoc").getchildren()
    page_urls = []

    # All pages are on one page - encode each image URL and stuff it into
    # a bogus query string
    i = 1  # counter... hopefully pages always come in order...!
    for node in child_nodes:
        if node.tag != 'img':
            continue
        elif 'src' not in node.attrib:
            continue

        imgurl = node.attrib['src']
        feedback.debug(imgurl)

        pagenum = i  # util.regexGroup(".+?([0-9]+)\\.[a-z]+$", imgurl)
        i += 1

        # A "next chapter" image means this is not a real page listing
        if re.match(".+/nextchap.png", imgurl):
            return None

        page_urls.append(
            "%s?u=%s&n=%s" % (
                self.url,
                base64.urlsafe_b64encode(
                    imgurl.encode("utf-8")).decode("utf-8"),
                pagenum))

    return page_urls
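# The "?u=...&n=..." trick used by both getPageUrls implementations above
# packs the real image URL into the chapter URL's query string. A hedged
# sketch of the assumed decode counterpart (the actual page downloader is
# not shown in this excerpt):
import base64
import urllib.parse

def decodeBogusPageUrl(page_url):
    params = urllib.parse.parse_qs(urllib.parse.urlparse(page_url).query)
    imgurl = base64.urlsafe_b64decode(params["u"][0]).decode("utf-8")
    pagenum = int(params["n"][0])
    return imgurl, pagenum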
import filesys
import feedback

modules_dir = filesys.getParentDir(__file__)
feedback.debug("Modules from %s" % modules_dir)

# Match files ending in a literal ".py" (note the escaped dot)
module_files = filesys.listDir(modules_dir, r"[a-zA-Z0-9]+\.py$")
feedback.debug("Got files %s" % module_files)

engine_files = []
module_names = []

for file_name in module_files:
    if file_name in ("example_module.py", "moduleslist.py"):
        continue

    module_name = file_name[:-3]
    engine_files.append("modules.%s" % module_name)
    module_names.append(module_name)

# To be done:
# https://www.manga.club
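# A minimal sketch of how the collected names could be turned into live
# engine modules (an assumption - the real loader may differ):
import importlib

engines = [importlib.import_module(name) for name in engine_files]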
def downloadChapter(cengine, chapter_url, comic_dir):
    """ Kicks off the page downloads for a chapter.

    Checks whether the chapter number is within the specified bounds.

    On completion, if there were no page download errors, attempts CBZ
    creation.

    Returns 0 if the chapter was skipped, 'max' if it is beyond the
    requested end chapter, or a list of failed page URLs otherwise.
    """
    feedback.debug("Start on %s ..." % chapter_url)
    global step_delay
    global ch_start
    global ch_end

    chapter = cengine.Chapter(chapter_url)
    chapter_num = float(chapter.getChapterNumber())

    if chapter_num < ch_start:
        return 0
    elif chapter_num > ch_end:
        return 'max'

    # If no start was specified, use the last success as the baseline
    if ch_start == -1 and chapter_num <= dlstate.get("last"):
        return 0

    feedback.info(" Get %s" % chapter_url)
    page_urls = chapter.getPageUrls()
    if page_urls is None:
        return ['%s not a valid chapter' % chapter_num]

    chapter_dir = os.path.sep.join(
        [comic_dir, chapter.getChapterLowerName()])
    feedback.info(" %i pages" % len(page_urls))

    failed_urls = []
    for url in page_urls:
        try:
            downloadPage(cengine, url, chapter_dir)
        except ComicEngine.ComicError as e:
            feedback.warn("Oops : %s" % str(e))
            failed_urls.append(url)
        except urllib.error.URLError as e:
            feedback.warn("Could not download %s" % url)
            failed_urls.append(url)
        except web.DownloadError as e:
            feedback.warn("%i : %s" % (e.code, str(e)))
            failed_urls.append(url)
        time.sleep(step_delay)

    if len(failed_urls) == 0:
        feedback.debug(" Compiling to CBZ ...")
        try:
            cbz.CBZArchive(chapter_dir).compile(remove_dir=True)
            dlstate.set("last", chapter_num)  # Unequivocal success!
        except Exception as e:
            feedback.warn(str(e))
            # Record the CBZ failure so the chapter is reported as failed
            failed_urls.append(chapter_url)

    return failed_urls
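# A hedged sketch of how a caller such as downloadComic might drive this per
# chapter (names mirror the call in main(); the actual loop is not shown in
# this excerpt):
def downloadComic(cengine, comic_url, args):
    comic = cengine.Comic(comic_url)
    comic_dir = comic.getComicLowerName()
    failed = []
    for chapter_url in comic.getChapterUrls():
        result = downloadChapter(cengine, chapter_url, comic_dir)
        if result == 'max':
            break  # past the requested end chapter
        elif isinstance(result, list):
            failed.extend(result)
    return failed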
def getChapterNumber(self):
    cnum = util.regexGroup(
        "https://readms.net/r/[^/]+/([0-9.]+)", self.url)
    feedback.debug("Return chapter number: %s" % cnum)
    return cnum
def getComicLowerName(self):
    res = util.regexGroup("https://readms.net/manga/([^/]+)", self.url)
    feedback.debug("Comic name: " + res)
    return res
def __init__(self, url):
    # Normalize a reader URL ("/r/<name>/...") to the series overview URL
    ComicSite.__init__(self, re.sub("/r/([^/]+).*", "/manga/\\1", url))
    feedback.debug(self.url)
    self.name = self.getComicLowerName()
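# Illustrative normalization with a hypothetical series name:
#   "https://readms.net/r/one_piece/900" -> "https://readms.net/manga/one_piece"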