class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase):

	dbName = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.NHentai.Cl"
	pluginName = "NHentai Content Retreiver"
	tableKey = "nh"
	urlBase = "http://nhentai.net/"
	wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web")
	tableName = "HentaiItems"
	retreivalThreads = 6

	shouldCanonize = False

	def getFileName(self, soup):
		title = soup.find("h1", class_="otitle")
		if not title:
			raise ValueError("Could not find title. Wat?")
		return title.get_text()

	def imageUrls(self, soup):
		thumbnailDiv = soup.find("div", id="thumbnail-container")
		ret = []

		for link in thumbnailDiv.find_all("a", class_='gallerythumb'):
			referrer = urllib.parse.urljoin(self.urlBase, link['href'])

			# bs4 attribute check: has_attr(), not hasattr() (which always succeeds on a Tag).
			if link.img.has_attr("data-src"):
				thumbUrl = link.img['data-src']
			else:
				thumbUrl = link.img['src']

			if "t." not in thumbUrl[-6:]:
				raise ValueError("Url is not a thumb? = '%s'" % thumbUrl)
			else:
				imgUrl = thumbUrl[:-6] + thumbUrl[-6:].replace("t.", '.')
				imgUrl = urllib.parse.urljoin(self.urlBase, imgUrl)
				imgUrl = imgUrl.replace("//t.", "//i.")
				ret.append((imgUrl, referrer))

		return ret

	def getDownloadInfo(self, linkDict, retag=False):
		sourcePage = linkDict["sourceUrl"]

		self.log.info("Retreiving item: %s", sourcePage)

		try:
			soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': self.urlBase})
		except Exception:
			self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"])
			raise IOError("Invalid webpage")

		linkDict['dirPath'] = os.path.join(settings.nhSettings["dlDir"], linkDict['seriesName'])

		if not os.path.exists(linkDict["dirPath"]):
			os.makedirs(linkDict["dirPath"])
		else:
			self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

		self.log.info("Folderpath: %s", linkDict["dirPath"])
		#self.log.info(os.path.join())

		imageUrls = self.imageUrls(soup)
		# print("Image URLS: ", imageUrls)

		linkDict["dlLinks"] = imageUrls

		self.log.debug("Linkdict = ")
		for key, value in list(linkDict.items()):
			self.log.debug(" %s - %s", key, value)

		return linkDict

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0)
		return fileN, content

	def fetchImages(self, linkDict):
		images = []
		for imgUrl, referrerUrl in linkDict["dlLinks"]:
			images.append(self.getImage(imgUrl, referrerUrl))
		return images

	def doDownload(self, linkDict, retag=False):
		images = self.fetchImages(linkDict)

		# self.log.info(len(content))

		if images:
			fileN = linkDict['originName']+".zip"
			fileN = nt.makeFilenameSafe(fileN)

			# self.log.info("geturl with processing", fileN)
			wholePath = os.path.join(linkDict["dirPath"], fileN)
			self.log.info("Complete filepath: %s", wholePath)

			#Write all downloaded files to the archive.
			chop = len(fileN)-4
			wholePath = "ERROR"
			while 1:
				try:
					fileN = fileN[:chop]+fileN[-4:]
					# self.log.info("geturl with processing", fileN)
					wholePath = os.path.join(linkDict["dirPath"], fileN)
					self.log.info("Complete filepath: %s", wholePath)

					#Write all downloaded files to the archive.
					arch = zipfile.ZipFile(wholePath, "w")
					for imageName, imageContent in images:
						arch.writestr(imageName, imageContent)
					arch.close()

					self.log.info("Successfully Saved to path: %s", wholePath)
					break
				except IOError:
					chop = chop - 1
					self.log.warn("Truncating file length to %s characters.", chop)

			if not linkDict["tags"]:
				linkDict["tags"] = ""

			self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

			# Deduper uses the path info for relinking, so we have to dedup the item after updating the downloadPath and fileN
			dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, pron=True)
			self.log.info("Done")

			if dedupState:
				self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

			self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
			self.conn.commit()
			return wholePath

		else:
			self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
			self.conn.commit()
			return False

	def getLink(self, link):
		try:
			self.updateDbEntry(link["sourceUrl"], dlState=1)
			linkInfo = self.getDownloadInfo(link)
			self.doDownload(linkInfo)
		except IOError:
			self.log.error("Failure retreiving content for link %s", link)
			self.log.error("Traceback: %s", traceback.format_exc())
			self.updateDbEntry(link["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: MISSING")
		except urllib.error.URLError:
			self.log.error("Failure retreiving content for link %s", link)
			self.log.error("Traceback: %s", traceback.format_exc())
			self.updateDbEntry(link["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")

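# --- Standalone sketch (not part of the plugin): the thumbnail-to-full-image rewrite that
# imageUrls() above performs, assuming nhentai-style thumb URLs where the filename ends in
# "t.<ext>" and thumbnails are served from a "t." host. The sample URL is fabricated for
# illustration only.
import urllib.parse

def thumb_to_full(thumb_url, url_base="http://nhentai.net/"):
	# "2t.jpg" -> "2.jpg": strip the trailing "t." marker from the filename.
	full = thumb_url[:-6] + thumb_url[-6:].replace("t.", ".")
	full = urllib.parse.urljoin(url_base, full)
	# Thumbnails live on the "t." host, full-size images on the "i." host.
	return full.replace("//t.", "//i.")

assert thumb_to_full("//t.example.invalid/galleries/1/2t.jpg") == "http://i.example.invalid/galleries/1/2.jpg"
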
class Loader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Yo.Fl"
	pluginName = "YoManga Scans Link Retreiver"
	tableKey = "ym"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://yomanga.co/"
	seriesBase = "http://yomanga.co/reader/directory/%s/"

	def closeDB(self):
		self.log.info("Closing DB...", )
		self.conn.close()
		self.log.info("done")

	def doDownload(self, seriesName, dlurl, chapter_name):
		row = self.getRowsByValue(sourceUrl=dlurl, limitByKey=False)
		if row and row[0]['dlState'] != 0:
			return
		if not row:
			self.insertIntoDb(retreivalTime=time.time(), sourceUrl=dlurl, originName=seriesName, dlState=1, seriesName=seriesName)

		fctnt, fname = self.wg.getFileAndName(dlurl)

		fileN = '{series} - {chap} [YoManga].zip'.format(series=seriesName, chap=chapter_name)
		fileN = nt.makeFilenameSafe(fileN)

		dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)
		wholePath = os.path.join(dlPath, fileN)

		self.log.info("Source name: %s", fname)
		self.log.info("Generated name: %s", fileN)

		if newDir:
			self.updateDbEntry(dlurl, flags="haddir")
			self.conn.commit()

		with open(wholePath, "wb") as fp:
			fp.write(fctnt)

		self.log.info("Successfully Saved to path: %s", wholePath)

		dedupState = processDownload.processDownload(seriesName, wholePath, deleteDups=True)
		if dedupState:
			self.addTags(sourceUrl=dlurl, tags=dedupState)

		self.updateDbEntry(dlurl, dlState=2, downloadPath=dlPath, fileName=fileN, originName=fileN)
		self.conn.commit()

	def getContentForItem(self, url):
		new = 0
		total = 0

		soup = self.wg.getSoup(url)
		stitle = soup.find("h1", class_='title').get_text().strip()

		chapters = soup.find_all("div", class_='element')
		for chapter in chapters:
			dlurl = chapter.find("div", class_='fleft')
			chp_name = chapter.find("div", class_="title").get_text().strip()
			wasnew = self.doDownload(stitle, dlurl.a['href'], chp_name)
			if wasnew:
				new += 1
			total += 1

		return new, total

	def getSeriesUrls(self):
		ret = set()

		self.wg.stepThroughCloudFlare(self.seriesBase % 1, titleContains='Series List')

		page = 1
		while True:
			soup = self.wg.getSoup(self.seriesBase % page)
			new = False
			rows = soup.find_all('div', class_='group')
			for row in rows:
				if row.a['href'] not in ret:
					new = True
					ret.add(row.a['href'])
			page += 1
			if not new:
				break

		self.log.info("Found %s series", len(ret))

		return ret

	def getAllItems(self):
		self.log.info("Loading YoManga Items")
		seriesPages = self.getSeriesUrls()

		tot_new, total_overall = 0, 0
		for item in seriesPages:
			new, total = self.getContentForItem(item)
			tot_new += new
			total_overall += total

		self.log.info("Found %s total items, %s of which were new", total_overall, tot_new)
		return []

	def go(self):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getAllItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase):

	loggerPath = "Main.Manga.Dy.Cl"
	pluginName = "Dynasty Scans Content Retreiver"
	tableKey = "dy"
	dbName = settings.DATABASE_DB_NAME
	tableName = "MangaItems"

	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")

	retreivalThreads = 3

	urlBase = "http://dynasty-scans.com/"

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0)
		return fileN, content

	def getImageUrls(self, inMarkup, baseUrl):
		pages = {}

		jsonRe = re.compile(r'var pages = (\[.*?\]);')

		pg = jsonRe.findall(inMarkup)
		if len(pg) != 1:
			self.log.error("Erroring page '%s'", baseUrl)
			raise ValueError("Page has more then one json section?")

		images = json.loads(pg.pop())

		for item in images:
			imgurl = urllib.parse.urljoin(baseUrl, item['image'])
			pages[imgurl] = baseUrl

		self.log.info("Found %s pages", len(pages))

		return pages

	def getSeries(self, markup):
		soup = bs4.BeautifulSoup(markup)
		title = soup.find("h3", id='chapter-title')
		if title.b.find('a'):
			title = title.b.a.get_text()
		else:
			title = title.b.get_text()
		title = nt.getCanonicalMangaUpdatesName(title)
		print("Title '%s'" % title)
		return title

	def getLink(self, link):
		sourceUrl = link["sourceUrl"]
		chapterVol = link["originName"]

		inMarkup = self.wg.getpage(sourceUrl)

		seriesName = self.getSeries(inMarkup)

		try:
			self.log.info("Should retreive url - %s", sourceUrl)
			self.updateDbEntry(sourceUrl, dlState=1)

			imageUrls = self.getImageUrls(inMarkup, sourceUrl)
			if not imageUrls:
				self.log.critical("Failure on retreiving content at %s", sourceUrl)
				self.log.critical("Page not found - 404")
				self.updateDbEntry(sourceUrl, dlState=-1)
				return

			self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, chapterVol, len(imageUrls))
			dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

			if link["flags"] == None:
				link["flags"] = ""

			if newDir:
				self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
				self.conn.commit()

			chapterName = nt.makeFilenameSafe(chapterVol)

			fqFName = os.path.join(dlPath, chapterName + " [DynastyScans].zip")

			loop = 1
			prefix, ext = os.path.splitext(fqFName)
			while os.path.exists(fqFName):
				fqFName = "%s (%d)%s" % (prefix, loop, ext)
				loop += 1
			self.log.info("Saving to archive = %s", fqFName)

			images = []
			for imgUrl, referrerUrl in imageUrls.items():
				imageName, imageContent = self.getImage(imgUrl, referrerUrl)
				images.append([imageName, imageContent])

				if not runStatus.run:
					self.log.info("Breaking due to exit flag being set")
					self.updateDbEntry(sourceUrl, dlState=0)
					return

			self.log.info("Creating archive with %s images", len(images))

			if not images:
				self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404")
				return

			#Write all downloaded files to the archive.
			arch = zipfile.ZipFile(fqFName, "w")
			for imageName, imageContent in images:
				arch.writestr(imageName, imageContent)
			arch.close()

			dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True)

			self.log.info("Done")

			filePath, fileName = os.path.split(fqFName)
			self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState)
			return

		except Exception:
			self.log.critical("Failure on retreiving content at %s", sourceUrl)
			self.log.critical("Traceback = %s", traceback.format_exc())
			self.updateDbEntry(sourceUrl, dlState=-1)

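# --- Standalone sketch (not part of the plugin): the "var pages = [...]" extraction used by
# getImageUrls() above, run against a fabricated fragment of reader markup. The HTML below is
# an illustrative stand-in, not captured from Dynasty Scans.
import json
import re
import urllib.parse

sample_markup = '<script>var pages = [{"image":"/system/releases/0001.png","name":"01"}];</script>'
match = re.search(r'var pages = (\[.*?\]);', sample_markup)
pages = json.loads(match.group(1))
image_urls = [urllib.parse.urljoin("http://dynasty-scans.com/chapters/example", p["image"]) for p in pages]
print(image_urls)   # ['http://dynasty-scans.com/system/releases/0001.png']
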
class ContentLoader(ScrapePlugins.RetreivalBase.RetreivalBase):

	dbName = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.Hitomi.Cl"
	pluginName = "Hitomi Content Retreiver"
	tableKey = "hit"
	urlBase = "https://hitomi.la/"
	wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web")
	tableName = "HentaiItems"
	retreivalThreads = 3

	def getFileName(self, soup):
		title = soup.find("h1")
		if not title:
			raise ValueError("Could not find title. Wat?")
		return title.get_text().strip()

	def imageUrls(self, soup):
		thumbnailDiv = soup.find("div", id="thumbnail-container")
		ret = []

		for link in thumbnailDiv.find_all("a", class_='gallerythumb'):
			referrer = urllib.parse.urljoin(self.urlBase, link['href'])

			# bs4 attribute check: has_attr(), not hasattr() (which always succeeds on a Tag).
			if link.img.has_attr("data-src"):
				thumbUrl = link.img['data-src']
			else:
				thumbUrl = link.img['src']

			if "t." not in thumbUrl[-6:]:
				raise ValueError("Url is not a thumb? = '%s'" % thumbUrl)
			else:
				imgUrl = thumbUrl[:-6] + thumbUrl[-6:].replace("t.", '.')
				imgUrl = urllib.parse.urljoin(self.urlBase, imgUrl)
				imgUrl = imgUrl.replace("//t.", "//i.")
				ret.append((imgUrl, referrer))

		return ret

	def format_tag(self, tag_raw):
		if "♀" in tag_raw:
			tag_raw = tag_raw.replace("♀", "")
			tag_raw = "female " + tag_raw
		if "♂" in tag_raw:
			tag_raw = tag_raw.replace("♂", "")
			tag_raw = "male " + tag_raw

		tag = tag_raw.strip()
		while "  " in tag:
			tag = tag.replace("  ", " ")
		tag = tag.strip().replace(" ", "-")
		return tag.lower()

	def getCategoryTags(self, soup):
		tablediv = soup.find("div", class_='gallery-info')
		tagTable = soup.find("table")

		tags = []

		formatters = {
			"series"     : "parody",
			"characters" : "characters",
			"tags"       : "",
		}

		ignoreTags = [
			"type",
		]

		print("soup.h2", )

		category = "Unknown?"

		for tr in tagTable.find_all("tr"):
			if len(tr.find_all("td")) != 2:
				continue
			what, values = tr.find_all("td")
			what = what.get_text().strip().lower()

			# "type" determines the category, so handle it before the ignore list, and compare
			# lowercase (the original compared against "Type", which could never match after
			# the lower() above).
			if what == "type":
				category = values.get_text().strip()
				if category == "Manga One-shot":
					category = "=0= One-Shot"
			elif what in ignoreTags:
				continue
			elif what == "language":
				lang_tag = values.get_text().strip()
				lang_tag = self.format_tag("language " + lang_tag)
				tags.append(lang_tag)
			elif what in formatters:
				for li in values.find_all("li"):
					tag = " ".join([formatters[what], li.get_text()])
					tag = self.format_tag(tag)
					tags.append(tag)

		artist_str = "unknown artist"
		for artist in soup.h2("li"):
			artist_str = artist.get_text()
			atag = "artist " + artist_str
			atag = self.format_tag(atag)
			tags.append(atag)

		print(category, tags)
		return category, tags, artist_str

	def getDownloadInfo(self, linkDict):
		sourcePage = linkDict["sourceUrl"]

		self.log.info("Retreiving item: %s", sourcePage)

		# self.updateDbEntry(linkDict["sourceUrl"], dlState=1)
		soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': 'https://hitomi.la/'})

		if not soup:
			self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"])
			raise IOError("Invalid webpage")

		gal_section = soup.find("div", class_='gallery')

		category, tags, artist = self.getCategoryTags(gal_section)
		tags = ' '.join(tags)
		linkDict['artist'] = artist
		linkDict['title'] = self.getFileName(gal_section)

		linkDict['dirPath'] = os.path.join(settings.hitSettings["dlDir"], nt.makeFilenameSafe(category))

		if not os.path.exists(linkDict["dirPath"]):
			os.makedirs(linkDict["dirPath"])
		else:
			self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

		self.log.info("Folderpath: %s", linkDict["dirPath"])

		self.log.debug("Linkdict = ")
		for key, value in list(linkDict.items()):
			self.log.debug(" %s - %s", key, value)

		if tags:
			self.log.info("Adding tag info %s", tags)
			self.addTags(sourceUrl=linkDict["sourceUrl"], tags=tags)

		read_url = soup.find("a", text=re.compile("Read Online", re.IGNORECASE))
		spage = urllib.parse.urljoin(self.urlBase, read_url['href'])

		linkDict["spage"] = spage

		self.updateDbEntry(linkDict["sourceUrl"], seriesName=category, lastUpdate=time.time())

		return linkDict

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0)
		return fileN, content

	def getImages(self, linkDict):
		print("getImage", linkDict)

		soup = self.wg.getSoup(linkDict['spage'], addlHeaders={'Referer': linkDict["sourceUrl"]})

		raw_imgs = soup.find_all('div', class_="img-url")

		imageurls = []
		for div in raw_imgs:
			imgurl = div.get_text().strip()
			imgurl = re.sub(r"\/\/..?\.hitomi\.la\/", r'https://la.hitomi.la/', imgurl, flags=re.IGNORECASE)
			imageurls.append((imgurl, linkDict['spage']))

		if not imageurls:
			return []

		images = []
		for imageurl, referrer in imageurls:
			images.append(self.getImage(imageurl, referrer))

		return images

	def getLink(self, linkDict):
		try:
			linkDict = self.getDownloadInfo(linkDict)
			images = self.getImages(linkDict)
			title = linkDict['title']
			artist = linkDict['artist']
		except webFunctions.ContentError:
			self.updateDbEntry(linkDict["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: FAILED")
			return False

		if images and title:
			fileN = title+" "+artist+".zip"
			fileN = nt.makeFilenameSafe(fileN)

			# self.log.info("geturl with processing", fileN)
			wholePath = os.path.join(linkDict["dirPath"], fileN)
			wholePath = self.insertCountIfFilenameExists(wholePath)
			self.log.info("Complete filepath: %s", wholePath)

			#Write all downloaded files to the archive.
			try:
				arch = zipfile.ZipFile(wholePath, "w")
			except OSError:
				title = title.encode('ascii', 'ignore').decode('ascii')
				fileN = title+".zip"
				fileN = nt.makeFilenameSafe(fileN)
				wholePath = os.path.join(linkDict["dirPath"], fileN)
				arch = zipfile.ZipFile(wholePath, "w")

			for imageName, imageContent in images:
				arch.writestr(imageName, imageContent)
			arch.close()

			self.log.info("Successfully Saved to path: %s", wholePath)

			self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

			# Deduper uses the path info for relinking, so we have to dedup the item after updating the downloadPath and fileN
			dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True, rowId=linkDict['dbId'])

			self.log.info("Done")

			if dedupState:
				self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

			self.updateDbEntry(linkDict["sourceUrl"], dlState=2)

			return wholePath
		else:
			self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
			return False

	def setup(self):
		self.wg.stepThroughCloudFlare(self.urlBase, titleContains="Hitomi")

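# --- Standalone sketch (not part of the plugin): the tag normalisation performed by
# format_tag() above. Gender glyphs become "female "/"male " prefixes, runs of whitespace
# collapse, and spaces become hyphens. The sample inputs are made up for illustration.
def format_tag(tag_raw):
	if "♀" in tag_raw:
		tag_raw = "female " + tag_raw.replace("♀", "")
	if "♂" in tag_raw:
		tag_raw = "male " + tag_raw.replace("♂", "")
	tag = tag_raw.strip()
	while "  " in tag:
		tag = tag.replace("  ", " ")
	return tag.strip().replace(" ", "-").lower()

assert format_tag("glasses ♀") == "female-glasses"
assert format_tag("parody   some  series") == "parody-some-series"
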
class FeedLoader(ScrapePlugins.LoaderBase.LoaderBase):

	loggerPath = "Main.Manga.Mc.Fl"
	pluginName = "MangaCow Link Retreiver"
	tableKey = "mc"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://mngcow.co/"
	feedUrl = "http://mngcow.co/manga-list/"

	def extractItemInfo(self, soup):
		ret = {}

		container = soup.find("div", class_="mng_ifo")
		infoDiv = container.find("div", class_="det")
		titleDiv = infoDiv.find("h4")
		ret["title"] = titleDiv.get_text()

		items = infoDiv.find_all("p")

		ret["note"] = " ".join(items[0].strings)
		# Messy hack to replace <br> tags with a ' ', rather then just removing them.
		# And clean out the non-breaking spaces
		ret["note"] = ret["note"].replace(chr(0xa0), ' ')

		for item in items:
			text = item.get_text().strip()
			if not ":" in text:
				continue
			what, text = text.split(":", 1)
			if what == "Category":
				tags = [tag_link.get_text() for tag_link in item.find_all("a")]
				tags = [tag.lower().strip().replace(" ", "-") for tag in tags]
				ret["tags"] = " ".join(tags)

		return ret

	def getItemPages(self, url):
		# print("Should get item for ", url)
		page = self.wg.getpage(url)

		soup = bs4.BeautifulSoup(page, "lxml")
		baseInfo = self.extractItemInfo(soup)

		ret = []
		for link in soup.find_all("a", class_="lst"):
			item = {}

			url = link["href"]
			chapTitle = link.find("b", class_="val")
			chapTitle = chapTitle.get_text()

			chapDate = link.find("b", class_="dte")
			date = dateutil.parser.parse(chapDate.get_text(), fuzzy=True)

			item["originName"] = "{series} - {file}".format(series=baseInfo["title"], file=chapTitle)
			item["sourceUrl"] = url
			item["seriesName"] = baseInfo["title"]
			item["tags"] = baseInfo["tags"]
			item["note"] = baseInfo["note"]
			item["retreivalTime"] = calendar.timegm(date.timetuple())

			ret.append(item)

		return ret

	def getSeriesUrls(self):
		ret = []

		print("wat?")
		page = self.wg.getpage(self.feedUrl)
		soup = bs4.BeautifulSoup(page, "lxml")
		divs = soup.find_all("div", class_="img_wrp")
		for div in divs:
			url = div.a["href"]
			ret.append(url)

		return ret

	def getFeed(self):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading Mc Items")

		ret = []

		seriesPages = self.getSeriesUrls()

		for item in seriesPages:
			itemList = self.getItemPages(item)
			for item in itemList:
				ret.append(item)
			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s total items", len(ret))
		return ret

class DbLoader(ScrapePlugins.LoaderBase.LoaderBase):

	loggerPath = "Main.Manga.Bt.Fl"
	pluginName = "Batoto Link Retreiver"
	tableKey = "bt"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://www.bato.to/"
	feedUrl = "http://www.bato.to/?p=%d"

	def parseDateStr(self, inStr):
		# For strings like "n Days Ago", split out the "n", convert it to an int, and take the
		# time-delta so we know what actual date it refers to.

		# convert instances of "a minute ago" to "1 minute ago", for mins, hours, etc...
		inStr = inStr.strip()
		if inStr.lower().startswith("an"):
			inStr = "1" + inStr[2:]
		if inStr.lower().startswith("a"):
			inStr = "1" + inStr[1:]

		if "just now" in inStr:
			updateDate = datetime.datetime.now()
		elif "months ago" in inStr or "month ago" in inStr:
			monthsAgo = inStr.split()[0]
			monthsAgo = int(monthsAgo)
			# Approximate a month as 30 days (the original delta of 7 days per month looks like
			# a copy-paste from the "weeks" branch below).
			updateDate = datetime.datetime.now() - datetime.timedelta(monthsAgo * 30)
		elif "weeks ago" in inStr or "week ago" in inStr:
			weeksAgo = inStr.split()[0]
			weeksAgo = int(weeksAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(weeksAgo * 7)
		elif "days ago" in inStr or "day ago" in inStr:
			daysAgo = inStr.split()[0]
			daysAgo = int(daysAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(daysAgo)
		elif "hours ago" in inStr or "hour ago" in inStr:
			hoursAgo = inStr.split()[0]
			hoursAgo = int(hoursAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(0, hoursAgo * 60 * 60)
		elif "minutes ago" in inStr or "minute ago" in inStr:
			minutesAgo = inStr.split()[0]
			minutesAgo = int(minutesAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(0, minutesAgo * 60)
		elif "seconds ago" in inStr or "second ago" in inStr:
			secondsAgo = inStr.split()[0]
			secondsAgo = int(secondsAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(0, secondsAgo)
		else:
			# self.log.warning("Date parsing failed. Using fall-back parser")
			updateDate = dateutil.parser.parse(inStr, fuzzy=True)
			# self.log.warning("Failing string = '%s'", inStr)
			# self.log.warning("As parsed = '%s'", updateDate)

		return updateDate

	def getItemFromSeriesPageContainer(self, row):
		cells = row.find_all("td")
		if len(cells) != 5:
			# self.log.error("Invalid number of TD items in row!")
			return None

		chapter, lang, dummy_scanlator, dummy_uploader, uploadDate = cells

		# Skip uploads in other languages
		if DOWNLOAD_ONLY_LANGUAGE and not DOWNLOAD_ONLY_LANGUAGE in str(lang):
			return None

		dateStr = uploadDate.get_text().strip()
		addDate = self.parseDateStr(dateStr)

		item = {}
		item["retreivalTime"] = calendar.timegm(addDate.timetuple())
		item["sourceUrl"] = chapter.a["href"]

		if not "http://bato.to/reader#" in item["sourceUrl"]:
			return False

		return item

	def fetchItemsForSeries(self, seriesUrl, historical):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading items from '%s'", seriesUrl)

		soup = self.wg.getSoup(seriesUrl)

		# Find the divs containing either new files, or the day a file was uploaded
		itemRows = soup.find_all("tr", class_=re.compile("chapter_row"))

		items = 0
		newItems = 0

		ret = []

		for itemRow in itemRows:
			item = self.getItemFromSeriesPageContainer(itemRow)

			if item:
				items += 1

				# Only fetch an item if it's less then 48 hours old, or we're running
				# in historical mode (which means fetch all the things)
				# if item["retreivalTime"] > (time.time() - 60*60*48) or historical:

				# Fukkit, just grab everything.
				if True:
					newItems += 1
					ret.append(item)

		self.log.info("Found %s of %s items recent enough to download for %s.", newItems, items, seriesUrl)

		return ret

	def getItemsFromSeriesUrls(self, seriesItems, historical):
		ret = []
		self.log.info("Have %s items to fetch data for.", len(seriesItems))

		with ThreadPoolExecutor(max_workers=2) as executor:
			tmp = []
			for seriesUrl in seriesItems:
				tmp.append(executor.submit(self.fetchItemsForSeries, seriesUrl, historical))

			for future in tmp:
				# items = self.fetchItemsForSeries(seriesUrl, historical)
				items = future.result()
				for item in items:
					ret.append(item)
				if not runStatus.run:
					self.log.info("Breaking due to exit flag being set")
					break

		return ret

	def getSeriesUrl(self, row):
		cells = row.find_all("td")
		if len(cells) == 2:
			return cells.pop(0).a['href']
		return None

	def getFeed(self, rangeOverride=None, rangeOffset=None, historical=False):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading BT Main Feed")

		seriesPages = []

		if not rangeOverride:
			dayDelta = 2
		else:
			dayDelta = int(rangeOverride)
		if not rangeOffset:
			rangeOffset = 0

		seriesPages = set()

		for daysAgo in range(1, dayDelta + 1):
			url = self.feedUrl % (daysAgo + rangeOffset)
			page = self.wg.getpage(url)
			soup = bs4.BeautifulSoup(page, "lxml")

			# Find the divs containing either new files, or the day a file was uploaded
			itemRow = soup.find_all("tr", class_=re.compile("row[01]"))

			for row in itemRow:
				item = self.getSeriesUrl(row)
				if item:
					seriesPages.add(item)

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		ret = self.getItemsFromSeriesUrls(seriesPages, historical)

		return ret

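# --- Standalone sketch (not part of the plugin): the relative-date normalisation that
# parseDateStr() above relies on. "an hour ago"-style strings are rewritten to "1 hour ago"
# before the count is split off. The inputs are illustrative examples.
import datetime

def parse_relative(in_str):
	in_str = in_str.strip()
	if in_str.lower().startswith("an "):
		in_str = "1" + in_str[2:]
	elif in_str.lower().startswith("a "):
		in_str = "1" + in_str[1:]
	count = int(in_str.split()[0])
	if "hour" in in_str:
		return datetime.datetime.now() - datetime.timedelta(hours=count)
	return datetime.datetime.now() - datetime.timedelta(days=count)

print(parse_relative("an hour ago"))
print(parse_relative("3 days ago"))
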
class FeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Sura.Fl"
	pluginName = "Sura's Place Link Retreiver"
	tableKey = "sura"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://www.surasplace.com/"
	feedUrl = "http://www.surasplace.com/index.php/projects/popular/page{num}.html"

	def getSeriesPages(self):
		page = 1
		links = set()
		hadNew = True
		while (hadNew):
			hadNew = False
			url = self.feedUrl.format(num=page)

			soup = self.wg.getSoup(url)
			divs = soup.find_all("div", class_='lsrow')
			for div in divs:
				header = div.find("div", class_='header')
				if not header.find("span", itemprop='name'):
					continue

				itemUrl = header.h3.a['href']
				itemName = header.h3.a.span.get_text()

				fullUrl = urllib.parse.urljoin(self.urlBase, itemUrl)

				# Apparently content is added manually, leading to some broken URLs.
				# anyways, fix those as they crop up.
				if fullUrl.endswith("htmll"):
					fullUrl = fullUrl[:-1]
				for x in range(len(fullUrl)):
					if fullUrl[x:] == fullUrl[:x]:
						fullUrl = fullUrl[x:]
						break

				if not fullUrl in links:
					links.add(fullUrl)
					hadNew |= True

			page += 1

		self.log.info("Found %s series-like items.", len(links))
		return links

	def extractItemInfo(self, soup):
		ret = {}

		titleDiv = soup.find("span", itemprop="name")
		ret["title"] = titleDiv.get_text()

		# Holy shit, unique IDs for each metadata field. Halle-f*****g-lujah
		tags = soup.find("div", id='field_28')
		tagitems = []
		if tags:
			for item in tags.find_all("a", class_='tag'):
				tag = item.get_text().strip()
				while "  " in tag:
					tag = tag.replace("  ", " ")
				tag = tag.replace(" ", "-").lower()
				# print("Text:", tag)
				tagitems.append(tag)
		ret["tags"] = " ".join(tagitems)

		return ret

	def getItemPages(self, url):
		soup = self.wg.getSoup(url.strip(), addlHeaders={'Referer': 'http://www.surasplace.com/index.php/projects.html'})
		baseInfo = self.extractItemInfo(soup)

		ret = []

		contents = soup.find("div", class_='listing-desc')
		items = contents.find_all("td")
		for link in items:
			if not link.a:
				continue
			# print(link)

			item = {}
			item["sourceUrl"] = link.a["href"].strip()
			item["seriesName"] = baseInfo["title"]
			item["tags"] = baseInfo["tags"]
			item["retreivalTime"] = time.time()

			ret.append(item)

		return ret

	def getAllItems(self):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading Mc Items")

		ret = []

		seriesPages = self.getSeriesPages()

		for itemUrl in seriesPages:
			itemList = self.getItemPages(itemUrl)
			for itemUrl in itemList:
				ret.append(itemUrl)
			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s total items", len(ret))
		return ret

	def go(self):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getAllItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

class Scrape(TextScrape.SiteArchiver.SiteArchiver):
	tableKey = 'japtem'
	loggerPath = 'Main.Text.JapTem.Scrape'
	pluginName = 'JapTemScrape'

	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")

	threads = 4

	feeds = ['http://japtem.com/feed/']

	baseUrl = [
		"http://japtem.com/",
		"http://www.japtem.com/",
	]

	startUrl = baseUrl

	badwords = [
		"/viewtopic.php",
		"/memberlist.php",
		"/search.php",
		"/wp-content/plugins/",
		"/styles/prosilver/theme/",
		"/forums/",
		"/forum/",
		"/cdn-cgi/",
		"/help/",
		"?share=",
		"?popup=",
		"viewforum.php",
		"/wp-login.php",
		"/#comments",	# Ignore in-page anchor tags
		"/staff/",
	]

	positive_keywords = ['main_content']

	negative_keywords = [
		'mw-normal-catlinks',
		"printfooter",
		"mw-panel",
		'portal',
	]

	decomposeBefore = [
		{'id': 'disqus_thread'},
	]

	decompose = [
		{'class': 'slider-container'},
		{'class': 'secondarymenu-container'},
		{'class': 'mainmenu-container'},
		{'class': 'mobile-menu'},
		{'class': 'footer'},
		{'class': 'sidebar'},
		{'class': 'disqus_thread'},
		{'class': 'sharedaddy'},
		{'class': 'pagination'},
		{'class': 'scrollUp'},

		{'id': 'slider-container'},
		{'id': 'secondarymenu-container'},
		{'id': 'mainmenu-container'},
		{'id': 'mobile-menu'},
		{'id': 'footer'},
		{'id': 'sidebar'},
		{'id': 'disqus_thread'},
		{'id': 'sharedaddy'},
		{'id': 'scrollUp'},
	]

class JzFeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Jz.Fl"
	pluginName = "Japanzai Link Retreiver"
	tableKey = "jz"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	urlBase = "http://download.japanzai.com/"
	tableName = "MangaItems"

	def checkLogin(self):
		pass

	def closeDB(self):
		self.log.info("Closing DB...", )
		self.conn.close()
		self.log.info("done")

	def quoteUrl(self, url):
		# print("InUrl = '%s'" % url)
		scheme, netloc, path, params, query, fragment = urllib.parse.urlparse(url)
		# print((scheme, netloc, path, params, query, fragment))
		path = urllib.parse.quote(path)
		params = urllib.parse.quote(params)
		query = urllib.parse.quote(query, safe="/=")
		fragment = urllib.parse.quote(fragment)
		# print((scheme, netloc, path, params, query, fragment))
		url = urllib.parse.urlunparse((scheme, netloc, path, params, query, fragment))
		# print("outUrl = '%s'" % url)
		return url

	def getItemsFromContainer(self, seriesName, seriesUrl):
		self.log.info("Fetching items for series '%s'", seriesName)
		self.log.info("Using URL '%s'", seriesUrl)

		itemPage = self.wg.getpage(seriesUrl)
		soup = bs4.BeautifulSoup(itemPage, "lxml")

		linkLis = soup.find_all("li", class_="file")

		ret = []
		for linkLi in linkLis:
			item = {}

			dlUrl = urllib.parse.urljoin(seriesUrl, self.quoteUrl(linkLi.a["href"]))

			item["retreivalTime"] = time.time()
			item["originName"] = linkLi.a.get_text().rsplit("-")[0].strip()
			item["sourceUrl"] = dlUrl
			item["seriesName"] = seriesName

			ret.append(item)

		moreDirs = self.getSeriesPages(soup, seriesUrl)

		return moreDirs, ret

	def getSeriesPages(self, soup, urlBase):
		linkLis = soup.find_all("li", class_="directory")

		ret = []
		for linkLi in linkLis:
			series = linkLi.a.get_text()
			if series == "..":
				continue
			url = urllib.parse.urljoin(urlBase, self.quoteUrl(linkLi.a["href"]))
			ret.append((series, url))

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				return

		return ret

	def getMainItems(self):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading Japanzai Main Feed")

		ret = []

		basePage = self.wg.getpage(self.urlBase)
		soup = bs4.BeautifulSoup(basePage, "lxml")
		seriesPages = self.getSeriesPages(soup, self.urlBase)

		while len(seriesPages):
			seriesName, seriesUrl = seriesPages.pop()

			try:
				newDirs, newItems = self.getItemsFromContainer(seriesName, seriesUrl)

				for newDir in newDirs:
					seriesPages.append(newDir)

				for newItem in newItems:
					ret.append(newItem)
			except urllib.error.URLError:
				self.log.error("Failed to retreive page at url '%s'", seriesUrl)
				self.log.error(traceback.format_exc())

		return ret

	def go(self):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getMainItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

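# --- Standalone sketch (not part of the plugin): the per-component URL quoting that quoteUrl()
# above performs, so directory listings with spaces or unicode in their paths become fetchable.
# The input URL is an invented example.
import urllib.parse

def quote_url(url):
	scheme, netloc, path, params, query, fragment = urllib.parse.urlparse(url)
	path = urllib.parse.quote(path)
	params = urllib.parse.quote(params)
	query = urllib.parse.quote(query, safe="/=")
	fragment = urllib.parse.quote(fragment)
	return urllib.parse.urlunparse((scheme, netloc, path, params, query, fragment))

print(quote_url("http://download.example.invalid/Some Series/ch 01.zip"))
# http://download.example.invalid/Some%20Series/ch%2001.zip
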
class TriggerLoader(ScrapePlugins.IrcGrabber.IrcQueueBase.IrcQueueBase):

	loggerPath = "Main.Manga.Iro.Fl"
	pluginName = "IrcOffer site Link Retreiver"
	tableKey = "irc-irh"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"

	feedUrls = [
		("http://blah.hawt.co/", "blahmanga"),
		("http://stupidcommotion.net/index.php?group=*", "stupidcommotion"),
		("http://stupidcommotion.net/torako.php?group=*", "stupidcommotion"),
	]

	def getBot(self, packUrl, channel):
		server = "irchighway"

		self.log.info("Fetching page")
		soup = self.wg.getSoup(packUrl)
		self.log.info("Done. Searching")

		header = soup.h1.get_text().strip()
		botname = header.split()[0]
		# print("Header = ", header, "bot = ", botname)

		mainTable = soup.find("table", summary="list")

		ret = []
		for row in mainTable.find_all("tr"):
			item = {}
			rowItems = row.find_all("td")
			if len(rowItems) == 4:
				pkgNum, dummy_dlcnt, size, info = rowItems
				item["pkgNum"] = pkgNum.get_text().strip("#").strip()
				item["server"] = server
				item["channel"] = channel

				sizeText = size.get_text().strip()

				# Skip all files that have sizes in bytes (just header files and shit)
				if "b" in sizeText.lower():
					continue
				if "k" in sizeText.lower():
					item["size"] = float(sizeText.lower().strip("k").strip()) / 1000.0
				elif "g" in sizeText.lower():
					item["size"] = float(sizeText.lower().strip("g").strip()) * 1000.0
				else:
					item["size"] = float(sizeText.lower().strip("m").strip())

				item["botName"] = botname

				if info.find("span", class_="selectable"):
					fname = info.find("span", class_="selectable").get_text().strip()
				elif info.find("a"):
					fname = info.a.get_text().strip().split(" ", 2)[-1]
				else:
					raise ValueError

				item["fName"] = fname

				# I'm using the filename+botname for the unique key to the database.
				itemKey = item["fName"] + item["botName"]

				# Skip video files.
				badExts = ['.mkv', '.mp4', '.avi', '.wmv']
				if any([item["fName"].endswith(skipType) for skipType in badExts]):
					# print("Skipping", item)
					continue

				# print(item)
				item = json.dumps(item)
				ret.append((itemKey, item))

			# else:
			# 	print("Bad row? ", row)

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s items", len(ret))
		return ret

	def getMainItems(self):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading IrcOffer Main Feeds")

		ret = []

		for url, channel in self.feedUrls:
			ret += self.getBot(url, channel)

		self.log.info("All data loaded")
		return ret

class FeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Mh.Fl"
	pluginName = "MangaHere Link Retreiver"
	tableKey = "mh"
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://www.mangahere.co/"
	seriesBase = "http://www.mangahere.co/latest/"

	def closeDB(self):
		self.log.info("Closing DB...", )
		self.conn.close()
		self.log.info("done")

	def getUpdatedSeries(self, url):
		ret = set()

		soup = self.wg.getSoup(url)

		if soup.find("div", class_='manga_updates'):
			mainDiv = soup.find("div", class_='manga_updates')
		else:
			raise ValueError("Could not find listing table?")

		for child in mainDiv.find_all("dl"):
			if child.dt:
				seriesUrl = urllib.parse.urljoin(self.urlBase, child.dt.a['href'])
				ret.add(seriesUrl)

		self.log.info("Found %s series", len(ret))

		return ret

	def getUpdatedSeriesPages(self):
		# Historical stuff goes here, if wanted.

		self.log.info("Loading MangaHere Items")

		pages = self.getUpdatedSeries(self.seriesBase)

		self.log.info("Found %s total items", len(pages))
		return pages

	# Check retreived page to see if it has a mature content warning
	# Step through if it does.
	# Returns page with actual content, either way
	def checkAdult(self, soup):
		adultPassThrough = soup.find("a", id='aYes')
		if not adultPassThrough:
			return soup

		self.log.info("Adult pass-through page. Stepping through")
		confirmLink = adultPassThrough['href']
		return self.wg.getSoup(confirmLink)

	def getSeriesInfoFromSoup(self, soup):
		# Should probably extract tagging info here. Laaaaazy
		# MangaUpdates interface does a better job anyways.
		titleA = soup.find("h1", class_='title')
		return {"seriesName": titleA.get_text().title()}

	def getChaptersFromSeriesPage(self, soup):
		table = soup.find('div', class_='detail_list')

		items = []
		for row in table.find_all("li"):
			if not row.a:
				continue	# Skip the table header row

			chapter = row.find("span", class_='left')
			date = row.find("span", class_='right')
			item = {}

			# Name is formatted "{seriesName} {bunch of spaces}\n{chapterName}"
			# Clean up that mess to "{seriesName} - {chapterName}"
			name = chapter.get_text().strip()
			name = name.replace("\n", " - ")
			while "  " in name:
				name = name.replace("  ", " ")

			item["originName"] = name
			item["sourceUrl"] = urllib.parse.urljoin(self.urlBase, chapter.a['href'])

			dateStr = date.get_text().strip()
			itemDate, status = parsedatetime.Calendar().parse(dateStr)
			if status != 1:
				continue
			item['retreivalTime'] = calendar.timegm(itemDate)
			items.append(item)

		return items

	def getChapterLinkFromSeriesPage(self, seriesUrl):
		ret = []
		soup = self.wg.getSoup(seriesUrl)
		soup = self.checkAdult(soup)

		seriesInfo = self.getSeriesInfoFromSoup(soup)

		chapters = self.getChaptersFromSeriesPage(soup)

		for chapter in chapters:
			for key, val in seriesInfo.items():	# Copy series info into each chapter
				chapter[key] = val
			ret.append(chapter)

		self.log.info("Found %s items on page for series '%s'", len(ret), seriesInfo['seriesName'])

		return ret

	def getAllItems(self):
		toScan = self.getUpdatedSeriesPages()

		ret = []
		for url in toScan:
			items = self.getChapterLinkFromSeriesPage(url)
			for item in items:
				if item in ret:
					raise ValueError("Duplicate items in ret?")
				ret.append(item)

		return ret

	def go(self):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getAllItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

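# --- Standalone sketch (not part of the plugin): the date handling in getChaptersFromSeriesPage()
# above. parsedatetime returns a time.struct_time plus a status flag, and the struct is converted
# to a unix timestamp with calendar.timegm(). The date string is an illustrative example.
import calendar
import parsedatetime

date_struct, status = parsedatetime.Calendar().parse("Jan 12, 2015")
if status == 1:		# status 1 means a date (with no time component) was recognised
	print(calendar.timegm(date_struct))
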
class ContentLoader(ScrapePlugins.RetreivalBase.RetreivalBase):

	dbName = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.ASMHentai.Cl"
	pluginName = "ASMHentai Content Retreiver"
	tableKey = "asmh"
	urlBase = "https://asmhentai.com/"
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "HentaiItems"
	retreivalThreads = 6
	itemLimit = 220

	shouldCanonize = False

	def getFileName(self, soup):
		title = soup.find("h1", class_="otitle")
		if not title:
			raise ValueError("Could not find title. Wat?")
		return title.get_text()

	def build_links(self, imtag, selector):
		imgurl = imtag['src']
		imgurl = urllib.parse.urljoin(self.urlBase, imgurl)

		# This is brittle too
		urlprefix, fname = imgurl.rsplit("/", 1)
		fname, fext = os.path.splitext(fname)

		ret = []
		for item in selector.find_all('option'):
			if item.get("value"):
				pageurl = urllib.parse.urljoin(self.urlBase, item.get("value"))
				pagenum = pageurl.strip("/").split("/")[-1]
				imgurl = urlprefix + "/" + str(pagenum) + fext
				ret.append((imgurl, pageurl))

		return ret

	def getDownloadInfo(self, linkDict, retag=False):
		sourcePage = linkDict["sourceUrl"]

		self.log.info("Retreiving item: %s", sourcePage)

		try:
			soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': self.urlBase})
		except Exception:
			self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"])
			raise IOError("Invalid webpage")

		linkDict['dirPath'] = os.path.join(settings.asmhSettings["dlDir"], linkDict['seriesName'])

		if not os.path.exists(linkDict["dirPath"]):
			os.makedirs(linkDict["dirPath"])
		else:
			self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

		self.log.info("Folderpath: %s", linkDict["dirPath"])
		#self.log.info(os.path.join())

		read_link = soup.find("a", href=re.compile(r"/gallery/\d+?/\d+?/", re.IGNORECASE))

		nav_to = urllib.parse.urljoin(self.urlBase, read_link['href'])

		soup = self.wg.getSoup(nav_to, addlHeaders={'Referer': sourcePage})

		if soup.find_all("div", class_="g-recaptcha"):
			raise ScrapeExceptions.LimitedException

		selector = soup.find('select', class_='pag_info')
		imgdiv = soup.find('div', id='img')
		imtag = imgdiv.find('img')
		linkDict['originName'] = imtag['alt']

		imageUrls = self.build_links(imtag, selector)

		self.log.info("Found %s image urls!", len(imageUrls))

		linkDict["dlLinks"] = imageUrls

		self.log.debug("Linkdict = ")
		for key, value in list(linkDict.items()):
			self.log.debug(" %s - %s", key, value)

		return linkDict

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0)
		return fileN, content

	def fetchImages(self, linkDict):
		images = []
		for imgUrl, referrerUrl in linkDict["dlLinks"]:
			images.append(self.getImage(imgUrl, referrerUrl))
		return images

	def doDownload(self, linkDict, link, retag=False):
		images = self.fetchImages(linkDict)

		# self.log.info(len(content))

		if images:
			fileN = linkDict['originName'] + ".zip"
			fileN = nt.makeFilenameSafe(fileN)

			# self.log.info("geturl with processing", fileN)
			wholePath = os.path.join(linkDict["dirPath"], fileN)
			self.log.info("Complete filepath: %s", wholePath)

			#Write all downloaded files to the archive.
			chop = len(fileN) - 4
			wholePath = "ERROR"
			while 1:
				try:
					fileN = fileN[:chop] + fileN[-4:]
					# self.log.info("geturl with processing", fileN)
					wholePath = os.path.join(linkDict["dirPath"], fileN)
					wholePath = self.insertCountIfFilenameExists(wholePath)
					self.log.info("Complete filepath: %s", wholePath)

					#Write all downloaded files to the archive.
					arch = zipfile.ZipFile(wholePath, "w")
					for imageName, imageContent in images:
						arch.writestr(imageName, imageContent)
					arch.close()

					self.log.info("Successfully Saved to path: %s", wholePath)
					break
				except IOError:
					chop = chop - 1
					self.log.warn("Truncating file length to %s characters.", chop)

			if not linkDict["tags"]:
				linkDict["tags"] = ""

			self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

			# Deduper uses the path info for relinking, so we have to dedup the item after updating the downloadPath and fileN
			dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, pron=True, rowId=link['dbId'])

			self.log.info("Done")

			if dedupState:
				self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

			self.updateDbEntry(linkDict["sourceUrl"], dlState=2)

			return wholePath
		else:
			self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
			return False

	def getLink(self, link):
		try:
			self.updateDbEntry(link["sourceUrl"], dlState=1)
			linkInfo = self.getDownloadInfo(link)
			self.doDownload(linkInfo, link)
		except urllib.error.URLError:
			self.log.error("Failure retreiving content for link %s", link)
			self.log.error("Traceback: %s", traceback.format_exc())
			self.updateDbEntry(link["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		except IOError:
			self.log.error("Failure retreiving content for link %s", link)
			self.log.error("Traceback: %s", traceback.format_exc())
			self.updateDbEntry(link["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: MISSING")

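# --- Standalone sketch (not part of the plugin): the URL construction in build_links() above.
# The gallery's page selector supplies page numbers, and each full-size image URL is rebuilt by
# swapping the page number into the filename of the first image. The URLs below are fabricated
# placeholders.
import os.path

first_img = "https://images.example.invalid/galleries/42/1.jpg"
page_urls = ["https://asmhentai.com/gallery/42/%d/" % n for n in (1, 2, 3)]

prefix, fname = first_img.rsplit("/", 1)
_, ext = os.path.splitext(fname)

links = []
for page_url in page_urls:
	page_num = page_url.strip("/").split("/")[-1]
	links.append((prefix + "/" + page_num + ext, page_url))

print(links[:2])
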
class DbLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	dbName = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.CrunchyRoll.Fl"
	pluginName = "CrunchyRoll Link Retreiver"
	tableKey = "cr"
	urlBase = "http://www.crunchyroll.com/"
	urlFeed = "http://www.crunchyroll.com/comics/manga/updated"
	ajaxRoot = "http://www.crunchyroll.com/ajax/"
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"

	def getInfo(self, inMarkup):
		ret = {}
		soup = bs4.BeautifulSoup(inMarkup)
		header = soup.find("h1", class_='ellipsis')

		# Remove the leading breadcrumb link
		header.a.decompose()

		name = header.get_text()
		name = name.lstrip("> ").strip()

		ret["seriesName"] = name
		ret['retreivalTime'] = time.time()
		return ret

	def extractItemPage(self, page):
		# Extract the information needed to determine the ajax call that will let us get the
		# recent items for the series.
		if not page:
			return False

		indiceQuery = re.compile(r'var next_first_visible = (\d+);')
		jsFrag = re.compile(r" ajax_root: '/ajax/\?req=RpcApiManga_GetMangaCollectionCarouselPage',.+?},.+callback: function\(resp\)", re.DOTALL)

		indice = indiceQuery.search(page)
		frag = jsFrag.search(page)

		if not indice or not frag:
			return None

		paramRe = re.compile(r'params_obj: ({.+})', re.DOTALL)
		urlParams = paramRe.search(frag.group(0))
		if not urlParams:
			return None

		# YAML insists on a space after a colon. Since our input is
		# really a js literal which doesn't need (or have) those spaces,
		# we fudge the space in to make PyYAML not error.
		params = urlParams.group(1).replace(":", ": ")
		params = yaml.safe_load(params)

		params['first_index'] = indice.group(1)
		params['req'] = "RpcApiManga_GetMangaCollectionCarouselPage"

		ajaxUrl = '%s?%s' % (self.ajaxRoot, urllib.parse.urlencode(params))

		page = self.wg.getpage(ajaxUrl)
		if not page:
			return False

		return page

	def extractUrl(self, page):
		mangaCarousel = self.extractItemPage(page)

		if not mangaCarousel:
			return False

		# There is some XSS (I think?) blocking stuff, namely the whole AJAX response is
		# wrapped in comments to protect from certain parsing attacks or something?
		# Anyways, get rid of that.
		mangaCarousel = mangaCarousel.replace("/*-secure-", "").replace("*/", "")

		data = json.loads(mangaCarousel)

		if data['result_code'] != 1:
			# Failure?
			return False

		if not data['data']:
			return False

		# print(data['data'].keys())
		raw = ''.join(data['data'].values())

		soup = bs4.BeautifulSoup(raw)
		links = soup.find_all("a")

		ret = []
		for link in links:
			if 'comics_read' in link['href']:
				link = urllib.parse.urljoin(self.urlBase, link['href'])
				ret.append(link)

		return ret

	def parseItem(self, pageUrl):
		page = self.wg.getpage(pageUrl)

		info = self.getInfo(page)
		ctntUrl = self.extractUrl(page)
		if not ctntUrl:
			return []

		ret = []
		for url in ctntUrl:
			item = {'sourceUrl': url}
			item.update(info)
			ret.append(item)

		self.log.info("Found %s accessible items on page!", len(ret))
		for item in ret:
			self.log.info(" Item: '%s'", item)
		return ret

	def getFeed(self):
		soup = self.wg.getSoup(self.urlFeed)
		if not soup:
			return []

		mainDiv = soup.find("div", id="main_content")
		lis = mainDiv.find_all("li", class_='group-item')

		ret = []
		for listItem in lis:
			itemUrl = urllib.parse.urljoin(self.urlBase, listItem.a['href'])
			for item in self.parseItem(itemUrl):
				ret.append(item)

		return ret

	def go(self):
		self.resetStuckItems()
		dat = self.getFeed()
		self.processLinksIntoDB(dat)

class IMSTriggerLoader(ScrapePlugins.M.IrcGrabber.IrcQueueBase.IrcQueueBase):

	loggerPath = "Main.Manga.IMS.Fl"
	pluginName = "IMangaScans Link Retreiver"
	tableKey = "irc-irh"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"

	feedUrl = "https://imangascans.org/icebox/"

	extractRe = re.compile(r"packlist\.packs\[\d+\] = ({.*?});")

	# def getItemFromLine(self, line):
	# 	match = self.extractRe.search(line)
	# 	if not match:
	# 		raise ValueError("No data found in line %s" % line)
	# 	data = match.group(1)
	# 	data = data.replace(":", ": ")
	# 	data = yaml.safe_load(data)
	# 	print("Data", data)
	# 	pass

	def getMainItems(self, rangeOverride=None, rangeOffset=None):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading iMangaScans Main Feed")

		ret = []

		url = self.feedUrl
		page = self.wg.getpage(url)
		page = page.strip()

		matches = self.extractRe.findall(page)

		yamlData = "[%s]" % (", ".join(matches))

		# we need to massage the markup a bit to make it parseable by PyYAML.
		# Basically, the raw data looks like:
		# {b:"Suzume", n:2180, s:7, f:"Chinatsu_no_Uta_ch23_[VISCANS].rar"};
		# but {nnn}:{nnn} is not valid, YAML requires a space after the ":"
		# Therefore, we just replace ":" with ": "
		yamlData = yamlData.replace(":", ": ")

		self.log.info("Doing YAML data load")
		data = yaml.load(yamlData, Loader=yaml.CLoader)

		ims_botname = "[ims]icebox"		# Hardcoded. Bad idea?

		for item in data:
			item["server"] = "irchighway"
			item["channel"] = "imangascans"

			# rename a few keys that are rather confusing
			item["size"] = item.pop("size")
			item["pkgNum"] = item.pop("number")
			item["botName"] = ims_botname
			item["fName"] = item.pop("name")

			# I'm using the filename+botname for the unique key to the database.
			itemKey = item["fName"] + item["botName"]

			item = json.dumps(item)
			ret.append((itemKey, item))

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("All data loaded")
		return ret

	def go(self):
		self._resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getMainItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

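# --- Standalone sketch (not part of the plugin): the packlist massaging described in the
# comments of getMainItems() above. The JS object literals lack a space after ":", which PyYAML
# requires, so ":" is padded before parsing. The sample line is modelled on the format shown in
# the comment, not real packlist data, and safe_load is used here for the sketch.
import yaml

raw = 'packlist.packs[0] = {b:"Suzume", n:2180, s:7, f:"Chinatsu_no_Uta_ch23_[VISCANS].rar"};'
payload = raw.split("= ", 1)[1].rstrip(";")
data = yaml.safe_load(payload.replace(":", ": "))
print(data["f"], data["n"])
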
class FeedLoader(ScrapePlugins.LoaderBase.LoaderBase):

	loggerPath = "Main.Manga.Kw.Fl"
	pluginName = "Kawaii-Scans Link Retreiver"
	tableKey = "kw"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://kawaii.ca/"
	feedUrl = "http://kawaii.ca/reader/"

	def getItemPages(self, url, title):
		# print("Should get item for ", url)
		soup = self.wg.getSoup(url)

		ret = []

		pager = soup.find("div", class_="pager")
		spans = pager.find_all('span')
		if len(spans) != 3:
			self.log.error("Invalid span items! Page: '%s'", url)
			return ret

		dummy_series, chapter, dummy_page = spans

		# First string in the tag should be "Chapter".
		assert 'Chapter' in list(chapter.stripped_strings)[0]

		for option in chapter.find_all("option"):
			item = {}

			chapUrl = '{series}/{chapter}'.format(series=url, chapter=option['value'])
			chapTitle = option.get_text()

			item["originName"] = "{series} - {file}".format(series=title, file=chapTitle)
			item["sourceUrl"] = chapUrl
			item["seriesName"] = title
			# There is no upload date information
			item["retreivalTime"] = time.time()

			ret.append(item)

		return ret

	def getSeriesUrls(self):
		ret = []

		print("wat?")
		soup = self.wg.getSoup(self.feedUrl)
		div = soup.find("div", class_="pager")
		for option in div.find_all('option'):
			if option['value'] == '0':
				continue
			url = 'http://kawaii.ca/reader/{manga}'.format(manga=option['value'])
			ret.append((url, option.get_text()))

		return ret

	def getFeed(self):
		self.log.info("Loading Mc Items")

		ret = []

		seriesPages = self.getSeriesUrls()

		for url, title in seriesPages:
			itemList = self.getItemPages(url, title)
			for item in itemList:
				ret.append(item)
			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s total items", len(ret))
		return ret

class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase):

	retreivalThreads = 1

	loggerPath = "Main.Manga.GoS.Cl"
	pluginName = "Game of Scanlation Scans Content Retreiver"
	tableKey = "gos"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "https://gameofscanlation.moe/"
	seriesBase = "https://gameofscanlation.moe/projects/"

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0)
		return fileN, content

	def getImageUrls(self, baseUrl):
		pages = set()

		soup = self.wg.getSoup(baseUrl)
		imagesDiv = soup.find('div', class_='chapterPages')
		images = imagesDiv.find_all('img', class_='avatar')

		pageno = 1
		for image in images:
			src = image['src']
			if "pagespeed" in src:
				scheme, netloc, path, query, fragment = urllib.parse.urlsplit(src)
				root, filename = os.path.split(path)
				filename = filename.split(".pagespeed.")[0]
				if filename.startswith("x"):
					filename = filename[1:]
				path = os.path.join(root, filename)
				src = urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))

			pages.add((pageno, src))
			pageno += 1

		return pages

	def getLink(self, link):
		sourceUrl = link["sourceUrl"]
		seriesName = link["seriesName"]
		chapterVol = link["originName"]

		try:
			self.log.info("Should retreive url - %s", sourceUrl)
			self.updateDbEntry(sourceUrl, dlState=1)

			imageUrls = self.getImageUrls(sourceUrl)
			if not imageUrls:
				self.log.critical("Failure on retreiving content at %s", sourceUrl)
				self.log.critical("Page not found - 404")
				self.updateDbEntry(sourceUrl, dlState=-1)
				return

			self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, chapterVol, len(imageUrls))
			dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

			if link["flags"] == None:
				link["flags"] = ""

			if newDir:
				self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
				self.conn.commit()

			chapterName = nt.makeFilenameSafe(chapterVol)

			fqFName = os.path.join(dlPath, chapterName + " [GameOfScanlation.moe].zip")

			loop = 1
			while os.path.exists(fqFName):
				fqFName, ext = os.path.splitext(fqFName)
				fqFName = "%s (%d)%s" % (fqFName, loop, ext)
				loop += 1
			self.log.info("Saving to archive = %s", fqFName)

			images = []
			for imgNum, imgUrl in imageUrls:
				imageName, imageContent = self.getImage(imgUrl, referrer=sourceUrl)
				images.append([imgNum, imageName, imageContent])

				if not runStatus.run:
					self.log.info("Breaking due to exit flag being set")
					self.updateDbEntry(sourceUrl, dlState=0)
					return

			self.log.info("Creating archive with %s images", len(images))

			if not images:
				self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404")
				return

			#Write all downloaded files to the archive.
			arch = zipfile.ZipFile(fqFName, "w")
			for imgNum, imageName, imageContent in images:
				arch.writestr("{:03} - {}".format(imgNum, imageName), imageContent)
			arch.close()

			dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True)

			self.log.info("Done")

			filePath, fileName = os.path.split(fqFName)
			self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState)
			return

		except Exception:
			self.log.critical("Failure on retreiving content at %s", sourceUrl)
			self.log.critical("Traceback = %s", traceback.format_exc())
			self.updateDbEntry(sourceUrl, dlState=-1)

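# --- Standalone sketch (not part of the plugin): the PageSpeed URL clean-up in getImageUrls()
# above. Rewritten image paths like ".../xpage01.png.pagespeed.ic.HASH.png" are reduced back to
# the original filename. The example path is fabricated.
import os.path
import urllib.parse

src = "https://cdn.example.invalid/chapters/001/xpage01.png.pagespeed.ic.AbCdEf.png"

scheme, netloc, path, query, fragment = urllib.parse.urlsplit(src)
root, filename = os.path.split(path)
filename = filename.split(".pagespeed.")[0]
if filename.startswith("x"):
	filename = filename[1:]
clean = urllib.parse.urlunsplit((scheme, netloc, os.path.join(root, filename), query, fragment))
print(clean)   # https://cdn.example.invalid/chapters/001/page01.png
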
class FeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Wt.Fl"
	pluginName = "Webtoons.com Scans Link Retreiver"
	tableKey = "wt"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web")
	tableName = "MangaItems"
	urlBase = "http://www.webtoons.com/"
	seriesBase = "http://www.webtoons.com/genre"

	def closeDB(self):
		self.log.info("Closing DB...",)
		self.conn.close()
		self.log.info("done")

	def extractItemInfo(self, soup):
		ret = {}

		titleH = soup.find("h1", class_='subj')
		# print(titleH)
		titleH.div.decompose()

		# titleDiv = soup.find("h1", class_="ttl")
		ret["title"] = titleH.get_text().strip()

		return ret

	def getItemPages(self, pageUrl, historical=False):
		self.log.info("Should get item for '%s'", pageUrl)

		urlFormat = '%s&page={num}' % pageUrl

		pageNo = 1
		ret = []
		while 1:
			soup = self.wg.getSoup(urlFormat.format(num=pageNo))
			baseInfo = self.extractItemInfo(soup)

			listDiv = soup.find_all("div", class_="detail_lst")
			if len(listDiv) != 1:
				raise ValueError("Found incorrect number of detail list div items! %s" % len(listDiv))
			listDiv = listDiv[0]

			hadNew = False
			for listItem in listDiv.find_all("li"):
				if not listItem.a:
					continue

				chapSpan = listItem.find("span", class_='subj')
				if chapSpan.em:
					chapSpan.em.decompose()
				chapTitle = chapSpan.get_text().strip()

				# Fix stupid chapter naming
				chapTitle = chapTitle.replace("Ep. ", "c")

				dateSpan = listItem.find("span", class_='date')
				date = dateutil.parser.parse(dateSpan.get_text().strip(), fuzzy=True)

				item = {}

				url = listItem.a["href"]
				url = urllib.parse.urljoin(self.urlBase, url)

				item["originName"] = "{series} - {file}".format(series=baseInfo["title"], file=chapTitle)
				item["sourceUrl"] = url
				item["seriesName"] = baseInfo["title"]
				item["retreivalTime"] = time.mktime(date.timetuple())

				if not item in ret:
					hadNew = True
					ret.append(item)

			if not historical:
				break
			if not hadNew:
				break

			pageNo += 1

		self.log.info("Found %s chapters for series '%s'", len(ret), baseInfo["title"])

		return ret

	def getSeriesUrls(self):
		ret = set()

		soup = self.wg.getSoup(self.seriesBase)

		lists = soup.find_all("ul", class_='card_lst')
		for subList in lists:
			for series in subList.find_all("li"):
				url = urllib.parse.urljoin(self.urlBase, series.a['href'])
				ret.add(url)

		# if td.a:
		# 	link = td.a["href"]
		# 	if self.urlBase in link:
		# 		ret.append(link)

		self.log.info("Found %s series", len(ret))

		return ret

	def getAllItems(self, historical=False):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info( "Loading Red Hawk Items")

		ret = []

		seriesPages = self.getSeriesUrls()

		for item in seriesPages:
			itemList = self.getItemPages(item, historical=historical)
			for item in itemList:
				ret.append(item)
			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s total items", len(ret))
		return ret

	def go(self, historical=False):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getAllItems(historical=historical)
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

class HBrowseDbLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase): dbName = settings.DATABASE_DB_NAME loggerPath = "Main.Manga.HBrowse.Fl" pluginName = "H-Browse Link Retreiver" tableKey = "hb" urlBase = "http://www.hbrowse.com/" urlFeed = "http://www.hbrowse.com/list" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") tableName = "HentaiItems" def loadFeed(self, pageOverride=None): self.log.info("Retreiving feed content...", ) if not pageOverride: pageOverride = 1 try: # I really don't get the logic behind HBrowse's path scheme. urlPath = '/list/{num}'.format(num=pageOverride) pageUrl = urllib.parse.urljoin(self.urlBase, urlPath) page = self.wg.getpage(pageUrl) except urllib.error.URLError: self.log.critical("Could not get page from HBrowse!") self.log.critical(traceback.format_exc()) return "" return page def parseItem(self, row, timestamp): ret = {} ret['retreivalTime'] = timestamp ret['sourceUrl'] = urllib.parse.urljoin(self.urlBase, row.a["href"]) titleTd = row.find("td", class_='recentTitle') ret['originName'] = titleTd.get_text() return ret def extractDate(self, row): text = row.get_text() date = parser.parse(text) timestamp = time.mktime(date.timetuple()) return timestamp def getFeed(self, pageOverride=None): # for item in items: # self.log.info(item) # page = self.loadFeed(pageOverride) soup = bs4.BeautifulSoup(page) itemTable = soup.find("table", id="recentTable") rows = itemTable.find_all("tr") ret = [] for row in rows: if row.find("td", class_='recentDate'): curTimestamp = self.extractDate(row) elif row.find("td", class_='recentTitle'): # curTimestamp is specifically not pre-defined, because I want to fail noisily if I try # to parse a link row before seeing a valid date item = self.parseItem(row, curTimestamp) ret.append(item) return ret def go(self): self.resetStuckItems() dat = self.getFeed() self.processLinksIntoDB(dat)
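# A note on timestamp handling: the HBrowse loader above converts parsed dates with
# time.mktime(date.timetuple()), which interprets the value in the *local* timezone, while some
# of the other loaders in this collection use calendar.timegm(pdate.utctimetuple()), which treats
# it as UTC. Minimal standalone illustration of the difference (the example date is arbitrary):
import calendar
import time
import dateutil.parser

parsed = dateutil.parser.parse("2015-03-01 12:00")
local_stamp = time.mktime(parsed.timetuple())         # epoch seconds, local-time interpretation
utc_stamp   = calendar.timegm(parsed.utctimetuple())  # epoch seconds, UTC interpretation
# The two differ by the local UTC offset, so mixing them skews 'retreivalTime' ordering across plugins.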
class ContentLoader(ScrapePlugins.RetreivalBase.RetreivalBase): loggerPath = "Main.Manga.Ms.Cl" pluginName = "MangaStream.com Content Retreiver" tableKey = "ms" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web") retreivalThreads = 1 def getImage(self, imageUrl, referrer): if imageUrl.startswith("//"): imageUrl = "http:" + imageUrl content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer}) if not content or not handle: raise ValueError("Failed to retreive image from page '%s'!" % referrer) fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1]) fileN = bs4.UnicodeDammit(fileN).unicode_markup self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0) return fileN, content def getImageUrls(self, baseUrl): pages = set() nextUrl = baseUrl chapBase = baseUrl.rstrip('0123456789.') imnum = 1 while 1: soup = self.wg.getSoup(nextUrl) imageDiv = soup.find('div', class_='page') if not imageDiv.a: raise ValueError("Could not find imageDiv?") pages.add((imnum, imageDiv.img['src'], nextUrl)) nextUrl = imageDiv.a['href'] if not chapBase in nextUrl: break imnum += 1 self.log.info("Found %s pages", len(pages)) return pages def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link["seriesName"] chapterVol = link["originName"] try: self.log.info( "Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) imageUrls = self.getImageUrls(sourceUrl) if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, chapterVol, len(imageUrls)) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) chapterName = nt.makeFilenameSafe(chapterVol) fqFName = os.path.join(dlPath, chapterName+" [MangaStream.com].zip") loop = 1 while os.path.exists(fqFName): fqFName, ext = os.path.splitext(fqFName) fqFName = "%s (%d)%s" % (fqFName, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] for imgNum, imgUrl, referrerUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, referrerUrl) images.append([imgNum, imageName, imageContent]) if not runStatus.run: self.log.info( "Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imgNum, imageName, imageContent in images: arch.writestr("{:03} - {}".format(imgNum, imageName), imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, rowId=link['dbId']) self.log.info( "Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1)
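# A note on getImageUrls() in the MangaStream loader above: it follows the "next page" link on
# each reader page and stops once that link no longer shares the chapter's URL prefix. The prefix
# is derived by stripping the trailing page number (digits/dots) off the first page's URL.
# Minimal sketch with a hypothetical URL layout (the real path structure may differ):
baseUrl  = "http://mangastream.example/r/some_series/010/1234/1"
chapBase = baseUrl.rstrip('0123456789.')   # -> "http://mangastream.example/r/some_series/010/1234/"
nextUrl  = "http://mangastream.example/r/some_series/010/1234/2"
print(chapBase in nextUrl)                 # True  -> still inside this chapter, keep walking
nextUrl  = "http://mangastream.example/r/some_series/011/1301/1"
print(chapBase in nextUrl)                 # False -> crossed into the next chapter, stop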
class TriggerLoader(ScrapePlugins.M.IrcGrabber.IrcQueueBase.IrcQueueBase):

	loggerPath = "Main.Manga.Txt.Fl"
	pluginName = "Text-Packlist Link Retreiver"
	tableKey   = "irc-irh"
	dbName     = settings.DATABASE_DB_NAME
	wg         = webFunctions.WebGetRobust(logPath=loggerPath+".Web")
	tableName  = "MangaItems"

	# format is ({packlist}, {channel}, {botname})
	baseUrls = [
		("http://fth-scans.com/xdcc.txt", 'halibut', '`FTH`')
	]

	def extractRow(self, row, channel, botName):
		skipFtypes = ['.mkv', '.mp4', '.avi', '.wmv']

		item = {}
		item["server"]  = "irchighway"
		item["channel"] = channel

		packno, size, filename = row
		item["pkgNum"] = packno.strip("#").strip()
		item["fName"]  = filename.strip()
		item["size"]   = size.strip()

		nameRe = re.compile("/msg (.+?) xdcc")  # currently unused; the bot name comes from baseUrls instead

		item["botName"] = botName

		# Some of these bots have videos and shit. Skip that.
		for skipType in skipFtypes:
			if item["fName"].endswith(skipType):
				return False
		return item

	def getBot(self, chanSet):
		ret = []

		botPageUrl, channel, botName = chanSet
		# print("fetching page", botPageUrl)
		page = self.wg.getpage(botPageUrl)

		rowRe = re.compile(r'^#(\d+)\W+\d*x\W+\[\W*([\d\.]+)M\]\W+?(.*)$', flags=re.MULTILINE)

		matches = rowRe.findall(page)
		for match in matches:
			item = self.extractRow(match, channel, botName)
			if not item:
				# extractRow() returns False for skipped filetypes (videos, etc.), so don't index it.
				continue
			itemKey = item["fName"]+item["botName"]
			item = json.dumps(item)
			ret.append((itemKey, item))

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s items for bot", len(ret))
		return ret

	def getMainItems(self):
		self.log.info("Loading Text-Pack Feeds")

		ret = []
		for chanSet in self.baseUrls:
			ret += self.getBot(chanSet)

		self.log.info("All data loaded")
		return ret
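# The rowRe expression in getBot() above expects packlist lines roughly of the form
# "#<pack number> <download count>x [<size>M] <filename>". Quick illustration against a
# made-up line (only the regex comes from the plugin; the sample content is hypothetical):
import re

rowRe = re.compile(r'^#(\d+)\W+\d*x\W+\[\W*([\d\.]+)M\]\W+?(.*)$', flags=re.MULTILINE)
sample = "#12  34x [ 42.7M] Some_Series_c001 [FTH].zip"
packno, size, filename = rowRe.findall(sample)[0]
# packno -> '12', size -> '42.7', filename -> 'Some_Series_c001 [FTH].zip'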
class MjContentLoader(ScrapePlugins.RetreivalBase.ScraperBase): loggerPath = "Main.Manga.Mj.Cl" pluginName = "MangaJoy Content Retreiver" tableKey = "mj" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") urlBase = "http://mangajoy.com/" retreivalThreads = 6 itemLimit = 500 # Mangajoy does recompress. Arrrgh. PHASH_THRESH = 6 def checkDelay(self, inTime): return inTime < (time.time() - 60 * 30) def getImage(self, imageUrl, referrer): content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer}) if not content or not handle: raise ValueError("Failed to retreive image from page '%s'!" % referrer) fileN = urllib.parse.unquote( urllib.parse.urlparse(handle.geturl())[2].split("/")[-1]) fileN = bs4.UnicodeDammit(fileN).unicode_markup self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0) return fileN, content def getImgUrlFromPage(self, pageSoup): imgDiv = pageSoup.find("div", class_='prw') images = imgDiv.find_all("img") images = [ image for image in images if ("manga-joy" in image['src'] or "mangajoy" in image['src']) ] for image in images: print(image) if len(images) != 1: for image in images: print("Image", image) raise ValueError("Too many images found on page!") imgUrl = images[0]["src"] return imgUrl def getImageUrls(self, firstPageUrl): pageCtnt = self.wg.getpage(firstPageUrl) soup = bs4.BeautifulSoup(pageCtnt, "lxml") if 'alt="File not found"' in pageCtnt: return [] imgUrl = self.getImgUrlFromPage(soup) selectDiv = soup.find("div", class_="wpm_nav_rdr") selector = selectDiv.find("select", style="width:45px") imUrls = set([imgUrl]) if selector: pages = selector.find_all("option") # Because people are insane, sometimes a single manga has both png and jpeg files. # Since this means that we cannot just sequentially guess the image # URLs from the number of pages, we have to actually walk every image page in the manga # to get all the proper image URLs. 
scanPages = [ "{base}{cnt}/".format(base=firstPageUrl, cnt=page["value"]) for page in pages ] for page in scanPages: pageCtnt = self.wg.getpage(page) soup = bs4.BeautifulSoup(pageCtnt, "lxml") imUrls.add(self.getImgUrlFromPage(soup)) self.log.info("Item has %s pages.", len(imUrls)) return imUrls raise ValueError("Unable to find contained images on page '%s'" % firstPageUrl) def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link['seriesName'] chapterNameRaw = link['originName'] try: self.log.info("Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) imageUrls = self.getImageUrls(sourceUrl) if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s'", seriesName, chapterNameRaw) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() chapterName = nt.makeFilenameSafe(chapterNameRaw) fqFName = os.path.join(dlPath, chapterName + "[MangaJoy].zip") loop = 1 while os.path.exists(fqFName): fqFName, ext = os.path.splitext(fqFName) fqFName = "%s (%d)%s" % (fqFName, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] for imgUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, sourceUrl) images.append([imageName, imageContent]) if not runStatus.run: self.log.info("Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterNameRaw, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload( seriesName, fqFName, deleteDups=True, includePHash=True, phashThresh=self.PHASH_THRESH) self.log.info("Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterNameRaw, tags=dedupState) return except psycopg2.OperationalError: self.log.info("Database issue?") raise except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1) def go(self): todo = self.retreiveTodoLinksFromDB() if not runStatus.run: return self.processTodoLinks(todo)
class DbLoader(ScrapePlugins.LoaderBase.LoaderBase): dbName = settings.DATABASE_DB_NAME loggerPath = "Main.Manga.DoujinOnline.Fl" pluginName = "DoujinOnline Link Retreiver" tableKey = "dol" urlBase = "https://doujinshi.online/" wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web") tableName = "HentaiItems" def loadFeed(self, pageOverride=None): self.log.info("Retreiving feed content...",) if not pageOverride: pageOverride = 1 urlPath = '/page/{num}/'.format(num=pageOverride) sourceUrl = urllib.parse.urljoin(self.urlBase, urlPath) page = self.wg.getSoup(sourceUrl) return page def parseLinkDiv(self, linkdiv): dated = linkdiv.find("div", class_="dou-date") titled = linkdiv.find("div", class_="dou-title") langd = linkdiv.find("div", class_="lang-icon") if not all([langd, titled, dated]): return if not langd.img: return if not langd.img['src'].endswith("en.png"): return ret = {} ret["originName"] = titled.get_text().strip() ret["sourceUrl"] = urllib.parse.urljoin(self.urlBase, titled.a["href"]) pdate = parser.parse(dated.get_text()) ret["retreivalTime"] = calendar.timegm(pdate.utctimetuple()) # print("ret = ", ret) # print(pdate, dated.get_text()) # return return ret def getFeed(self, pageOverride=[None]): # for item in items: # self.log.info(item) # # self.wg.stepThroughCloudFlare("https://DoujinOnline.la/", titleContains="DoujinOnline.la") ret = [] for x in pageOverride: soup = self.loadFeed(x) doujinLink = soup.find_all("div", class_="dou-list") for linkLi in doujinLink: tmp = self.parseLinkDiv(linkLi) if tmp: ret.append(tmp) return ret
class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase): dbName = settings.DATABASE_DB_NAME loggerPath = "Main.Manga.CrunchyRoll.Cl" pluginName = "CrunchyRoll Content Retreiver" tableKey = "cr" urlBase = "http://www.crunchyroll.com/" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") tableName = "MangaItems" retreivalThreads = 1 def getChapterId(self, apiServer, seriesId, wantChapNum): # >>> urllib.parse.urlsplit('http://api-manga.crunchyroll.com/list_chapters?series%5Fid=181&user%5Fid=null') # SplitResult(scheme='http', netloc='api-manga.crunchyroll.com', path='/list_chapters', query='series%5Fid=181&user%5Fid=null', fragment='') query = {"series_id": seriesId, "user_id": 'null'} query = urllib.parse.urlencode(query) # Crunchyroll seems to be (unnecessarily) urlescaping the underscores in the query parameters of # their AJAX request urls. Mimic that behaviour query = query.replace("_", '%5F') params = ("http", apiServer, '/list_chapters', '', query, '') url = urllib.parse.urlunparse(params) seriesInfo = self.wg.getpage(url) if not seriesInfo: return [] seriesInfo = json.loads(seriesInfo) ret = None for chapter in seriesInfo['chapters']: if chapter['viewable']: if not "locale" in chapter: continue if not 'enUS' in chapter["locale"]: continue if not 'name' in chapter["locale"]['enUS']: continue if chapter['number'] == wantChapNum: ret = (chapter['chapter_id'], chapter["locale"]['enUS']['name']) return ret def getChapterData(self, apiServer, chapterId, sessionId): # http://api-manga.crunchyroll.com/list_chapter?chapter%5Fid=6507&auth=null&session%5Fid=4q5akot51gbglzior4wxdjdqbxzhkwgd # >>> urllib.parse.urlsplit('http://api-manga.crunchyroll.com/list_chapters?series%5Fid=181&user%5Fid=null') # SplitResult(scheme='http', netloc='api-manga.crunchyroll.com', path='/list_chapters', query='series%5Fid=181&user%5Fid=null', fragment='') query = { "chapter_id": chapterId, "session_id": sessionId, "user_id": 'null', "auth": 'null' } query = urllib.parse.urlencode(query) # Crunchyroll seems to be (unnecessarily) urlescaping the underscores in the query parameters of # their AJAX request urls. Mimic that behaviour query = query.replace("_", '%5F') params = ("http", apiServer, '/list_chapter', '', query, '') url = urllib.parse.urlunparse(params) chapterInfo = self.wg.getpage(url) if not chapterInfo: return [] chapterInfo = json.loads(chapterInfo) imageUrls = [] # so there is a field in the json data named 'page_number'. However, # it seems to be almost always set to 0. Yeeeeeah..... # Theres a lot of other shit in the JSON as well. There are # cleaned pages (no typsetting), polygon extents (for client-side typesetting?) # etc... pageno = 1 for page in chapterInfo['pages']: url = page['locale']['enUS']['encrypted_composed_image_url'] if url == None or url == 'null': raise ValueError("Item has null URLs?") imageUrls.append((pageno, url)) pageno += 1 return imageUrls def fetchImageUrls(self, soup): flashConf = soup.find('param', attrs={'name': 'flashvars'}) if not flashConf: return False conf = dict(urllib.parse.parse_qsl(flashConf['value'])) apiServer = conf['server'] chapInfo = self.getChapterId(apiServer, conf['seriesId'], conf['chapterNumber']) if not chapInfo: return False chapterId, chapterName = chapInfo chapImages = self.getChapterData(apiServer, chapterId, conf['session_id']) ret = [] for imNum, url in chapImages: # AFICT, they /only/ use jpeg. # Realistically, I don't care (all internal stuff autodetects), # but it'd be nice to have the correct extensions. Assume jpeg for the moment. 
fname = 'img {num:05d}.jpeg'.format(num=imNum) ret.append((fname, url)) return ret, chapterName, conf['chapterNumber'] def getDownloadInfo(self, linkDict, retag=False): sourcePage = linkDict["sourceUrl"] self.log.info("Retreiving item: %s", sourcePage) try: soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': self.urlBase}) except: self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"]) raise IOError("Invalid webpage") dlPath, newDir = self.locateOrCreateDirectoryForSeries( linkDict['seriesName']) linkDict['dirPath'] = dlPath if newDir: if not linkDict["flags"]: linkDict["flags"] = '' self.updateDbEntry(sourcePage, flags=" ".join([linkDict["flags"], "haddir"])) self.conn.commit() if not os.path.exists(linkDict["dirPath"]): os.makedirs(linkDict["dirPath"]) else: self.log.info("Folder Path already exists?: %s", linkDict["dirPath"]) self.log.info("Folderpath: %s", linkDict["dirPath"]) #self.log.info(os.path.join()) urls = self.fetchImageUrls(soup) if not urls: return False imageUrls, linkDict["originName"], linkDict["chapterNo"] = urls linkDict["dlLinks"] = imageUrls self.log.info("Found %s images in manga.", len(imageUrls)) self.log.debug("Linkdict = ") for key, value in list(linkDict.items()): self.log.debug(" %s - %s", key, value) return linkDict def getImage(self, imageUrl): # the image URL format seems to be '{hash of some sort}_{creation timestamp}_main' # I checked a few common hash algos, the hash is not a pre/post decryption md5, nor sha1 content = self.wg.getpage( imageUrl, addlHeaders={ 'Referer': ' http://www.crunchyroll.com/swf/MangaViewer.swf?1' }) if not content: raise ValueError("Failed to retreive image from page '%s'!" % imageUrl) self.log.info("retreived file with a size of %0.3f K", len(content) / 1000.0) # "decrypt" the file. By XORing with 0x42. # Yeeeeeah. "Security" content = bytearray(content) for x in range(len(content)): content[x] = content[x] ^ 0x42 content = bytes(content) return content def fetchImages(self, linkDict): images = [] for filename, imgUrl in linkDict["dlLinks"]: images.append((filename, self.getImage(imgUrl))) return images def doDownload(self, linkDict, retag=False): images = self.fetchImages(linkDict) # images = ['wat'] # print(linkDict) # self.log.info(len(content)) if images: linkDict["chapterNo"] = float(linkDict["chapterNo"]) fileN = '{series} - c{chapNo:06.1f} - {sourceName} [crunchyroll].zip'.format( series=linkDict['seriesName'], chapNo=linkDict["chapterNo"], sourceName=linkDict['originName']) fileN = nt.makeFilenameSafe(fileN) # self.log.info("geturl with processing", fileN) wholePath = os.path.join(linkDict["dirPath"], fileN) self.log.info("Complete filepath: %s", wholePath) #Write all downloaded files to the archive. 
arch = zipfile.ZipFile(wholePath, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() self.log.info("Successfully Saved to path: %s", wholePath) if not linkDict["tags"]: linkDict["tags"] = "" dedupState = processDownload.processDownload( linkDict["seriesName"], wholePath, deleteDups=True) self.log.info("Done") if dedupState: self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState) self.updateDbEntry(linkDict["sourceUrl"], dlState=2, downloadPath=linkDict["dirPath"], fileName=fileN, originName=fileN) self.conn.commit() return wholePath else: self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED") self.conn.commit() return False def getLink(self, link): try: self.updateDbEntry(link["sourceUrl"], dlState=1) linkInfo = self.getDownloadInfo(link) if linkInfo: self.doDownload(linkInfo) else: self.updateDbEntry(link["sourceUrl"], dlState=0) except urllib.error.URLError: self.log.error("Failure retreiving content for link %s", link) self.log.error("Traceback: %s", traceback.format_exc())
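# The getImage() method in the CrunchyRoll loader above "decrypts" page images by XORing every
# byte with 0x42. Since XOR with a constant key is its own inverse, one helper both decodes and
# re-encodes a buffer. Minimal standalone sketch of that transform (not part of the class itself):
def xor_decode(data: bytes, key: int = 0x42) -> bytes:
	"""Return 'data' with every byte XORed against 'key' (0x42 for the CrunchyRoll manga images)."""
	return bytes(b ^ key for b in data)

# xor_decode(xor_decode(raw)) == raw, so applying it twice round-trips the original bytes.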
class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase): loggerPath = "Main.Manga.Mp.Cl" pluginName = "MangaPark Content Retreiver" tableKey = "mp" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") retreivalThreads = 2 def retreiveTodoLinksFromDB(self): self.log.info("Fetching items from db...", ) rows = self.getRowsByValue(dlState=0) self.log.info("Done") if not rows: return items = [] for item in rows: item["retreivalTime"] = time.gmtime(item["retreivalTime"]) items.append(item) self.log.info("Have %s new items to retreive in BtDownloader" % len(items)) items = sorted(items, key=lambda k: k["retreivalTime"], reverse=True) return items def getLinkFile(self, fileUrl): pgctnt, pghandle = self.wg.getpage( fileUrl, returnMultiple=True, addlHeaders={'Referer': "http://manga.cxcscans.com/directory/"}) pageUrl = pghandle.geturl() hName = urllib.parse.urlparse(pageUrl)[2].split("/")[-1] self.log.info( "HName: %s", hName, ) self.log.info("Size = %s", len(pgctnt)) return pgctnt, hName def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link["seriesName"] originFileName = link["originName"] self.updateDbEntry(sourceUrl, dlState=1) self.log.info("Downloading = '%s', '%s'", seriesName, originFileName) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() try: content, headerName = self.getLinkFile(sourceUrl) except: self.log.error("Unrecoverable error retreiving content %s", link) self.log.error("Traceback: %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1) return headerName = urllib.parse.unquote(headerName) fName = "%s - %s" % (originFileName, headerName) fName = nt.makeFilenameSafe(fName) fName, ext = os.path.splitext(fName) fName = "%s [CXC Scans]%s" % (fName, ext) fqFName = os.path.join(dlPath, fName) self.log.info("SaveName = %s", fqFName) loop = 1 while os.path.exists(fqFName): fName, ext = os.path.splitext(fName) fName = "%s (%d)%s" % (fName, loop, ext) fqFName = os.path.join(link["targetDir"], fName) loop += 1 self.log.info("Writing file") filePath, fileName = os.path.split(fqFName) try: with open(fqFName, "wb") as fp: fp.write(content) except TypeError: self.log.error("Failure trying to retreive content from source %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName) return #self.log.info( filePath) dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True) self.log.info("Done") self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState) return def go(self): todo = self.retreiveTodoLinksFromDB() if not runStatus.run: return self.processTodoLinks(todo)
class ContentLoader(ScrapePlugins.RetreivalBase.RetreivalBase): loggerPath = "Main.Manga.Ki.Cl" pluginName = "Kiss Manga Content Retreiver" tableKey = "ki" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") retreivalThreads = 3 itemLimit = 200 def check_recaptcha(self, pgurl, soup=None, markup=None): if markup: soup = webFunctions.as_soup(markup) if not soup: raise RuntimeError( "You have to pass either the raw page markup, or a pre-parsed bs4 soup object!" ) capdiv = soup.find("div", class_='g-recaptcha') if not capdiv: if markup: return markup return soup raise ScrapeExceptions.LimitedException( "Encountered ReCaptcha! Cannot circumvent!") self.log.warning("Found ReCaptcha div. Need to circumvent.") sitekey = capdiv['data-sitekey'] # soup.find("") params = { 'key': settings.captcha_solvers['2captcha']['api_key'], 'method': 'userrecaptcha', 'googlekey': sitekey, 'pageurl': pgurl, 'json': 1, } # self.wg.getJson("https://2captcha.com/in.php", postData=params) # # here we post site key to 2captcha to get captcha ID (and we parse it here too) # captcha_id = s.post("?key={}&method=userrecaptcha&googlekey={}&pageurl={}".format(API_KEY, site_key, url), proxies=proxy).text.split('|')[1] # # then we parse gresponse from 2captcha response # recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id), proxies=proxy).text # print("solving ref captcha...") # while 'CAPCHA_NOT_READY' in recaptcha_answer: # sleep(5) # recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id), proxies=proxy).text # recaptcha_answer = recaptcha_answer.split('|')[1] # # we make the payload for the post data here, use something like mitmproxy or fiddler to see what is needed # payload = { # 'key': 'value', # 'gresponse': recaptcha_answer # This is the response from 2captcha, which is needed for the post request to go through. # } resolved = { "reUrl": "/Manga/Love-Lab-MIYAHARA-Ruri/Vol-010-Ch-001?id=359632", "g-recaptcha-response": "03AOP2lf5kLccgf5aAkMmzXR8mN6Kv6s76BoqHIv-raSzGCa98HMPMdx0n04ourhM1mBApnesMRbzr2vFa0264mY83SCkL5slCFcC-i3uWJoHIjVhGh0GN4yyswg5-yZpDg1iK882nPuxEeaxb18pOK790x4Z18ib5UOPGU-NoECVb6LS03S3b4fCjWwRDLNF43WhkHDFd7k-Os7ULCgOZe_7kcF9xbKkovCh2uuK0ytD7rhiKnZUUvl1TimGsSaFkSSrQ1C4cxZchVXrz7kIx0r6Qp2hPr2_PW0CAutCkmr9lt9TS5n0ecdVFhdVQBniSB-NZv9QEpbQ8", } # # then send the post request to the url # response = s.post(url, payload, proxies=proxy) def getImage(self, imageUrl, referrer): content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer}) if not content or not handle: raise ValueError("Failed to retreive image from page '%s'!" % referrer) fileN = urllib.parse.unquote( urllib.parse.urlparse(handle.geturl())[2].split("/")[-1]) fileN = bs4.UnicodeDammit(fileN).unicode_markup self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0) if not "." in fileN: info = handle.info() if 'Content-Type' in info: tp = info['Content-Type'] if ";" in tp: tp = tp.split(";")[0] ext = guess_extension(tp) if ext == None: ext = "unknown_ftype" print(info['Content-Type'], ext) fileN += "." + ext else: fileN += ".jpg" # Let magic figure out the files for us (it's probably smarter then kissmanga, anyways.) 
		guessed = magic.from_buffer(content, mime=True)
		ext = guess_extension(guessed)  # was guess_extension(tp); 'tp' is only bound in the Content-Type branch above
		if ext:
			fileN = fileN + ext

		return fileN, content

	def getImageUrls(self, baseUrl):
		pgctnt, filename, mimetype = self.wg.getItemPhantomJS(baseUrl)
		pgctnt = self.check_recaptcha(pgurl=baseUrl, markup=pgctnt)

		linkRe = re.compile(r'lstImages\.push\((wrapKA\(".+?"\))\);')
		links = linkRe.findall(pgctnt)

		pages = []
		for item in links:
			tgt = self.wg.pjs_driver.execute_script("return %s" % item)
			if not tgt.startswith("http"):
				raise ScrapeExceptions.LimitedException("URL Decryption failed!")
			pages.append(tgt)

		self.log.info("Found %s pages", len(pages))

		return pages

	# Don't download items for 12 hours after release,
	# so that other, (better) sources can potentially host
	# the items first.
	def checkDelay(self, inTime):
		return inTime < (time.time() - 60 * 60 * 12)

	def getLink(self, link):
		sourceUrl  = link["sourceUrl"]
		print("Link", link)
		seriesName = link['seriesName']

		try:
			self.log.info("Should retreive url - %s", sourceUrl)
			self.updateDbEntry(sourceUrl, dlState=1)

			imageUrls = self.getImageUrls(sourceUrl)
			if not imageUrls:
				self.log.critical("Failure on retreiving content at %s", sourceUrl)
				self.log.critical("Page not found - 404")
				self.updateDbEntry(sourceUrl, dlState=-1)
				return

			self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, link["originName"], len(imageUrls))

			dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

			if link["flags"] == None:
				link["flags"] = ""

			if newDir:
				self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))

			chapterName = nt.makeFilenameSafe(link["originName"])

			fqFName = os.path.join(dlPath, chapterName + " [KissManga].zip")

			loop = 1
			prefix, ext = os.path.splitext(fqFName)
			while os.path.exists(fqFName):
				fqFName = "%s (%d)%s" % (prefix, loop, ext)
				loop += 1
			self.log.info("Saving to archive = %s", fqFName)

			images = []
			imgCnt = 1
			for imgUrl in imageUrls:
				imageName, imageContent = self.getImage(imgUrl, sourceUrl)
				imageName = "{num:03.0f} - {srcName}".format(num=imgCnt, srcName=imageName)
				imgCnt += 1
				images.append([imageName, imageContent])

				if not runStatus.run:
					self.log.info("Breaking due to exit flag being set")
					self.updateDbEntry(sourceUrl, dlState=0)
					return

			self.log.info("Creating archive with %s images", len(images))

			if not images:
				self.updateDbEntry(sourceUrl, dlState=-1, tags="error-404")
				return

			# Write all downloaded files to the archive.
			arch = zipfile.ZipFile(fqFName, "w")
			for imageName, imageContent in images:
				arch.writestr(imageName, imageContent)
			arch.close()

			dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True, rowId=link['dbId'])

			self.log.info("Done")

			filePath, fileName = os.path.split(fqFName)
			self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
			return

		except SystemExit:
			print("SystemExit!")
			raise

		except Exception:
			self.log.critical("Failure on retreiving content at %s", sourceUrl)
			self.log.critical("Traceback = %s", traceback.format_exc())
			self.updateDbEntry(sourceUrl, dlState=-1)

	def setup(self):
		'''
		poke through cloudflare
		'''
		if not self.wg.stepThroughCloudFlare("http://kissmanga.com", 'KissManga'):
			raise ValueError("Could not access site due to cloudflare protection.")
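# check_recaptcha() above contains a disabled, commented-out sketch of pushing the ReCaptcha
# sitekey through 2captcha's "userrecaptcha" endpoints before it gives up and raises
# LimitedException. A rough, untested outline of that flow, reconstructed from the commented
# block (the use of the requests library, timing, and error handling are assumptions, not
# project code):
import time
import requests

def solve_recaptcha_via_2captcha(api_key, site_key, page_url):
	# Submit the sitekey; the legacy API answers "OK|<captcha id>" on success.
	resp = requests.post("http://2captcha.com/in.php", data={
		'key': api_key,
		'method': 'userrecaptcha',
		'googlekey': site_key,
		'pageurl': page_url,
	})
	captcha_id = resp.text.split('|')[1]

	# Poll until a worker has produced a g-recaptcha-response token ("OK|<token>").
	while True:
		time.sleep(5)
		answer = requests.get("http://2captcha.com/res.php",
		                      params={'key': api_key, 'action': 'get', 'id': captcha_id})
		if 'CAPCHA_NOT_READY' not in answer.text:
			return answer.text.split('|')[1]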
class BuWatchMonitor(TextScrape.NovelMixin.NovelMixin, ScrapePlugins.MonitorDbBase.MonitorDbBase): loggerPath = "Main.Manga.Bu.Watcher" pluginName = "BakaUpdates List Monitor" tableName = "MangaSeries" nameMapTableName = "muNameList" changedTableName = "muItemChanged" itemReleases = "muReleases" baseURL = "http://www.mangaupdates.com/" baseListURL = r"http://www.mangaupdates.com/mylist.html" baseReleasesURL = r"https://www.mangaupdates.com/releases.html" dbName = settings.DATABASE_DB_NAME wgH = webFunctions.WebGetRobust(logPath=loggerPath+".Web") # ----------------------------------------------------------------------------------- # Login Management tools # ----------------------------------------------------------------------------------- def checkLogin(self): checkPage = self.wgH.getpage(self.baseListURL) if "You must be a user to access this page." in checkPage: self.log.info("Whoops, need to get Login cookie") else: self.log.info("Still logged in") return logondict = {"username" : settings.buSettings["login"], "password" : settings.buSettings["passWd"], "act" : "login"} getPage = self.wgH.getpage(r"http://www.mangaupdates.com/login.html", postData=logondict) if "No user found, or error. Try again." in getPage: self.log.error("Login failed!") raise ValueError("Cannot login to MangaUpdates. Is your login/password valid?") elif "You are currently logged in as" in getPage: self.log.info("Logged in successfully!") self.wgH.saveCookies() # ----------------------------------------------------------------------------------- # Management Stuff # ----------------------------------------------------------------------------------- def go(self): self.checkLogin() self.scanRecentlyUpdated() lists = self.getListNames() for listName, listURL in lists.items(): self.updateUserListNamed(listName, listURL) if not runStatus.run: self.log.info( "Breaking due to exit flag being set") break def getListNames(self): self.checkLogin() listDict = {} listDict["Reading"] = r"http://www.mangaupdates.com/mylist.html" # The reading list is not specifically named. pageCtnt = self.wgH.getpage(self.baseListURL) soup = bs4.BeautifulSoup(pageCtnt) add_seriesSegment = soup.find("div", id="add_series") listList = add_seriesSegment.find_previous_sibling("p", class_="text") for item in listList("a"): if "mylist.html" in item["href"] and not "act=edit" in item["href"]: # We don't want the "edit lists" list option. 
listDict[item.text] = item["href"] self.log.info("Retrieved %d lists", len(listDict)) for key, value in listDict.items(): self.log.debug("List name: %s, URL: %s", value, key) return listDict # ----------------------------------------------------------------------------------- # Series List scraping # ----------------------------------------------------------------------------------- def extractRow(self, row, listName): nameSegment = row.find("td", class_="lcol_nopri") if nameSegment: currentChapter = -1 link = nameSegment.find("a")["href"] mangaName = nameSegment.find("a").string urlParsed = urllib.parse.urlparse(link) if nameSegment.find("span"): chapInfo = nameSegment.find("span").string currentChapter = toInt(chapInfo) readSegment = row.find("td", class_=re.compile("lcol4")).find("a", title="Increment Chapter") if readSegment: readChapter = toInt(readSegment.string) elif listName == "Complete": readChapter = -2 else: readChapter = -1 # Update the novel information (if it exists) self.updateNovelAvailable(mangaName, currentChapter) self.updateNovelRead(mangaName, readChapter) seriesID = toInt(urlParsed.query) listName = listName.replace("\u00A0"," ") # self.log.debug("Item info = seriesID=%s, currentChapter=%s, readChapter=%s, mangaName=%s, listName=%s", seriesID, currentChapter, readChapter, mangaName, listName) # Try to match new item by both ID and name. haveRow = self.getRowsByValue(buId=seriesID) haveRow2 = self.getRowsByValue(buName=mangaName) if not haveRow: haveRow = haveRow2 if haveRow and haveRow2: if haveRow[0]["buId"] != haveRow2[0]["buId"]: print("WAT") print(haveRow[0]["buId"]) print(haveRow2[0]["buId"]) if haveRow: # print("HaveRow = ", haveRow) haveRow = haveRow.pop() self.updateDbEntry(haveRow["dbId"], commit=False, buName=mangaName, buList=listName, availProgress=currentChapter, readingProgress=readChapter, buId=seriesID) else: # ["mtList", "buList", "mtName", "mdId", "mtTags", "buName", "buId", "buTags", "readingProgress", "availProgress", "rating", "lastChanged"] self.insertIntoDb(commit=False, buName=mangaName, buList=listName, availProgress=currentChapter, readingProgress=readChapter, buId=seriesID, lastChanged=0, lastChecked=0, itemAdded=time.time()) return 1 return 0 def updateUserListNamed(self, listName, listURL): pageCtnt = self.wgH.getpage(listURL) soup = bs4.BeautifulSoup(pageCtnt) itemTable = soup.find("table", id="list_table") itemCount = 0 if not itemTable: self.log.warn("Could not find table?") self.log.warn("On page '%s'", listURL) return for row in itemTable.find_all("tr"): itemCount += self.extractRow(row, listName) listTotalNo = toInt(soup.find("div", class_="low_col1").text) if itemCount != listTotalNo: self.log.error("Invalid list reported length! 
Items from page: %d, found items %d", listTotalNo, itemCount) self.conn.commit() self.log.info("Properly processed all items in list!") # def scanRecentlyUpdated(self): ONE_DAY = 60*60*24 releases = self.wgH.getpage(self.baseReleasesURL) soup = bs4.BeautifulSoup(releases) content = soup.find("td", {"id": "main_content"}) titles = content.find_all("p", class_="titlesmall") for title in titles: date = title.get_text() date = dateutil.parser.parse(date, fuzzy=True) table = title.find_next_sibling("div").table for row in table.find_all("tr"): link = row.find("a", title="Series Info") # Need to skip rows with no links, (they're the table header) if link: mId = link["href"].split("=")[-1] haveRow = self.getRowByValue(buId=mId) if haveRow: checked = self.getLastCheckedFromId(mId) if checked + ONE_DAY < time.time(): self.log.info("Need to check item for id '%s'", mId) self.updateLastCheckedFromId(mId, 0) # Set last checked to zero, to force the next run to update the item # print("Checked, ", checked+ONE_DAY, "time", time.time()) # print("row", self.getRowByValue(buId=mId)) else: name = link.get_text() self.insertBareNameItems([(name, mId)]) self.log.info("New series! '%s', id '%s'", name, mId) # ----------------------------------------------------------------------------------- # General MangaUpdate mirroring tool (enqueues ALL the manga items!) # ----------------------------------------------------------------------------------- def getSeriesFromPage(self, soup): itemTable = soup.find("table", class_="series_rows_table") rows = [] for row in itemTable.find_all("tr"): if not row.find("td", class_="text"): continue tds = row.find_all("td") if len(tds) != 4: continue title, dummy_genre, dummy_year, dummy_rating = tds mId = title.a["href"].replace('https://www.mangaupdates.com/series.html?id=', '') # print("title", title.get_text(), mId) try: int(mId) rows.append((title.get_text(), mId)) except ValueError: self.log.critical("Could not extract ID? TitleTD = %s", title) return rows # TODO: Schedule this occationally def getAllManga(self): urlFormat = 'https://www.mangaupdates.com/series.html?page={page}&perpage=100' self.log.info("MU Updater scanning MangaUpdates to get all available manga.") run = 456 while run: url = urlFormat.format(page=run) run += 1 soup = self.wgH.getSoup(url) series = self.getSeriesFromPage(soup) if series: self.log.info("Inserting %s items into name DB", len(series)) self.insertBareNameItems(series) if len(series) == 0: self.log.info("No items found. At the end of the series list?") run = 0 if not runStatus.run: self.log.info( "Breaking due to exit flag being set") break self.log.info("Completed scanning all manga items.")
class FeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase): loggerPath = "Main.Manga.Mp.Fl" pluginName = "MangaPark Link Retreiver" tableKey = "mp" dbName = settings.DATABASE_DB_NAME wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") tableName = "MangaItems" urlBase = "http://mangapark.com/" feedUrl = "http://mangapark.com/latest" def checkMatureAgree(self, page, url): if "This series contains mature contents" in page: self.log.info("Need to step through mature agreement page.") page = self.wg.getpage(url, postData={"adult": "true"}) return page def getItemPages(self, info): url, series = info # print("Should get item for ", url) page = self.wg.getpage(url) page = self.checkMatureAgree(page, url) soup = bs4.BeautifulSoup(page, "lxml") main = soup.find("section", class_='manga') series = main.find("div", class_="hd") container = soup.find("div", class_="book-list") seriesName = series.get_text().strip() if seriesName.endswith(" Manga"): seriesName = seriesName[:-1 * len(" Manga")] segmentDivs = container.find_all("div", class_="stream", recursive=False) ret = [] for segment in segmentDivs: chaps = segment.find_all("li", id=re.compile(r"b-\d+")) for chap in chaps: dlLink = chap.find("a", class_="ch")["href"] dlTitle = chap.find("span").get_text().strip() dlTitle = dlTitle.replace( ":", " -") # Can't have colons in filenames # print("dlLink", dlLink, dlTitle) item = {} date = dateutil.parser.parse(chap.i.get_text(), fuzzy=True) item["originName"] = "{series} - {file}".format( series=seriesName, file=dlTitle) item["sourceUrl"] = dlLink item["seriesName"] = seriesName item["retreivalTime"] = calendar.timegm(date.timetuple()) ret.append(item) return ret def getSeriesUrls(self): ret = [] soup = self.wg.getSoup(self.feedUrl) content = soup.find('div', class_='ls1') divs = content.find_all("div", class_="item") for div in divs: # First a in the div is the title image url = div.a["href"] url = urllib.parse.urljoin(self.urlBase, url) text = div.a['title'] ret.append((url, text)) return ret def getAllItems(self): # for item in items: # self.log.info( item) # self.log.info("Loading Mc Items") ret = [] seriesPages = self.getSeriesUrls() for item in seriesPages: itemList = self.getItemPages(item) for item in itemList: ret.append(item) if not runStatus.run: self.log.info("Breaking due to exit flag being set") break self.log.info("Found %s total items", len(ret)) return ret def go(self): self.resetStuckItems() self.log.info("Getting feed items") feedItems = self.getAllItems() self.log.info("Processing feed Items") self.processLinksIntoDB(feedItems) self.log.info("Complete")
class PururinDbLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	dbName     = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.Pururin.Fl"
	pluginName = "Pururin Link Retreiver"
	tableKey   = "pu"
	urlBase    = "http://pururin.com/"
	wg         = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName  = "HentaiItems"

	def loadFeed(self, pageOverride=None):
		self.log.info("Retreiving feed content...")
		if not pageOverride:
			pageOverride = 1
		try:
			# I really don't get the logic behind Pururin's path scheme.
			if pageOverride > 1:
				urlPath = '/browse/0/1{num1}/{num2}.html'.format(num1=pageOverride - 1, num2=pageOverride)
				pageUrl = urllib.parse.urljoin(self.urlBase, urlPath)
			else:
				# First page is just the bare URL. It /looks/ like they're blocking the root page by direct path.
				pageUrl = self.urlBase
			print("Fetching page at", pageUrl)
			page = self.wg.getpage(pageUrl)
		except urllib.error.URLError:
			self.log.critical("Could not get page from Pururin!")
			self.log.critical(traceback.format_exc())
			return ""
		return page

	def parseLinkLi(self, linkLi):
		ret = {}
		# Messy hack to replace <br> tags with ' / ', rather than just removing them.
		ret["dlName"]  = " / ".join(linkLi.h2.strings)
		ret["pageUrl"] = urllib.parse.urljoin(self.urlBase, linkLi.a["href"])
		return ret

	def getFeed(self, pageOverride=None):
		# for item in items:
		# 	self.log.info(item)

		page = self.loadFeed(pageOverride)
		soup = bs4.BeautifulSoup(page)

		mainSection = soup.find("ul", class_="gallery-list")
		doujinLink = mainSection.find_all("li", class_="gallery-block")

		ret = []
		for linkLi in doujinLink:
			tmp = self.parseLinkLi(linkLi)
			ret.append(tmp)

		return ret

	def processLinksIntoDB(self, linksDict):
		self.log.info("Inserting...")
		newItemCount = 0
		for link in linksDict:
			row = self.getRowsByValue(sourceUrl=link["pageUrl"])
			if not row:
				curTime = time.time()
				self.insertIntoDb(retreivalTime=curTime,
									sourceUrl=link["pageUrl"],
									originName=link["dlName"],
									dlState=0)
				# cur.execute('INSERT INTO fufufuu VALUES(?, ?, ?, "", ?, ?, "", ?);',(link["date"], 0, 0, link["dlLink"], link["itemTags"], link["dlName"]))
				newItemCount += 1  # previously never incremented, so the function always reported 0 new items
				self.log.info("New item: %s", (curTime, link["pageUrl"], link["dlName"]))
		self.log.info("Done")

		self.log.info("Committing...")
		self.conn.commit()
		self.log.info("Committed")

		return newItemCount

	def go(self):
		self.resetStuckItems()
		dat = self.getFeed()
		self.processLinksIntoDB(dat)
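# Worked example of the loadFeed() path scheme above: page 1 fetches the bare urlBase, while
# page N (N > 1) maps to '/browse/0/1{N-1}/{N}.html', e.g.
#   pageOverride = 2  ->  http://pururin.com/browse/0/11/2.html
#   pageOverride = 3  ->  http://pururin.com/browse/0/12/3.html
#   pageOverride = 10 ->  http://pururin.com/browse/0/19/10.html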
class McContentLoader(ScrapePlugins.RetreivalBase.ScraperBase): loggerPath = "Main.Manga.Mc.Cl" pluginName = "MangaCow Content Retreiver" tableKey = "mc" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") retreivalThreads = 4 def getImage(self, imageUrl, referrer): content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer}) if not content or not handle: raise ValueError("Failed to retreive image from page '%s'!" % referrer) fileN = urllib.parse.unquote( urllib.parse.urlparse(handle.geturl())[2].split("/")[-1]) fileN = bs4.UnicodeDammit(fileN).unicode_markup self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0) return fileN, content def getImageUrls(self, baseUrl): pageCtnt = self.wg.getpage(baseUrl) soup = bs4.BeautifulSoup(pageCtnt) selector = soup.find("select", class_="cbo_wpm_pag") if not selector: raise ValueError("Unable to find contained images on page '%s'" % baseUrl) pageNumbers = [] for value in selector.find_all("option"): pageNumbers.append(int(value.get_text())) if not pageNumbers: raise ValueError("Unable to find contained images on page '%s'" % baseUrl) pageUrls = [] for pageNo in pageNumbers: pageUrls.append("{baseUrl}{num}/".format(baseUrl=baseUrl, num=pageNo)) # print("PageUrls", pageUrls) imageUrls = [] for pageUrl in pageUrls: pageCtnt = self.wg.getpage(pageUrl) soup = bs4.BeautifulSoup(pageCtnt) imageContainer = soup.find("div", class_="prw") url = imageContainer.img["src"] # print("Urls - ", (url, pageUrl)) imageUrls.append((url, pageUrl)) return imageUrls def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link["seriesName"] chapterVol = link["originName"] try: self.log.info("Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) imageUrls = self.getImageUrls(sourceUrl) if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s'", seriesName, chapterVol) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() chapterName = nt.makeFilenameSafe(chapterVol) fqFName = os.path.join(dlPath, chapterName + "[MangaCow].zip") loop = 1 while os.path.exists(fqFName): fqFName, ext = os.path.splitext(fqFName) fqFName = "%s (%d)%s" % (fqFName, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] for imgUrl, referrerUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, referrerUrl) images.append([imageName, imageContent]) if not runStatus.run: self.log.info("Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404") return #Write all downloaded files to the archive. 
arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True, phashThresh=6) self.log.info("Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1)
class DbLoader(ScrapePlugins.LoaderBase.LoaderBase): dbName = settings.DATABASE_DB_NAME loggerPath = "Main.Manga.Hitomi.Fl" pluginName = "Hitomi Link Retreiver" tableKey = "hit" urlBase = "https://hitomi.la/" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") tableName = "HentaiItems" def loadFeed(self, pageOverride=None): self.log.info("Retreiving feed content...", ) if not pageOverride: pageOverride = 1 try: urlPath = '/index-all-{num}.html'.format(num=pageOverride) sourceUrl = urllib.parse.urljoin(self.urlBase, urlPath) page = self.wg.getSoup(sourceUrl) except urllib.error.URLError: self.log.critical("Could not get page from Hitomi!") self.log.critical(traceback.format_exc()) return "" return page def parseLinkDiv(self, linkdiv): if not linkdiv.h1: return date = linkdiv.find("p", class_="date") if not date: return ret = {} for row in linkdiv.find_all("tr"): if not len(row("td")) == 2: continue param, val = row("td") param = param.get_text().strip() val = val.get_text().strip() if param.lower() == "language": # Only scrape english TLs and japanese language content. # This'll probably miss some other non-japanese content, # but they don't seem to have a "translated" tag. if val.lower() not in ['english']: self.log.info("Skipping item due to language being %s.", val) return None if param.lower() == "type": ret['seriesName'] = val.title() # Judge me if param.lower() == "tags": if "males only" in val.lower( ) and not "females only" in val.lower(): self.log.info( "Skipping item due to tag 'males only' (%s).", val.replace("\n", " ")) return None ret["originName"] = linkdiv.h1.get_text().strip() ret["sourceUrl"] = urllib.parse.urljoin(self.urlBase, linkdiv.h1.a["href"]) pdate = parser.parse(date.get_text()) ret["retreivalTime"] = calendar.timegm(pdate.utctimetuple()) return ret def getFeed(self, pageOverride=[None]): # for item in items: # self.log.info(item) # # self.wg.stepThroughCloudFlare("https://hitomi.la/", titleContains="Hitomi.la") ret = [] for x in pageOverride: soup = self.loadFeed(x) mainSection = soup.find("div", class_="gallery-content") doujinLink = mainSection.find_all( "div", class_=re.compile("(cg|dj|manga|acg)"), recursive=False) for linkLi in doujinLink: tmp = self.parseLinkDiv(linkLi) if tmp: ret.append(tmp) return ret
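# getFeed() in the Hitomi loader above takes pageOverride as an iterable of page numbers rather
# than a single value, so a deeper backfill can be requested by passing an explicit range.
# Illustrative usage only; how the loader is instantiated and how its results are stored depends
# on the surrounding scraper framework:
#
#   loader = DbLoader()
#   new_items = loader.getFeed(pageOverride=range(1, 21))   # walks /index-all-1.html .. /index-all-20.html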