class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase):

	dbName = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.NHentai.Cl"
	pluginName = "NHentai Content Retreiver"
	tableKey = "nh"
	urlBase = "http://nhentai.net/"
	wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web")
	tableName = "HentaiItems"
	retreivalThreads = 6

	shouldCanonize = False

	def getFileName(self, soup):
		title = soup.find("h1", class_="otitle")
		if not title:
			raise ValueError("Could not find title. Wat?")
		return title.get_text()

	def imageUrls(self, soup):
		thumbnailDiv = soup.find("div", id="thumbnail-container")
		ret = []

		for link in thumbnailDiv.find_all("a", class_='gallerythumb'):
			referrer = urllib.parse.urljoin(self.urlBase, link['href'])

			# bs4 attribute check: has_attr(), not hasattr() (which always succeeds on a Tag).
			if link.img.has_attr("data-src"):
				thumbUrl = link.img['data-src']
			else:
				thumbUrl = link.img['src']

			if "t." not in thumbUrl[-6:]:
				raise ValueError("Url is not a thumb? = '%s'" % thumbUrl)
			else:
				imgUrl = thumbUrl[:-6] + thumbUrl[-6:].replace("t.", '.')
				imgUrl = urllib.parse.urljoin(self.urlBase, imgUrl)
				imgUrl = imgUrl.replace("//t.", "//i.")
				ret.append((imgUrl, referrer))

		return ret

	def getDownloadInfo(self, linkDict, retag=False):
		sourcePage = linkDict["sourceUrl"]

		self.log.info("Retreiving item: %s", sourcePage)

		try:
			soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': self.urlBase})
		except Exception:
			self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"])
			raise IOError("Invalid webpage")

		linkDict['dirPath'] = os.path.join(settings.nhSettings["dlDir"], linkDict['seriesName'])

		if not os.path.exists(linkDict["dirPath"]):
			os.makedirs(linkDict["dirPath"])
		else:
			self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

		self.log.info("Folderpath: %s", linkDict["dirPath"])
		#self.log.info(os.path.join())

		imageUrls = self.imageUrls(soup)
		# print("Image URLS: ", imageUrls)

		linkDict["dlLinks"] = imageUrls

		self.log.debug("Linkdict = ")
		for key, value in list(linkDict.items()):
			self.log.debug(" %s - %s", key, value)

		return linkDict

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0)
		return fileN, content

	def fetchImages(self, linkDict):
		images = []
		for imgUrl, referrerUrl in linkDict["dlLinks"]:
			images.append(self.getImage(imgUrl, referrerUrl))
		return images

	def doDownload(self, linkDict, retag=False):
		images = self.fetchImages(linkDict)

		# self.log.info(len(content))

		if images:
			fileN = linkDict['originName']+".zip"
			fileN = nt.makeFilenameSafe(fileN)

			# self.log.info("geturl with processing", fileN)
			wholePath = os.path.join(linkDict["dirPath"], fileN)
			self.log.info("Complete filepath: %s", wholePath)

			#Write all downloaded files to the archive.
			chop = len(fileN)-4
			wholePath = "ERROR"
			while 1:
				try:
					fileN = fileN[:chop]+fileN[-4:]
					# self.log.info("geturl with processing", fileN)
					wholePath = os.path.join(linkDict["dirPath"], fileN)
					self.log.info("Complete filepath: %s", wholePath)

					#Write all downloaded files to the archive.
					arch = zipfile.ZipFile(wholePath, "w")
					for imageName, imageContent in images:
						arch.writestr(imageName, imageContent)
					arch.close()

					self.log.info("Successfully Saved to path: %s", wholePath)
					break
				except IOError:
					chop = chop - 1
					self.log.warn("Truncating file length to %s characters.", chop)

			if not linkDict["tags"]:
				linkDict["tags"] = ""

			self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

			# Deduper uses the path info for relinking, so we have to dedup the item after updating the downloadPath and fileN
			dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, pron=True)
			self.log.info("Done")

			if dedupState:
				self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

			self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
			self.conn.commit()
			return wholePath

		else:
			self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
			self.conn.commit()
			return False

	def getLink(self, link):
		try:
			self.updateDbEntry(link["sourceUrl"], dlState=1)
			linkInfo = self.getDownloadInfo(link)
			self.doDownload(linkInfo)
		except IOError:
			self.log.error("Failure retreiving content for link %s", link)
			self.log.error("Traceback: %s", traceback.format_exc())
			self.updateDbEntry(link["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: MISSING")
		except urllib.error.URLError:
			self.log.error("Failure retreiving content for link %s", link)
			self.log.error("Traceback: %s", traceback.format_exc())
			self.updateDbEntry(link["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")

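# --- Standalone sketch (not part of the plugin): the thumbnail-to-full-image rewrite that
# imageUrls() above performs, assuming nhentai-style thumb URLs where the filename ends in
# "t.<ext>" and thumbnails are served from a "t." host. The sample URL is fabricated for
# illustration only.
import urllib.parse

def thumb_to_full(thumb_url, url_base="http://nhentai.net/"):
	# "2t.jpg" -> "2.jpg": strip the trailing "t." marker from the filename.
	full = thumb_url[:-6] + thumb_url[-6:].replace("t.", ".")
	full = urllib.parse.urljoin(url_base, full)
	# Thumbnails live on the "t." host, full-size images on the "i." host.
	return full.replace("//t.", "//i.")

assert thumb_to_full("//t.example.invalid/galleries/1/2t.jpg") == "http://i.example.invalid/galleries/1/2.jpg"
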
class Loader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Yo.Fl"
	pluginName = "YoManga Scans Link Retreiver"
	tableKey = "ym"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://yomanga.co/"
	seriesBase = "http://yomanga.co/reader/directory/%s/"

	def closeDB(self):
		self.log.info("Closing DB...", )
		self.conn.close()
		self.log.info("done")

	def doDownload(self, seriesName, dlurl, chapter_name):
		row = self.getRowsByValue(sourceUrl=dlurl, limitByKey=False)
		if row and row[0]['dlState'] != 0:
			return
		if not row:
			self.insertIntoDb(retreivalTime=time.time(), sourceUrl=dlurl, originName=seriesName, dlState=1, seriesName=seriesName)

		fctnt, fname = self.wg.getFileAndName(dlurl)

		fileN = '{series} - {chap} [YoManga].zip'.format(series=seriesName, chap=chapter_name)
		fileN = nt.makeFilenameSafe(fileN)

		dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)
		wholePath = os.path.join(dlPath, fileN)

		self.log.info("Source name: %s", fname)
		self.log.info("Generated name: %s", fileN)

		if newDir:
			self.updateDbEntry(dlurl, flags="haddir")
			self.conn.commit()

		with open(wholePath, "wb") as fp:
			fp.write(fctnt)

		self.log.info("Successfully Saved to path: %s", wholePath)

		dedupState = processDownload.processDownload(seriesName, wholePath, deleteDups=True)
		if dedupState:
			self.addTags(sourceUrl=dlurl, tags=dedupState)

		self.updateDbEntry(dlurl, dlState=2, downloadPath=dlPath, fileName=fileN, originName=fileN)
		self.conn.commit()

	def getContentForItem(self, url):
		new = 0
		total = 0

		soup = self.wg.getSoup(url)
		stitle = soup.find("h1", class_='title').get_text().strip()

		chapters = soup.find_all("div", class_='element')
		for chapter in chapters:
			dlurl = chapter.find("div", class_='fleft')
			chp_name = chapter.find("div", class_="title").get_text().strip()
			wasnew = self.doDownload(stitle, dlurl.a['href'], chp_name)
			if wasnew:
				new += 1
			total += 1

		return new, total

	def getSeriesUrls(self):
		ret = set()

		self.wg.stepThroughCloudFlare(self.seriesBase % 1, titleContains='Series List')

		page = 1
		while True:
			soup = self.wg.getSoup(self.seriesBase % page)
			new = False
			rows = soup.find_all('div', class_='group')
			for row in rows:
				if row.a['href'] not in ret:
					new = True
					ret.add(row.a['href'])
			page += 1
			if not new:
				break

		self.log.info("Found %s series", len(ret))

		return ret

	def getAllItems(self):
		self.log.info("Loading YoManga Items")
		seriesPages = self.getSeriesUrls()

		tot_new, total_overall = 0, 0
		for item in seriesPages:
			new, total = self.getContentForItem(item)
			tot_new += new
			total_overall += total

		self.log.info("Found %s total items, %s of which were new", total_overall, tot_new)
		return []

	def go(self):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getAllItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase):

	loggerPath = "Main.Manga.Dy.Cl"
	pluginName = "Dynasty Scans Content Retreiver"
	tableKey = "dy"
	dbName = settings.DATABASE_DB_NAME
	tableName = "MangaItems"

	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")

	retreivalThreads = 3

	urlBase = "http://dynasty-scans.com/"

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0)
		return fileN, content

	def getImageUrls(self, inMarkup, baseUrl):
		pages = {}

		jsonRe = re.compile(r'var pages = (\[.*?\]);')

		pg = jsonRe.findall(inMarkup)
		if len(pg) != 1:
			self.log.error("Erroring page '%s'", baseUrl)
			raise ValueError("Page has more then one json section?")

		images = json.loads(pg.pop())

		for item in images:
			imgurl = urllib.parse.urljoin(baseUrl, item['image'])
			pages[imgurl] = baseUrl

		self.log.info("Found %s pages", len(pages))

		return pages

	def getSeries(self, markup):
		soup = bs4.BeautifulSoup(markup)
		title = soup.find("h3", id='chapter-title')
		if title.b.find('a'):
			title = title.b.a.get_text()
		else:
			title = title.b.get_text()
		title = nt.getCanonicalMangaUpdatesName(title)
		print("Title '%s'" % title)
		return title

	def getLink(self, link):
		sourceUrl = link["sourceUrl"]
		chapterVol = link["originName"]

		inMarkup = self.wg.getpage(sourceUrl)

		seriesName = self.getSeries(inMarkup)

		try:
			self.log.info("Should retreive url - %s", sourceUrl)
			self.updateDbEntry(sourceUrl, dlState=1)

			imageUrls = self.getImageUrls(inMarkup, sourceUrl)
			if not imageUrls:
				self.log.critical("Failure on retreiving content at %s", sourceUrl)
				self.log.critical("Page not found - 404")
				self.updateDbEntry(sourceUrl, dlState=-1)
				return

			self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, chapterVol, len(imageUrls))
			dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

			if link["flags"] == None:
				link["flags"] = ""

			if newDir:
				self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
				self.conn.commit()

			chapterName = nt.makeFilenameSafe(chapterVol)

			fqFName = os.path.join(dlPath, chapterName + " [DynastyScans].zip")

			loop = 1
			prefix, ext = os.path.splitext(fqFName)
			while os.path.exists(fqFName):
				fqFName = "%s (%d)%s" % (prefix, loop, ext)
				loop += 1
			self.log.info("Saving to archive = %s", fqFName)

			images = []
			for imgUrl, referrerUrl in imageUrls.items():
				imageName, imageContent = self.getImage(imgUrl, referrerUrl)
				images.append([imageName, imageContent])

				if not runStatus.run:
					self.log.info("Breaking due to exit flag being set")
					self.updateDbEntry(sourceUrl, dlState=0)
					return

			self.log.info("Creating archive with %s images", len(images))

			if not images:
				self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404")
				return

			#Write all downloaded files to the archive.
			arch = zipfile.ZipFile(fqFName, "w")
			for imageName, imageContent in images:
				arch.writestr(imageName, imageContent)
			arch.close()

			dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True)

			self.log.info("Done")

			filePath, fileName = os.path.split(fqFName)
			self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState)
			return

		except Exception:
			self.log.critical("Failure on retreiving content at %s", sourceUrl)
			self.log.critical("Traceback = %s", traceback.format_exc())
			self.updateDbEntry(sourceUrl, dlState=-1)

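# --- Standalone sketch (not part of the plugin): the "var pages = [...]" extraction used by
# getImageUrls() above, run against a fabricated fragment of reader markup. The HTML below is
# an illustrative stand-in, not captured from Dynasty Scans.
import json
import re
import urllib.parse

sample_markup = '<script>var pages = [{"image":"/system/releases/0001.png","name":"01"}];</script>'
match = re.search(r'var pages = (\[.*?\]);', sample_markup)
pages = json.loads(match.group(1))
image_urls = [urllib.parse.urljoin("http://dynasty-scans.com/chapters/example", p["image"]) for p in pages]
print(image_urls)   # ['http://dynasty-scans.com/system/releases/0001.png']
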
class ContentLoader(ScrapePlugins.RetreivalBase.RetreivalBase):

	dbName = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.Hitomi.Cl"
	pluginName = "Hitomi Content Retreiver"
	tableKey = "hit"
	urlBase = "https://hitomi.la/"
	wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web")
	tableName = "HentaiItems"
	retreivalThreads = 3

	def getFileName(self, soup):
		title = soup.find("h1")
		if not title:
			raise ValueError("Could not find title. Wat?")
		return title.get_text().strip()

	def imageUrls(self, soup):
		thumbnailDiv = soup.find("div", id="thumbnail-container")
		ret = []

		for link in thumbnailDiv.find_all("a", class_='gallerythumb'):
			referrer = urllib.parse.urljoin(self.urlBase, link['href'])

			# bs4 attribute check: has_attr(), not hasattr() (which always succeeds on a Tag).
			if link.img.has_attr("data-src"):
				thumbUrl = link.img['data-src']
			else:
				thumbUrl = link.img['src']

			if "t." not in thumbUrl[-6:]:
				raise ValueError("Url is not a thumb? = '%s'" % thumbUrl)
			else:
				imgUrl = thumbUrl[:-6] + thumbUrl[-6:].replace("t.", '.')
				imgUrl = urllib.parse.urljoin(self.urlBase, imgUrl)
				imgUrl = imgUrl.replace("//t.", "//i.")
				ret.append((imgUrl, referrer))

		return ret

	def format_tag(self, tag_raw):
		if "♀" in tag_raw:
			tag_raw = tag_raw.replace("♀", "")
			tag_raw = "female " + tag_raw
		if "♂" in tag_raw:
			tag_raw = tag_raw.replace("♂", "")
			tag_raw = "male " + tag_raw

		tag = tag_raw.strip()
		while "  " in tag:
			tag = tag.replace("  ", " ")
		tag = tag.strip().replace(" ", "-")
		return tag.lower()

	def getCategoryTags(self, soup):
		tablediv = soup.find("div", class_='gallery-info')
		tagTable = soup.find("table")

		tags = []

		formatters = {
			"series"     : "parody",
			"characters" : "characters",
			"tags"       : "",
		}

		ignoreTags = [
			"type",
		]

		print("soup.h2", )

		category = "Unknown?"

		for tr in tagTable.find_all("tr"):
			if len(tr.find_all("td")) != 2:
				continue
			what, values = tr.find_all("td")
			what = what.get_text().strip().lower()

			# "type" determines the category, so handle it before the ignore list, and compare
			# lowercase (the original compared against "Type", which could never match after
			# the lower() above).
			if what == "type":
				category = values.get_text().strip()
				if category == "Manga One-shot":
					category = "=0= One-Shot"
			elif what in ignoreTags:
				continue
			elif what == "language":
				lang_tag = values.get_text().strip()
				lang_tag = self.format_tag("language " + lang_tag)
				tags.append(lang_tag)
			elif what in formatters:
				for li in values.find_all("li"):
					tag = " ".join([formatters[what], li.get_text()])
					tag = self.format_tag(tag)
					tags.append(tag)

		artist_str = "unknown artist"
		for artist in soup.h2("li"):
			artist_str = artist.get_text()
			atag = "artist " + artist_str
			atag = self.format_tag(atag)
			tags.append(atag)

		print(category, tags)
		return category, tags, artist_str

	def getDownloadInfo(self, linkDict):
		sourcePage = linkDict["sourceUrl"]

		self.log.info("Retreiving item: %s", sourcePage)

		# self.updateDbEntry(linkDict["sourceUrl"], dlState=1)
		soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': 'https://hitomi.la/'})

		if not soup:
			self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"])
			raise IOError("Invalid webpage")

		gal_section = soup.find("div", class_='gallery')

		category, tags, artist = self.getCategoryTags(gal_section)
		tags = ' '.join(tags)
		linkDict['artist'] = artist
		linkDict['title'] = self.getFileName(gal_section)

		linkDict['dirPath'] = os.path.join(settings.hitSettings["dlDir"], nt.makeFilenameSafe(category))

		if not os.path.exists(linkDict["dirPath"]):
			os.makedirs(linkDict["dirPath"])
		else:
			self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

		self.log.info("Folderpath: %s", linkDict["dirPath"])

		self.log.debug("Linkdict = ")
		for key, value in list(linkDict.items()):
			self.log.debug(" %s - %s", key, value)

		if tags:
			self.log.info("Adding tag info %s", tags)
			self.addTags(sourceUrl=linkDict["sourceUrl"], tags=tags)

		read_url = soup.find("a", text=re.compile("Read Online", re.IGNORECASE))
		spage = urllib.parse.urljoin(self.urlBase, read_url['href'])

		linkDict["spage"] = spage

		self.updateDbEntry(linkDict["sourceUrl"], seriesName=category, lastUpdate=time.time())

		return linkDict

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0)
		return fileN, content

	def getImages(self, linkDict):
		print("getImage", linkDict)

		soup = self.wg.getSoup(linkDict['spage'], addlHeaders={'Referer': linkDict["sourceUrl"]})

		raw_imgs = soup.find_all('div', class_="img-url")

		imageurls = []
		for div in raw_imgs:
			imgurl = div.get_text().strip()
			imgurl = re.sub(r"\/\/..?\.hitomi\.la\/", r'https://la.hitomi.la/', imgurl, flags=re.IGNORECASE)
			imageurls.append((imgurl, linkDict['spage']))

		if not imageurls:
			return []

		images = []
		for imageurl, referrer in imageurls:
			images.append(self.getImage(imageurl, referrer))

		return images

	def getLink(self, linkDict):
		try:
			linkDict = self.getDownloadInfo(linkDict)
			images = self.getImages(linkDict)
			title = linkDict['title']
			artist = linkDict['artist']
		except webFunctions.ContentError:
			self.updateDbEntry(linkDict["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: FAILED")
			return False

		if images and title:
			fileN = title+" "+artist+".zip"
			fileN = nt.makeFilenameSafe(fileN)

			# self.log.info("geturl with processing", fileN)
			wholePath = os.path.join(linkDict["dirPath"], fileN)
			wholePath = self.insertCountIfFilenameExists(wholePath)
			self.log.info("Complete filepath: %s", wholePath)

			#Write all downloaded files to the archive.
			try:
				arch = zipfile.ZipFile(wholePath, "w")
			except OSError:
				title = title.encode('ascii', 'ignore').decode('ascii')
				fileN = title+".zip"
				fileN = nt.makeFilenameSafe(fileN)
				wholePath = os.path.join(linkDict["dirPath"], fileN)
				arch = zipfile.ZipFile(wholePath, "w")

			for imageName, imageContent in images:
				arch.writestr(imageName, imageContent)
			arch.close()

			self.log.info("Successfully Saved to path: %s", wholePath)

			self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

			# Deduper uses the path info for relinking, so we have to dedup the item after updating the downloadPath and fileN
			dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True, rowId=linkDict['dbId'])

			self.log.info("Done")

			if dedupState:
				self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

			self.updateDbEntry(linkDict["sourceUrl"], dlState=2)

			return wholePath
		else:
			self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
			return False

	def setup(self):
		self.wg.stepThroughCloudFlare(self.urlBase, titleContains="Hitomi")

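# --- Standalone sketch (not part of the plugin): the tag normalisation performed by
# format_tag() above. Gender glyphs become "female "/"male " prefixes, runs of whitespace
# collapse, and spaces become hyphens. The sample inputs are made up for illustration.
def format_tag(tag_raw):
	if "♀" in tag_raw:
		tag_raw = "female " + tag_raw.replace("♀", "")
	if "♂" in tag_raw:
		tag_raw = "male " + tag_raw.replace("♂", "")
	tag = tag_raw.strip()
	while "  " in tag:
		tag = tag.replace("  ", " ")
	return tag.strip().replace(" ", "-").lower()

assert format_tag("glasses ♀") == "female-glasses"
assert format_tag("parody   some  series") == "parody-some-series"
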
class FeedLoader(ScrapePlugins.LoaderBase.LoaderBase):

	loggerPath = "Main.Manga.Mc.Fl"
	pluginName = "MangaCow Link Retreiver"
	tableKey = "mc"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://mngcow.co/"
	feedUrl = "http://mngcow.co/manga-list/"

	def extractItemInfo(self, soup):
		ret = {}

		container = soup.find("div", class_="mng_ifo")
		infoDiv = container.find("div", class_="det")
		titleDiv = infoDiv.find("h4")
		ret["title"] = titleDiv.get_text()

		items = infoDiv.find_all("p")

		ret["note"] = " ".join(items[0].strings)
		# Messy hack to replace <br> tags with a ' ', rather then just removing them.
		# And clean out the non-breaking spaces
		ret["note"] = ret["note"].replace(chr(0xa0), ' ')

		for item in items:
			text = item.get_text().strip()
			if not ":" in text:
				continue
			what, text = text.split(":", 1)
			if what == "Category":
				tags = [tag_link.get_text() for tag_link in item.find_all("a")]
				tags = [tag.lower().strip().replace(" ", "-") for tag in tags]
				ret["tags"] = " ".join(tags)

		return ret

	def getItemPages(self, url):
		# print("Should get item for ", url)
		page = self.wg.getpage(url)

		soup = bs4.BeautifulSoup(page, "lxml")
		baseInfo = self.extractItemInfo(soup)

		ret = []
		for link in soup.find_all("a", class_="lst"):
			item = {}

			url = link["href"]
			chapTitle = link.find("b", class_="val")
			chapTitle = chapTitle.get_text()

			chapDate = link.find("b", class_="dte")
			date = dateutil.parser.parse(chapDate.get_text(), fuzzy=True)

			item["originName"] = "{series} - {file}".format(series=baseInfo["title"], file=chapTitle)
			item["sourceUrl"] = url
			item["seriesName"] = baseInfo["title"]
			item["tags"] = baseInfo["tags"]
			item["note"] = baseInfo["note"]
			item["retreivalTime"] = calendar.timegm(date.timetuple())

			ret.append(item)

		return ret

	def getSeriesUrls(self):
		ret = []

		print("wat?")
		page = self.wg.getpage(self.feedUrl)
		soup = bs4.BeautifulSoup(page, "lxml")
		divs = soup.find_all("div", class_="img_wrp")
		for div in divs:
			url = div.a["href"]
			ret.append(url)

		return ret

	def getFeed(self):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading Mc Items")

		ret = []

		seriesPages = self.getSeriesUrls()

		for item in seriesPages:
			itemList = self.getItemPages(item)
			for item in itemList:
				ret.append(item)
			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s total items", len(ret))
		return ret

class DbLoader(ScrapePlugins.LoaderBase.LoaderBase):

	loggerPath = "Main.Manga.Bt.Fl"
	pluginName = "Batoto Link Retreiver"
	tableKey = "bt"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://www.bato.to/"
	feedUrl = "http://www.bato.to/?p=%d"

	def parseDateStr(self, inStr):
		# For strings like "n Days Ago", split out the "n", convert it to an int, and take the
		# time-delta so we know what actual date it refers to.

		# convert instances of "a minute ago" to "1 minute ago", for mins, hours, etc...
		inStr = inStr.strip()
		if inStr.lower().startswith("an"):
			inStr = "1" + inStr[2:]
		if inStr.lower().startswith("a"):
			inStr = "1" + inStr[1:]

		if "just now" in inStr:
			updateDate = datetime.datetime.now()
		elif "months ago" in inStr or "month ago" in inStr:
			monthsAgo = inStr.split()[0]
			monthsAgo = int(monthsAgo)
			# Approximate a month as 30 days (the original delta of 7 days per month looks like
			# a copy-paste from the "weeks" branch below).
			updateDate = datetime.datetime.now() - datetime.timedelta(monthsAgo * 30)
		elif "weeks ago" in inStr or "week ago" in inStr:
			weeksAgo = inStr.split()[0]
			weeksAgo = int(weeksAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(weeksAgo * 7)
		elif "days ago" in inStr or "day ago" in inStr:
			daysAgo = inStr.split()[0]
			daysAgo = int(daysAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(daysAgo)
		elif "hours ago" in inStr or "hour ago" in inStr:
			hoursAgo = inStr.split()[0]
			hoursAgo = int(hoursAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(0, hoursAgo * 60 * 60)
		elif "minutes ago" in inStr or "minute ago" in inStr:
			minutesAgo = inStr.split()[0]
			minutesAgo = int(minutesAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(0, minutesAgo * 60)
		elif "seconds ago" in inStr or "second ago" in inStr:
			secondsAgo = inStr.split()[0]
			secondsAgo = int(secondsAgo)
			updateDate = datetime.datetime.now() - datetime.timedelta(0, secondsAgo)
		else:
			# self.log.warning("Date parsing failed. Using fall-back parser")
			updateDate = dateutil.parser.parse(inStr, fuzzy=True)
			# self.log.warning("Failing string = '%s'", inStr)
			# self.log.warning("As parsed = '%s'", updateDate)

		return updateDate

	def getItemFromSeriesPageContainer(self, row):
		cells = row.find_all("td")
		if len(cells) != 5:
			# self.log.error("Invalid number of TD items in row!")
			return None

		chapter, lang, dummy_scanlator, dummy_uploader, uploadDate = cells

		# Skip uploads in other languages
		if DOWNLOAD_ONLY_LANGUAGE and not DOWNLOAD_ONLY_LANGUAGE in str(lang):
			return None

		dateStr = uploadDate.get_text().strip()
		addDate = self.parseDateStr(dateStr)

		item = {}
		item["retreivalTime"] = calendar.timegm(addDate.timetuple())
		item["sourceUrl"] = chapter.a["href"]

		if not "http://bato.to/reader#" in item["sourceUrl"]:
			return False

		return item

	def fetchItemsForSeries(self, seriesUrl, historical):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading items from '%s'", seriesUrl)

		soup = self.wg.getSoup(seriesUrl)

		# Find the divs containing either new files, or the day a file was uploaded
		itemRows = soup.find_all("tr", class_=re.compile("chapter_row"))

		items = 0
		newItems = 0

		ret = []

		for itemRow in itemRows:
			item = self.getItemFromSeriesPageContainer(itemRow)

			if item:
				items += 1

				# Only fetch an item if it's less then 48 hours old, or we're running
				# in historical mode (which means fetch all the things)
				# if item["retreivalTime"] > (time.time() - 60*60*48) or historical:

				# Fukkit, just grab everything.
				if True:
					newItems += 1
					ret.append(item)

		self.log.info("Found %s of %s items recent enough to download for %s.", newItems, items, seriesUrl)

		return ret

	def getItemsFromSeriesUrls(self, seriesItems, historical):
		ret = []
		self.log.info("Have %s items to fetch data for.", len(seriesItems))

		with ThreadPoolExecutor(max_workers=2) as executor:
			tmp = []
			for seriesUrl in seriesItems:
				tmp.append(executor.submit(self.fetchItemsForSeries, seriesUrl, historical))

			for future in tmp:
				# items = self.fetchItemsForSeries(seriesUrl, historical)
				items = future.result()
				for item in items:
					ret.append(item)
				if not runStatus.run:
					self.log.info("Breaking due to exit flag being set")
					break

		return ret

	def getSeriesUrl(self, row):
		cells = row.find_all("td")
		if len(cells) == 2:
			return cells.pop(0).a['href']
		return None

	def getFeed(self, rangeOverride=None, rangeOffset=None, historical=False):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading BT Main Feed")

		seriesPages = []

		if not rangeOverride:
			dayDelta = 2
		else:
			dayDelta = int(rangeOverride)
		if not rangeOffset:
			rangeOffset = 0

		seriesPages = set()

		for daysAgo in range(1, dayDelta + 1):
			url = self.feedUrl % (daysAgo + rangeOffset)
			page = self.wg.getpage(url)
			soup = bs4.BeautifulSoup(page, "lxml")

			# Find the divs containing either new files, or the day a file was uploaded
			itemRow = soup.find_all("tr", class_=re.compile("row[01]"))

			for row in itemRow:
				item = self.getSeriesUrl(row)
				if item:
					seriesPages.add(item)

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		ret = self.getItemsFromSeriesUrls(seriesPages, historical)

		return ret

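# --- Standalone sketch (not part of the plugin): the relative-date normalisation that
# parseDateStr() above relies on. "an hour ago"-style strings are rewritten to "1 hour ago"
# before the count is split off. The inputs are illustrative examples.
import datetime

def parse_relative(in_str):
	in_str = in_str.strip()
	if in_str.lower().startswith("an "):
		in_str = "1" + in_str[2:]
	elif in_str.lower().startswith("a "):
		in_str = "1" + in_str[1:]
	count = int(in_str.split()[0])
	if "hour" in in_str:
		return datetime.datetime.now() - datetime.timedelta(hours=count)
	return datetime.datetime.now() - datetime.timedelta(days=count)

print(parse_relative("an hour ago"))
print(parse_relative("3 days ago"))
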
class FeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Sura.Fl"
	pluginName = "Sura's Place Link Retreiver"
	tableKey = "sura"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://www.surasplace.com/"
	feedUrl = "http://www.surasplace.com/index.php/projects/popular/page{num}.html"

	def getSeriesPages(self):
		page = 1
		links = set()
		hadNew = True
		while (hadNew):
			hadNew = False
			url = self.feedUrl.format(num=page)

			soup = self.wg.getSoup(url)
			divs = soup.find_all("div", class_='lsrow')
			for div in divs:
				header = div.find("div", class_='header')
				if not header.find("span", itemprop='name'):
					continue

				itemUrl = header.h3.a['href']
				itemName = header.h3.a.span.get_text()

				fullUrl = urllib.parse.urljoin(self.urlBase, itemUrl)

				# Apparently content is added manually, leading to some broken URLs.
				# anyways, fix those as they crop up.
				if fullUrl.endswith("htmll"):
					fullUrl = fullUrl[:-1]
				for x in range(len(fullUrl)):
					if fullUrl[x:] == fullUrl[:x]:
						fullUrl = fullUrl[x:]
						break

				if not fullUrl in links:
					links.add(fullUrl)
					hadNew |= True

			page += 1

		self.log.info("Found %s series-like items.", len(links))
		return links

	def extractItemInfo(self, soup):
		ret = {}

		titleDiv = soup.find("span", itemprop="name")
		ret["title"] = titleDiv.get_text()

		# Holy shit, unique IDs for each metadata field. Halle-f*****g-lujah
		tags = soup.find("div", id='field_28')
		tagitems = []
		if tags:
			for item in tags.find_all("a", class_='tag'):
				tag = item.get_text().strip()
				while "  " in tag:
					tag = tag.replace("  ", " ")
				tag = tag.replace(" ", "-").lower()
				# print("Text:", tag)
				tagitems.append(tag)
		ret["tags"] = " ".join(tagitems)

		return ret

	def getItemPages(self, url):
		soup = self.wg.getSoup(url.strip(), addlHeaders={'Referer': 'http://www.surasplace.com/index.php/projects.html'})
		baseInfo = self.extractItemInfo(soup)

		ret = []

		contents = soup.find("div", class_='listing-desc')
		items = contents.find_all("td")
		for link in items:
			if not link.a:
				continue
			# print(link)

			item = {}
			item["sourceUrl"] = link.a["href"].strip()
			item["seriesName"] = baseInfo["title"]
			item["tags"] = baseInfo["tags"]
			item["retreivalTime"] = time.time()

			ret.append(item)

		return ret

	def getAllItems(self):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading Mc Items")

		ret = []

		seriesPages = self.getSeriesPages()

		for itemUrl in seriesPages:
			itemList = self.getItemPages(itemUrl)
			for itemUrl in itemList:
				ret.append(itemUrl)
			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s total items", len(ret))
		return ret

	def go(self):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getAllItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

class Scrape(TextScrape.SiteArchiver.SiteArchiver):
	tableKey = 'japtem'
	loggerPath = 'Main.Text.JapTem.Scrape'
	pluginName = 'JapTemScrape'

	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")

	threads = 4

	feeds = ['http://japtem.com/feed/']

	baseUrl = [
		"http://japtem.com/",
		"http://www.japtem.com/",
	]

	startUrl = baseUrl

	badwords = [
		"/viewtopic.php",
		"/memberlist.php",
		"/search.php",
		"/wp-content/plugins/",
		"/styles/prosilver/theme/",
		"/forums/",
		"/forum/",
		"/cdn-cgi/",
		"/help/",
		"?share=",
		"?popup=",
		"viewforum.php",
		"/wp-login.php",
		"/#comments",	# Ignore in-page anchor tags
		"/staff/",
	]

	positive_keywords = ['main_content']

	negative_keywords = [
		'mw-normal-catlinks',
		"printfooter",
		"mw-panel",
		'portal',
	]

	decomposeBefore = [
		{'id': 'disqus_thread'},
	]

	decompose = [
		{'class': 'slider-container'},
		{'class': 'secondarymenu-container'},
		{'class': 'mainmenu-container'},
		{'class': 'mobile-menu'},
		{'class': 'footer'},
		{'class': 'sidebar'},
		{'class': 'disqus_thread'},
		{'class': 'sharedaddy'},
		{'class': 'pagination'},
		{'class': 'scrollUp'},

		{'id': 'slider-container'},
		{'id': 'secondarymenu-container'},
		{'id': 'mainmenu-container'},
		{'id': 'mobile-menu'},
		{'id': 'footer'},
		{'id': 'sidebar'},
		{'id': 'disqus_thread'},
		{'id': 'sharedaddy'},
		{'id': 'scrollUp'},
	]

class JzFeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Jz.Fl"
	pluginName = "Japanzai Link Retreiver"
	tableKey = "jz"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	urlBase = "http://download.japanzai.com/"
	tableName = "MangaItems"

	def checkLogin(self):
		pass

	def closeDB(self):
		self.log.info("Closing DB...", )
		self.conn.close()
		self.log.info("done")

	def quoteUrl(self, url):
		# print("InUrl = '%s'" % url)
		scheme, netloc, path, params, query, fragment = urllib.parse.urlparse(url)
		# print((scheme, netloc, path, params, query, fragment))
		path = urllib.parse.quote(path)
		params = urllib.parse.quote(params)
		query = urllib.parse.quote(query, safe="/=")
		fragment = urllib.parse.quote(fragment)
		# print((scheme, netloc, path, params, query, fragment))
		url = urllib.parse.urlunparse((scheme, netloc, path, params, query, fragment))
		# print("outUrl = '%s'" % url)
		return url

	def getItemsFromContainer(self, seriesName, seriesUrl):
		self.log.info("Fetching items for series '%s'", seriesName)
		self.log.info("Using URL '%s'", seriesUrl)

		itemPage = self.wg.getpage(seriesUrl)
		soup = bs4.BeautifulSoup(itemPage, "lxml")

		linkLis = soup.find_all("li", class_="file")

		ret = []
		for linkLi in linkLis:
			item = {}

			dlUrl = urllib.parse.urljoin(seriesUrl, self.quoteUrl(linkLi.a["href"]))

			item["retreivalTime"] = time.time()
			item["originName"] = linkLi.a.get_text().rsplit("-")[0].strip()
			item["sourceUrl"] = dlUrl
			item["seriesName"] = seriesName

			ret.append(item)

		moreDirs = self.getSeriesPages(soup, seriesUrl)

		return moreDirs, ret

	def getSeriesPages(self, soup, urlBase):
		linkLis = soup.find_all("li", class_="directory")

		ret = []
		for linkLi in linkLis:
			series = linkLi.a.get_text()
			if series == "..":
				continue
			url = urllib.parse.urljoin(urlBase, self.quoteUrl(linkLi.a["href"]))
			ret.append((series, url))

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				return

		return ret

	def getMainItems(self):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading Japanzai Main Feed")

		ret = []

		basePage = self.wg.getpage(self.urlBase)
		soup = bs4.BeautifulSoup(basePage, "lxml")
		seriesPages = self.getSeriesPages(soup, self.urlBase)

		while len(seriesPages):
			seriesName, seriesUrl = seriesPages.pop()

			try:
				newDirs, newItems = self.getItemsFromContainer(seriesName, seriesUrl)

				for newDir in newDirs:
					seriesPages.append(newDir)

				for newItem in newItems:
					ret.append(newItem)
			except urllib.error.URLError:
				self.log.error("Failed to retreive page at url '%s'", seriesUrl)
				self.log.error(traceback.format_exc())

		return ret

	def go(self):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getMainItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

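# --- Standalone sketch (not part of the plugin): the per-component URL quoting that quoteUrl()
# above performs, so directory listings with spaces or unicode in their paths become fetchable.
# The input URL is an invented example.
import urllib.parse

def quote_url(url):
	scheme, netloc, path, params, query, fragment = urllib.parse.urlparse(url)
	path = urllib.parse.quote(path)
	params = urllib.parse.quote(params)
	query = urllib.parse.quote(query, safe="/=")
	fragment = urllib.parse.quote(fragment)
	return urllib.parse.urlunparse((scheme, netloc, path, params, query, fragment))

print(quote_url("http://download.example.invalid/Some Series/ch 01.zip"))
# http://download.example.invalid/Some%20Series/ch%2001.zip
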
class TriggerLoader(ScrapePlugins.IrcGrabber.IrcQueueBase.IrcQueueBase):

	loggerPath = "Main.Manga.Iro.Fl"
	pluginName = "IrcOffer site Link Retreiver"
	tableKey = "irc-irh"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"

	feedUrls = [
		("http://blah.hawt.co/", "blahmanga"),
		("http://stupidcommotion.net/index.php?group=*", "stupidcommotion"),
		("http://stupidcommotion.net/torako.php?group=*", "stupidcommotion"),
	]

	def getBot(self, packUrl, channel):
		server = "irchighway"

		self.log.info("Fetching page")
		soup = self.wg.getSoup(packUrl)
		self.log.info("Done. Searching")

		header = soup.h1.get_text().strip()
		botname = header.split()[0]
		# print("Header = ", header, "bot = ", botname)

		mainTable = soup.find("table", summary="list")

		ret = []
		for row in mainTable.find_all("tr"):
			item = {}
			rowItems = row.find_all("td")
			if len(rowItems) == 4:
				pkgNum, dummy_dlcnt, size, info = rowItems
				item["pkgNum"] = pkgNum.get_text().strip("#").strip()
				item["server"] = server
				item["channel"] = channel

				sizeText = size.get_text().strip()

				# Skip all files that have sizes in bytes (just header files and shit)
				if "b" in sizeText.lower():
					continue
				if "k" in sizeText.lower():
					item["size"] = float(sizeText.lower().strip("k").strip()) / 1000.0
				elif "g" in sizeText.lower():
					item["size"] = float(sizeText.lower().strip("g").strip()) * 1000.0
				else:
					item["size"] = float(sizeText.lower().strip("m").strip())

				item["botName"] = botname

				if info.find("span", class_="selectable"):
					fname = info.find("span", class_="selectable").get_text().strip()
				elif info.find("a"):
					fname = info.a.get_text().strip().split(" ", 2)[-1]
				else:
					raise ValueError

				item["fName"] = fname

				# I'm using the filename+botname for the unique key to the database.
				itemKey = item["fName"] + item["botName"]

				# Skip video files.
				badExts = ['.mkv', '.mp4', '.avi', '.wmv']
				if any([item["fName"].endswith(skipType) for skipType in badExts]):
					# print("Skipping", item)
					continue

				# print(item)
				item = json.dumps(item)
				ret.append((itemKey, item))

			# else:
			# 	print("Bad row? ", row)

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s items", len(ret))
		return ret

	def getMainItems(self):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading IrcOffer Main Feeds")

		ret = []

		for url, channel in self.feedUrls:
			ret += self.getBot(url, channel)

		self.log.info("All data loaded")
		return ret

class FeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Mh.Fl"
	pluginName = "MangaHere Link Retreiver"
	tableKey = "mh"
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://www.mangahere.co/"
	seriesBase = "http://www.mangahere.co/latest/"

	def closeDB(self):
		self.log.info("Closing DB...", )
		self.conn.close()
		self.log.info("done")

	def getUpdatedSeries(self, url):
		ret = set()

		soup = self.wg.getSoup(url)

		if soup.find("div", class_='manga_updates'):
			mainDiv = soup.find("div", class_='manga_updates')
		else:
			raise ValueError("Could not find listing table?")

		for child in mainDiv.find_all("dl"):
			if child.dt:
				seriesUrl = urllib.parse.urljoin(self.urlBase, child.dt.a['href'])
				ret.add(seriesUrl)

		self.log.info("Found %s series", len(ret))

		return ret

	def getUpdatedSeriesPages(self):
		# Historical stuff goes here, if wanted.

		self.log.info("Loading MangaHere Items")

		pages = self.getUpdatedSeries(self.seriesBase)

		self.log.info("Found %s total items", len(pages))
		return pages

	# Check retreived page to see if it has a mature content warning
	# Step through if it does.
	# Returns page with actual content, either way
	def checkAdult(self, soup):
		adultPassThrough = soup.find("a", id='aYes')
		if not adultPassThrough:
			return soup

		self.log.info("Adult pass-through page. Stepping through")
		confirmLink = adultPassThrough['href']
		return self.wg.getSoup(confirmLink)

	def getSeriesInfoFromSoup(self, soup):
		# Should probably extract tagging info here. Laaaaazy
		# MangaUpdates interface does a better job anyways.
		titleA = soup.find("h1", class_='title')
		return {"seriesName": titleA.get_text().title()}

	def getChaptersFromSeriesPage(self, soup):
		table = soup.find('div', class_='detail_list')

		items = []
		for row in table.find_all("li"):
			if not row.a:
				continue	# Skip the table header row

			chapter = row.find("span", class_='left')
			date = row.find("span", class_='right')
			item = {}

			# Name is formatted "{seriesName} {bunch of spaces}\n{chapterName}"
			# Clean up that mess to "{seriesName} - {chapterName}"
			name = chapter.get_text().strip()
			name = name.replace("\n", " - ")
			while "  " in name:
				name = name.replace("  ", " ")

			item["originName"] = name
			item["sourceUrl"] = urllib.parse.urljoin(self.urlBase, chapter.a['href'])

			dateStr = date.get_text().strip()
			itemDate, status = parsedatetime.Calendar().parse(dateStr)
			if status != 1:
				continue
			item['retreivalTime'] = calendar.timegm(itemDate)
			items.append(item)

		return items

	def getChapterLinkFromSeriesPage(self, seriesUrl):
		ret = []
		soup = self.wg.getSoup(seriesUrl)
		soup = self.checkAdult(soup)

		seriesInfo = self.getSeriesInfoFromSoup(soup)

		chapters = self.getChaptersFromSeriesPage(soup)

		for chapter in chapters:
			for key, val in seriesInfo.items():	# Copy series info into each chapter
				chapter[key] = val
			ret.append(chapter)

		self.log.info("Found %s items on page for series '%s'", len(ret), seriesInfo['seriesName'])

		return ret

	def getAllItems(self):
		toScan = self.getUpdatedSeriesPages()

		ret = []
		for url in toScan:
			items = self.getChapterLinkFromSeriesPage(url)
			for item in items:
				if item in ret:
					raise ValueError("Duplicate items in ret?")
				ret.append(item)

		return ret

	def go(self):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getAllItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

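# --- Standalone sketch (not part of the plugin): the date handling in getChaptersFromSeriesPage()
# above. parsedatetime returns a time.struct_time plus a status flag, and the struct is converted
# to a unix timestamp with calendar.timegm(). The date string is an illustrative example.
import calendar
import parsedatetime

date_struct, status = parsedatetime.Calendar().parse("Jan 12, 2015")
if status == 1:		# status 1 means a date (with no time component) was recognised
	print(calendar.timegm(date_struct))
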
class ContentLoader(ScrapePlugins.RetreivalBase.RetreivalBase):

	dbName = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.ASMHentai.Cl"
	pluginName = "ASMHentai Content Retreiver"
	tableKey = "asmh"
	urlBase = "https://asmhentai.com/"
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "HentaiItems"
	retreivalThreads = 6
	itemLimit = 220

	shouldCanonize = False

	def getFileName(self, soup):
		title = soup.find("h1", class_="otitle")
		if not title:
			raise ValueError("Could not find title. Wat?")
		return title.get_text()

	def build_links(self, imtag, selector):
		imgurl = imtag['src']
		imgurl = urllib.parse.urljoin(self.urlBase, imgurl)

		# This is brittle too
		urlprefix, fname = imgurl.rsplit("/", 1)
		fname, fext = os.path.splitext(fname)

		ret = []
		for item in selector.find_all('option'):
			if item.get("value"):
				pageurl = urllib.parse.urljoin(self.urlBase, item.get("value"))
				pagenum = pageurl.strip("/").split("/")[-1]
				imgurl = urlprefix + "/" + str(pagenum) + fext
				ret.append((imgurl, pageurl))

		return ret

	def getDownloadInfo(self, linkDict, retag=False):
		sourcePage = linkDict["sourceUrl"]

		self.log.info("Retreiving item: %s", sourcePage)

		try:
			soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': self.urlBase})
		except Exception:
			self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"])
			raise IOError("Invalid webpage")

		linkDict['dirPath'] = os.path.join(settings.asmhSettings["dlDir"], linkDict['seriesName'])

		if not os.path.exists(linkDict["dirPath"]):
			os.makedirs(linkDict["dirPath"])
		else:
			self.log.info("Folder Path already exists?: %s", linkDict["dirPath"])

		self.log.info("Folderpath: %s", linkDict["dirPath"])
		#self.log.info(os.path.join())

		read_link = soup.find("a", href=re.compile(r"/gallery/\d+?/\d+?/", re.IGNORECASE))

		nav_to = urllib.parse.urljoin(self.urlBase, read_link['href'])

		soup = self.wg.getSoup(nav_to, addlHeaders={'Referer': sourcePage})

		if soup.find_all("div", class_="g-recaptcha"):
			raise ScrapeExceptions.LimitedException

		selector = soup.find('select', class_='pag_info')
		imgdiv = soup.find('div', id='img')
		imtag = imgdiv.find('img')
		linkDict['originName'] = imtag['alt']

		imageUrls = self.build_links(imtag, selector)

		self.log.info("Found %s image urls!", len(imageUrls))

		linkDict["dlLinks"] = imageUrls

		self.log.debug("Linkdict = ")
		for key, value in list(linkDict.items()):
			self.log.debug(" %s - %s", key, value)

		return linkDict

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0)
		return fileN, content

	def fetchImages(self, linkDict):
		images = []
		for imgUrl, referrerUrl in linkDict["dlLinks"]:
			images.append(self.getImage(imgUrl, referrerUrl))
		return images

	def doDownload(self, linkDict, link, retag=False):
		images = self.fetchImages(linkDict)

		# self.log.info(len(content))

		if images:
			fileN = linkDict['originName'] + ".zip"
			fileN = nt.makeFilenameSafe(fileN)

			# self.log.info("geturl with processing", fileN)
			wholePath = os.path.join(linkDict["dirPath"], fileN)
			self.log.info("Complete filepath: %s", wholePath)

			#Write all downloaded files to the archive.
			chop = len(fileN) - 4
			wholePath = "ERROR"
			while 1:
				try:
					fileN = fileN[:chop] + fileN[-4:]
					# self.log.info("geturl with processing", fileN)
					wholePath = os.path.join(linkDict["dirPath"], fileN)
					wholePath = self.insertCountIfFilenameExists(wholePath)
					self.log.info("Complete filepath: %s", wholePath)

					#Write all downloaded files to the archive.
					arch = zipfile.ZipFile(wholePath, "w")
					for imageName, imageContent in images:
						arch.writestr(imageName, imageContent)
					arch.close()

					self.log.info("Successfully Saved to path: %s", wholePath)
					break
				except IOError:
					chop = chop - 1
					self.log.warn("Truncating file length to %s characters.", chop)

			if not linkDict["tags"]:
				linkDict["tags"] = ""

			self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

			# Deduper uses the path info for relinking, so we have to dedup the item after updating the downloadPath and fileN
			dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, pron=True, rowId=link['dbId'])

			self.log.info("Done")

			if dedupState:
				self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

			self.updateDbEntry(linkDict["sourceUrl"], dlState=2)

			return wholePath
		else:
			self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
			return False

	def getLink(self, link):
		try:
			self.updateDbEntry(link["sourceUrl"], dlState=1)
			linkInfo = self.getDownloadInfo(link)
			self.doDownload(linkInfo, link)
		except urllib.error.URLError:
			self.log.error("Failure retreiving content for link %s", link)
			self.log.error("Traceback: %s", traceback.format_exc())
			self.updateDbEntry(link["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		except IOError:
			self.log.error("Failure retreiving content for link %s", link)
			self.log.error("Traceback: %s", traceback.format_exc())
			self.updateDbEntry(link["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: MISSING")

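# --- Standalone sketch (not part of the plugin): the URL construction in build_links() above.
# The gallery's page selector supplies page numbers, and each full-size image URL is rebuilt by
# swapping the page number into the filename of the first image. The URLs below are fabricated
# placeholders.
import os.path

first_img = "https://images.example.invalid/galleries/42/1.jpg"
page_urls = ["https://asmhentai.com/gallery/42/%d/" % n for n in (1, 2, 3)]

prefix, fname = first_img.rsplit("/", 1)
_, ext = os.path.splitext(fname)

links = []
for page_url in page_urls:
	page_num = page_url.strip("/").split("/")[-1]
	links.append((prefix + "/" + page_num + ext, page_url))

print(links[:2])
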
class DbLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	dbName = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.CrunchyRoll.Fl"
	pluginName = "CrunchyRoll Link Retreiver"
	tableKey = "cr"
	urlBase = "http://www.crunchyroll.com/"
	urlFeed = "http://www.crunchyroll.com/comics/manga/updated"
	ajaxRoot = "http://www.crunchyroll.com/ajax/"
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"

	def getInfo(self, inMarkup):
		ret = {}
		soup = bs4.BeautifulSoup(inMarkup)
		header = soup.find("h1", class_='ellipsis')

		# Remove the leading breadcrumb link
		header.a.decompose()

		name = header.get_text()
		name = name.lstrip("> ").strip()

		ret["seriesName"] = name
		ret['retreivalTime'] = time.time()
		return ret

	def extractItemPage(self, page):
		# Extract the information needed to determine the ajax call that will let us get the
		# recent items for the series.
		if not page:
			return False

		indiceQuery = re.compile(r'var next_first_visible = (\d+);')
		jsFrag = re.compile(r" ajax_root: '/ajax/\?req=RpcApiManga_GetMangaCollectionCarouselPage',.+?},.+callback: function\(resp\)", re.DOTALL)

		indice = indiceQuery.search(page)
		frag = jsFrag.search(page)

		if not indice or not frag:
			return None

		paramRe = re.compile(r'params_obj: ({.+})', re.DOTALL)
		urlParams = paramRe.search(frag.group(0))
		if not urlParams:
			return None

		# YAML insists on a space after a colon. Since our input is
		# really a js literal which doesn't need (or have) those spaces,
		# we fudge the space in to make PyYAML not error.
		params = urlParams.group(1).replace(":", ": ")
		params = yaml.safe_load(params)

		params['first_index'] = indice.group(1)
		params['req'] = "RpcApiManga_GetMangaCollectionCarouselPage"

		ajaxUrl = '%s?%s' % (self.ajaxRoot, urllib.parse.urlencode(params))

		page = self.wg.getpage(ajaxUrl)
		if not page:
			return False

		return page

	def extractUrl(self, page):
		mangaCarousel = self.extractItemPage(page)

		if not mangaCarousel:
			return False

		# There is some XSS (I think?) blocking stuff, namely the whole AJAX response is
		# wrapped in comments to protect from certain parsing attacks or something?
		# Anyways, get rid of that.
		mangaCarousel = mangaCarousel.replace("/*-secure-", "").replace("*/", "")

		data = json.loads(mangaCarousel)

		if data['result_code'] != 1:
			# Failure?
			return False

		if not data['data']:
			return False

		# print(data['data'].keys())
		raw = ''.join(data['data'].values())

		soup = bs4.BeautifulSoup(raw)
		links = soup.find_all("a")

		ret = []
		for link in links:
			if 'comics_read' in link['href']:
				link = urllib.parse.urljoin(self.urlBase, link['href'])
				ret.append(link)

		return ret

	def parseItem(self, pageUrl):
		page = self.wg.getpage(pageUrl)

		info = self.getInfo(page)
		ctntUrl = self.extractUrl(page)
		if not ctntUrl:
			return []

		ret = []
		for url in ctntUrl:
			item = {'sourceUrl': url}
			item.update(info)
			ret.append(item)

		self.log.info("Found %s accessible items on page!", len(ret))
		for item in ret:
			self.log.info(" Item: '%s'", item)
		return ret

	def getFeed(self):
		soup = self.wg.getSoup(self.urlFeed)
		if not soup:
			return []

		mainDiv = soup.find("div", id="main_content")
		lis = mainDiv.find_all("li", class_='group-item')

		ret = []
		for listItem in lis:
			itemUrl = urllib.parse.urljoin(self.urlBase, listItem.a['href'])
			for item in self.parseItem(itemUrl):
				ret.append(item)

		return ret

	def go(self):
		self.resetStuckItems()
		dat = self.getFeed()
		self.processLinksIntoDB(dat)

class IMSTriggerLoader(ScrapePlugins.M.IrcGrabber.IrcQueueBase.IrcQueueBase):

	loggerPath = "Main.Manga.IMS.Fl"
	pluginName = "IMangaScans Link Retreiver"
	tableKey = "irc-irh"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"

	feedUrl = "https://imangascans.org/icebox/"

	extractRe = re.compile(r"packlist\.packs\[\d+\] = ({.*?});")

	# def getItemFromLine(self, line):
	# 	match = self.extractRe.search(line)
	# 	if not match:
	# 		raise ValueError("No data found in line %s" % line)
	# 	data = match.group(1)
	# 	data = data.replace(":", ": ")
	# 	data = yaml.safe_load(data)
	# 	print("Data", data)
	# 	pass

	def getMainItems(self, rangeOverride=None, rangeOffset=None):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info("Loading iMangaScans Main Feed")

		ret = []

		url = self.feedUrl
		page = self.wg.getpage(url)
		page = page.strip()

		matches = self.extractRe.findall(page)

		yamlData = "[%s]" % (", ".join(matches))

		# we need to massage the markup a bit to make it parseable by PyYAML.
		# Basically, the raw data looks like:
		# {b:"Suzume", n:2180, s:7, f:"Chinatsu_no_Uta_ch23_[VISCANS].rar"};
		# but {nnn}:{nnn} is not valid, YAML requires a space after the ":"
		# Therefore, we just replace ":" with ": "
		yamlData = yamlData.replace(":", ": ")

		self.log.info("Doing YAML data load")
		data = yaml.load(yamlData, Loader=yaml.CLoader)

		ims_botname = "[ims]icebox"		# Hardcoded. Bad idea?

		for item in data:
			item["server"] = "irchighway"
			item["channel"] = "imangascans"

			# rename a few keys that are rather confusing
			item["size"] = item.pop("size")
			item["pkgNum"] = item.pop("number")
			item["botName"] = ims_botname
			item["fName"] = item.pop("name")

			# I'm using the filename+botname for the unique key to the database.
			itemKey = item["fName"] + item["botName"]

			item = json.dumps(item)
			ret.append((itemKey, item))

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("All data loaded")
		return ret

	def go(self):
		self._resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getMainItems()
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

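# --- Standalone sketch (not part of the plugin): the packlist massaging described in the
# comments of getMainItems() above. The JS object literals lack a space after ":", which PyYAML
# requires, so ":" is padded before parsing. The sample line is modelled on the format shown in
# the comment, not real packlist data, and safe_load is used here for the sketch.
import yaml

raw = 'packlist.packs[0] = {b:"Suzume", n:2180, s:7, f:"Chinatsu_no_Uta_ch23_[VISCANS].rar"};'
payload = raw.split("= ", 1)[1].rstrip(";")
data = yaml.safe_load(payload.replace(":", ": "))
print(data["f"], data["n"])
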
class FeedLoader(ScrapePlugins.LoaderBase.LoaderBase):

	loggerPath = "Main.Manga.Kw.Fl"
	pluginName = "Kawaii-Scans Link Retreiver"
	tableKey = "kw"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "http://kawaii.ca/"
	feedUrl = "http://kawaii.ca/reader/"

	def getItemPages(self, url, title):
		# print("Should get item for ", url)
		soup = self.wg.getSoup(url)

		ret = []

		pager = soup.find("div", class_="pager")
		spans = pager.find_all('span')
		if len(spans) != 3:
			self.log.error("Invalid span items! Page: '%s'", url)
			return ret

		dummy_series, chapter, dummy_page = spans

		# First string in the tag should be "Chapter".
		assert 'Chapter' in list(chapter.stripped_strings)[0]

		for option in chapter.find_all("option"):
			item = {}

			chapUrl = '{series}/{chapter}'.format(series=url, chapter=option['value'])
			chapTitle = option.get_text()

			item["originName"] = "{series} - {file}".format(series=title, file=chapTitle)
			item["sourceUrl"] = chapUrl
			item["seriesName"] = title
			# There is no upload date information
			item["retreivalTime"] = time.time()

			ret.append(item)

		return ret

	def getSeriesUrls(self):
		ret = []

		print("wat?")
		soup = self.wg.getSoup(self.feedUrl)
		div = soup.find("div", class_="pager")
		for option in div.find_all('option'):
			if option['value'] == '0':
				continue
			url = 'http://kawaii.ca/reader/{manga}'.format(manga=option['value'])
			ret.append((url, option.get_text()))

		return ret

	def getFeed(self):
		self.log.info("Loading Mc Items")

		ret = []

		seriesPages = self.getSeriesUrls()

		for url, title in seriesPages:
			itemList = self.getItemPages(url, title)
			for item in itemList:
				ret.append(item)
			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s total items", len(ret))
		return ret

class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase):

	retreivalThreads = 1

	loggerPath = "Main.Manga.GoS.Cl"
	pluginName = "Game of Scanlation Scans Content Retreiver"
	tableKey = "gos"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName = "MangaItems"
	urlBase = "https://gameofscanlation.moe/"
	seriesBase = "https://gameofscanlation.moe/projects/"

	def getImage(self, imageUrl, referrer):
		content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
		if not content or not handle:
			raise ValueError("Failed to retreive image from page '%s'!" % referrer)
		fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
		fileN = bs4.UnicodeDammit(fileN).unicode_markup
		self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0)
		return fileN, content

	def getImageUrls(self, baseUrl):
		pages = set()

		soup = self.wg.getSoup(baseUrl)
		imagesDiv = soup.find('div', class_='chapterPages')
		images = imagesDiv.find_all('img', class_='avatar')

		pageno = 1
		for image in images:
			src = image['src']
			if "pagespeed" in src:
				scheme, netloc, path, query, fragment = urllib.parse.urlsplit(src)
				root, filename = os.path.split(path)
				filename = filename.split(".pagespeed.")[0]
				if filename.startswith("x"):
					filename = filename[1:]
				path = os.path.join(root, filename)
				src = urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))

			pages.add((pageno, src))
			pageno += 1

		return pages

	def getLink(self, link):
		sourceUrl = link["sourceUrl"]
		seriesName = link["seriesName"]
		chapterVol = link["originName"]

		try:
			self.log.info("Should retreive url - %s", sourceUrl)
			self.updateDbEntry(sourceUrl, dlState=1)

			imageUrls = self.getImageUrls(sourceUrl)
			if not imageUrls:
				self.log.critical("Failure on retreiving content at %s", sourceUrl)
				self.log.critical("Page not found - 404")
				self.updateDbEntry(sourceUrl, dlState=-1)
				return

			self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, chapterVol, len(imageUrls))
			dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

			if link["flags"] == None:
				link["flags"] = ""

			if newDir:
				self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
				self.conn.commit()

			chapterName = nt.makeFilenameSafe(chapterVol)

			fqFName = os.path.join(dlPath, chapterName + " [GameOfScanlation.moe].zip")

			loop = 1
			while os.path.exists(fqFName):
				fqFName, ext = os.path.splitext(fqFName)
				fqFName = "%s (%d)%s" % (fqFName, loop, ext)
				loop += 1
			self.log.info("Saving to archive = %s", fqFName)

			images = []
			for imgNum, imgUrl in imageUrls:
				imageName, imageContent = self.getImage(imgUrl, referrer=sourceUrl)
				images.append([imgNum, imageName, imageContent])

				if not runStatus.run:
					self.log.info("Breaking due to exit flag being set")
					self.updateDbEntry(sourceUrl, dlState=0)
					return

			self.log.info("Creating archive with %s images", len(images))

			if not images:
				self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404")
				return

			#Write all downloaded files to the archive.
			arch = zipfile.ZipFile(fqFName, "w")
			for imgNum, imageName, imageContent in images:
				arch.writestr("{:03} - {}".format(imgNum, imageName), imageContent)
			arch.close()

			dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True)

			self.log.info("Done")

			filePath, fileName = os.path.split(fqFName)
			self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState)
			return

		except Exception:
			self.log.critical("Failure on retreiving content at %s", sourceUrl)
			self.log.critical("Traceback = %s", traceback.format_exc())
			self.updateDbEntry(sourceUrl, dlState=-1)

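# --- Standalone sketch (not part of the plugin): the PageSpeed URL clean-up in getImageUrls()
# above. Rewritten image paths like ".../xpage01.png.pagespeed.ic.HASH.png" are reduced back to
# the original filename. The example path is fabricated.
import os.path
import urllib.parse

src = "https://cdn.example.invalid/chapters/001/xpage01.png.pagespeed.ic.AbCdEf.png"

scheme, netloc, path, query, fragment = urllib.parse.urlsplit(src)
root, filename = os.path.split(path)
filename = filename.split(".pagespeed.")[0]
if filename.startswith("x"):
	filename = filename[1:]
clean = urllib.parse.urlunsplit((scheme, netloc, os.path.join(root, filename), query, fragment))
print(clean)   # https://cdn.example.invalid/chapters/001/page01.png
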
class FeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	loggerPath = "Main.Manga.Wt.Fl"
	pluginName = "Webtoons.com Scans Link Retreiver"
	tableKey = "wt"
	dbName = settings.DATABASE_DB_NAME
	wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web")
	tableName = "MangaItems"
	urlBase = "http://www.webtoons.com/"
	seriesBase = "http://www.webtoons.com/genre"

	def closeDB(self):
		self.log.info("Closing DB...",)
		self.conn.close()
		self.log.info("done")

	def extractItemInfo(self, soup):
		ret = {}

		titleH = soup.find("h1", class_='subj')
		# print(titleH)
		titleH.div.decompose()

		# titleDiv = soup.find("h1", class_="ttl")
		ret["title"] = titleH.get_text().strip()

		return ret

	def getItemPages(self, pageUrl, historical=False):
		self.log.info("Should get item for '%s'", pageUrl)

		urlFormat = '%s&page={num}' % pageUrl

		pageNo = 1
		ret = []
		while 1:
			soup = self.wg.getSoup(urlFormat.format(num=pageNo))
			baseInfo = self.extractItemInfo(soup)

			listDiv = soup.find_all("div", class_="detail_lst")
			if len(listDiv) != 1:
				raise ValueError("Found incorrect number of detail list div items! %s" % len(listDiv))
			listDiv = listDiv[0]

			hadNew = False
			for listItem in listDiv.find_all("li"):
				if not listItem.a:
					continue

				chapSpan = listItem.find("span", class_='subj')
				if chapSpan.em:
					chapSpan.em.decompose()
				chapTitle = chapSpan.get_text().strip()

				# Fix stupid chapter naming
				chapTitle = chapTitle.replace("Ep. ", "c")

				dateSpan = listItem.find("span", class_='date')
				date = dateutil.parser.parse(dateSpan.get_text().strip(), fuzzy=True)

				item = {}

				url = listItem.a["href"]
				url = urllib.parse.urljoin(self.urlBase, url)

				item["originName"] = "{series} - {file}".format(series=baseInfo["title"], file=chapTitle)
				item["sourceUrl"] = url
				item["seriesName"] = baseInfo["title"]
				item["retreivalTime"] = time.mktime(date.timetuple())

				if not item in ret:
					hadNew = True
					ret.append(item)

			if not historical:
				break
			if not hadNew:
				break

			pageNo += 1

		self.log.info("Found %s chapters for series '%s'", len(ret), baseInfo["title"])

		return ret

	def getSeriesUrls(self):
		ret = set()

		soup = self.wg.getSoup(self.seriesBase)

		lists = soup.find_all("ul", class_='card_lst')
		for subList in lists:
			for series in subList.find_all("li"):
				url = urllib.parse.urljoin(self.urlBase, series.a['href'])
				ret.add(url)

		# if td.a:
		# 	link = td.a["href"]
		# 	if self.urlBase in link:
		# 		ret.append(link)

		self.log.info("Found %s series", len(ret))

		return ret

	def getAllItems(self, historical=False):
		# for item in items:
		# 	self.log.info( item)

		# self.log.info( "Loading Red Hawk Items")

		ret = []

		seriesPages = self.getSeriesUrls()

		for item in seriesPages:
			itemList = self.getItemPages(item, historical=historical)
			for item in itemList:
				ret.append(item)
			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s total items", len(ret))
		return ret

	def go(self, historical=False):
		self.resetStuckItems()
		self.log.info("Getting feed items")
		feedItems = self.getAllItems(historical=historical)
		self.log.info("Processing feed Items")
		self.processLinksIntoDB(feedItems)
		self.log.info("Complete")

class HBrowseDbLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase): dbName = settings.DATABASE_DB_NAME loggerPath = "Main.Manga.HBrowse.Fl" pluginName = "H-Browse Link Retreiver" tableKey = "hb" urlBase = "http://www.hbrowse.com/" urlFeed = "http://www.hbrowse.com/list" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") tableName = "HentaiItems" def loadFeed(self, pageOverride=None): self.log.info("Retreiving feed content...", ) if not pageOverride: pageOverride = 1 try: # I really don't get the logic behind HBrowse's path scheme. urlPath = '/list/{num}'.format(num=pageOverride) pageUrl = urllib.parse.urljoin(self.urlBase, urlPath) page = self.wg.getpage(pageUrl) except urllib.error.URLError: self.log.critical("Could not get page from HBrowse!") self.log.critical(traceback.format_exc()) return "" return page def parseItem(self, row, timestamp): ret = {} ret['retreivalTime'] = timestamp ret['sourceUrl'] = urllib.parse.urljoin(self.urlBase, row.a["href"]) titleTd = row.find("td", class_='recentTitle') ret['originName'] = titleTd.get_text() return ret def extractDate(self, row): text = row.get_text() date = parser.parse(text) timestamp = time.mktime(date.timetuple()) return timestamp def getFeed(self, pageOverride=None): # for item in items: # self.log.info(item) # page = self.loadFeed(pageOverride) soup = bs4.BeautifulSoup(page) itemTable = soup.find("table", id="recentTable") rows = itemTable.find_all("tr") ret = [] for row in rows: if row.find("td", class_='recentDate'): curTimestamp = self.extractDate(row) elif row.find("td", class_='recentTitle'): # curTimestamp is specifically not pre-defined, because I want to fail noisily if I try # to parse a link row before seeing a valid date item = self.parseItem(row, curTimestamp) ret.append(item) return ret def go(self): self.resetStuckItems() dat = self.getFeed() self.processLinksIntoDB(dat)
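# A note on timestamp handling: the HBrowse loader above converts parsed dates with
# time.mktime(date.timetuple()), which interprets the value in the *local* timezone, while some
# of the other loaders in this collection use calendar.timegm(pdate.utctimetuple()), which treats
# it as UTC. Minimal standalone illustration of the difference (the example date is arbitrary):
import calendar
import time
import dateutil.parser

parsed = dateutil.parser.parse("2015-03-01 12:00")
local_stamp = time.mktime(parsed.timetuple())         # epoch seconds, local-time interpretation
utc_stamp   = calendar.timegm(parsed.utctimetuple())  # epoch seconds, UTC interpretation
# The two differ by the local UTC offset, so mixing them skews 'retreivalTime' ordering across plugins.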
class ContentLoader(ScrapePlugins.RetreivalBase.RetreivalBase): loggerPath = "Main.Manga.Ms.Cl" pluginName = "MangaStream.com Content Retreiver" tableKey = "ms" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web") retreivalThreads = 1 def getImage(self, imageUrl, referrer): if imageUrl.startswith("//"): imageUrl = "http:" + imageUrl content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer}) if not content or not handle: raise ValueError("Failed to retreive image from page '%s'!" % referrer) fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1]) fileN = bs4.UnicodeDammit(fileN).unicode_markup self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0) return fileN, content def getImageUrls(self, baseUrl): pages = set() nextUrl = baseUrl chapBase = baseUrl.rstrip('0123456789.') imnum = 1 while 1: soup = self.wg.getSoup(nextUrl) imageDiv = soup.find('div', class_='page') if not imageDiv.a: raise ValueError("Could not find imageDiv?") pages.add((imnum, imageDiv.img['src'], nextUrl)) nextUrl = imageDiv.a['href'] if not chapBase in nextUrl: break imnum += 1 self.log.info("Found %s pages", len(pages)) return pages def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link["seriesName"] chapterVol = link["originName"] try: self.log.info( "Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) imageUrls = self.getImageUrls(sourceUrl) if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, chapterVol, len(imageUrls)) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) chapterName = nt.makeFilenameSafe(chapterVol) fqFName = os.path.join(dlPath, chapterName+" [MangaStream.com].zip") loop = 1 while os.path.exists(fqFName): fqFName, ext = os.path.splitext(fqFName) fqFName = "%s (%d)%s" % (fqFName, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] for imgNum, imgUrl, referrerUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, referrerUrl) images.append([imgNum, imageName, imageContent]) if not runStatus.run: self.log.info( "Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imgNum, imageName, imageContent in images: arch.writestr("{:03} - {}".format(imgNum, imageName), imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, rowId=link['dbId']) self.log.info( "Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1)
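# A note on getImageUrls() in the MangaStream loader above: it follows the "next page" link on
# each reader page and stops once that link no longer shares the chapter's URL prefix. The prefix
# is derived by stripping the trailing page number (digits/dots) off the first page's URL.
# Minimal sketch with a hypothetical URL layout (the real path structure may differ):
baseUrl  = "http://mangastream.example/r/some_series/010/1234/1"
chapBase = baseUrl.rstrip('0123456789.')   # -> "http://mangastream.example/r/some_series/010/1234/"
nextUrl  = "http://mangastream.example/r/some_series/010/1234/2"
print(chapBase in nextUrl)                 # True  -> still inside this chapter, keep walking
nextUrl  = "http://mangastream.example/r/some_series/011/1301/1"
print(chapBase in nextUrl)                 # False -> crossed into the next chapter, stop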
class TriggerLoader(ScrapePlugins.M.IrcGrabber.IrcQueueBase.IrcQueueBase):

	loggerPath = "Main.Manga.Txt.Fl"
	pluginName = "Text-Packlist Link Retreiver"
	tableKey   = "irc-irh"
	dbName     = settings.DATABASE_DB_NAME
	wg         = webFunctions.WebGetRobust(logPath=loggerPath+".Web")
	tableName  = "MangaItems"

	# format is ({packlist}, {channel}, {botname})
	baseUrls = [
		("http://fth-scans.com/xdcc.txt", 'halibut', '`FTH`')
	]

	def extractRow(self, row, channel, botName):
		skipFtypes = ['.mkv', '.mp4', '.avi', '.wmv']

		item = {}
		item["server"]  = "irchighway"
		item["channel"] = channel

		packno, size, filename = row
		item["pkgNum"] = packno.strip("#").strip()
		item["fName"]  = filename.strip()
		item["size"]   = size.strip()

		nameRe = re.compile("/msg (.+?) xdcc")  # currently unused; the bot name comes from baseUrls instead

		item["botName"] = botName

		# Some of these bots have videos and shit. Skip that.
		for skipType in skipFtypes:
			if item["fName"].endswith(skipType):
				return False
		return item

	def getBot(self, chanSet):
		ret = []

		botPageUrl, channel, botName = chanSet
		# print("fetching page", botPageUrl)
		page = self.wg.getpage(botPageUrl)

		rowRe = re.compile(r'^#(\d+)\W+\d*x\W+\[\W*([\d\.]+)M\]\W+?(.*)$', flags=re.MULTILINE)

		matches = rowRe.findall(page)
		for match in matches:
			item = self.extractRow(match, channel, botName)
			if not item:
				# extractRow() returns False for skipped filetypes (videos, etc.), so don't index it.
				continue
			itemKey = item["fName"]+item["botName"]
			item = json.dumps(item)
			ret.append((itemKey, item))

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				break

		self.log.info("Found %s items for bot", len(ret))
		return ret

	def getMainItems(self):
		self.log.info("Loading Text-Pack Feeds")

		ret = []
		for chanSet in self.baseUrls:
			ret += self.getBot(chanSet)

		self.log.info("All data loaded")
		return ret
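# The rowRe expression in getBot() above expects packlist lines roughly of the form
# "#<pack number> <download count>x [<size>M] <filename>". Quick illustration against a
# made-up line (only the regex comes from the plugin; the sample content is hypothetical):
import re

rowRe = re.compile(r'^#(\d+)\W+\d*x\W+\[\W*([\d\.]+)M\]\W+?(.*)$', flags=re.MULTILINE)
sample = "#12  34x [ 42.7M] Some_Series_c001 [FTH].zip"
packno, size, filename = rowRe.findall(sample)[0]
# packno -> '12', size -> '42.7', filename -> 'Some_Series_c001 [FTH].zip'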
class MjContentLoader(ScrapePlugins.RetreivalBase.ScraperBase): loggerPath = "Main.Manga.Mj.Cl" pluginName = "MangaJoy Content Retreiver" tableKey = "mj" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") urlBase = "http://mangajoy.com/" retreivalThreads = 6 itemLimit = 500 # Mangajoy does recompress. Arrrgh. PHASH_THRESH = 6 def checkDelay(self, inTime): return inTime < (time.time() - 60 * 30) def getImage(self, imageUrl, referrer): content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer}) if not content or not handle: raise ValueError("Failed to retreive image from page '%s'!" % referrer) fileN = urllib.parse.unquote( urllib.parse.urlparse(handle.geturl())[2].split("/")[-1]) fileN = bs4.UnicodeDammit(fileN).unicode_markup self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0) return fileN, content def getImgUrlFromPage(self, pageSoup): imgDiv = pageSoup.find("div", class_='prw') images = imgDiv.find_all("img") images = [ image for image in images if ("manga-joy" in image['src'] or "mangajoy" in image['src']) ] for image in images: print(image) if len(images) != 1: for image in images: print("Image", image) raise ValueError("Too many images found on page!") imgUrl = images[0]["src"] return imgUrl def getImageUrls(self, firstPageUrl): pageCtnt = self.wg.getpage(firstPageUrl) soup = bs4.BeautifulSoup(pageCtnt, "lxml") if 'alt="File not found"' in pageCtnt: return [] imgUrl = self.getImgUrlFromPage(soup) selectDiv = soup.find("div", class_="wpm_nav_rdr") selector = selectDiv.find("select", style="width:45px") imUrls = set([imgUrl]) if selector: pages = selector.find_all("option") # Because people are insane, sometimes a single manga has both png and jpeg files. # Since this means that we cannot just sequentially guess the image # URLs from the number of pages, we have to actually walk every image page in the manga # to get all the proper image URLs. 
scanPages = [ "{base}{cnt}/".format(base=firstPageUrl, cnt=page["value"]) for page in pages ] for page in scanPages: pageCtnt = self.wg.getpage(page) soup = bs4.BeautifulSoup(pageCtnt, "lxml") imUrls.add(self.getImgUrlFromPage(soup)) self.log.info("Item has %s pages.", len(imUrls)) return imUrls raise ValueError("Unable to find contained images on page '%s'" % firstPageUrl) def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link['seriesName'] chapterNameRaw = link['originName'] try: self.log.info("Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) imageUrls = self.getImageUrls(sourceUrl) if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s'", seriesName, chapterNameRaw) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() chapterName = nt.makeFilenameSafe(chapterNameRaw) fqFName = os.path.join(dlPath, chapterName + "[MangaJoy].zip") loop = 1 while os.path.exists(fqFName): fqFName, ext = os.path.splitext(fqFName) fqFName = "%s (%d)%s" % (fqFName, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] for imgUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, sourceUrl) images.append([imageName, imageContent]) if not runStatus.run: self.log.info("Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterNameRaw, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload( seriesName, fqFName, deleteDups=True, includePHash=True, phashThresh=self.PHASH_THRESH) self.log.info("Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterNameRaw, tags=dedupState) return except psycopg2.OperationalError: self.log.info("Database issue?") raise except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1) def go(self): todo = self.retreiveTodoLinksFromDB() if not runStatus.run: return self.processTodoLinks(todo)
class DbLoader(ScrapePlugins.LoaderBase.LoaderBase): dbName = settings.DATABASE_DB_NAME loggerPath = "Main.Manga.DoujinOnline.Fl" pluginName = "DoujinOnline Link Retreiver" tableKey = "dol" urlBase = "https://doujinshi.online/" wg = webFunctions.WebGetRobust(logPath=loggerPath+".Web") tableName = "HentaiItems" def loadFeed(self, pageOverride=None): self.log.info("Retreiving feed content...",) if not pageOverride: pageOverride = 1 urlPath = '/page/{num}/'.format(num=pageOverride) sourceUrl = urllib.parse.urljoin(self.urlBase, urlPath) page = self.wg.getSoup(sourceUrl) return page def parseLinkDiv(self, linkdiv): dated = linkdiv.find("div", class_="dou-date") titled = linkdiv.find("div", class_="dou-title") langd = linkdiv.find("div", class_="lang-icon") if not all([langd, titled, dated]): return if not langd.img: return if not langd.img['src'].endswith("en.png"): return ret = {} ret["originName"] = titled.get_text().strip() ret["sourceUrl"] = urllib.parse.urljoin(self.urlBase, titled.a["href"]) pdate = parser.parse(dated.get_text()) ret["retreivalTime"] = calendar.timegm(pdate.utctimetuple()) # print("ret = ", ret) # print(pdate, dated.get_text()) # return return ret def getFeed(self, pageOverride=[None]): # for item in items: # self.log.info(item) # # self.wg.stepThroughCloudFlare("https://DoujinOnline.la/", titleContains="DoujinOnline.la") ret = [] for x in pageOverride: soup = self.loadFeed(x) doujinLink = soup.find_all("div", class_="dou-list") for linkLi in doujinLink: tmp = self.parseLinkDiv(linkLi) if tmp: ret.append(tmp) return ret
class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase): dbName = settings.DATABASE_DB_NAME loggerPath = "Main.Manga.CrunchyRoll.Cl" pluginName = "CrunchyRoll Content Retreiver" tableKey = "cr" urlBase = "http://www.crunchyroll.com/" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") tableName = "MangaItems" retreivalThreads = 1 def getChapterId(self, apiServer, seriesId, wantChapNum): # >>> urllib.parse.urlsplit('http://api-manga.crunchyroll.com/list_chapters?series%5Fid=181&user%5Fid=null') # SplitResult(scheme='http', netloc='api-manga.crunchyroll.com', path='/list_chapters', query='series%5Fid=181&user%5Fid=null', fragment='') query = {"series_id": seriesId, "user_id": 'null'} query = urllib.parse.urlencode(query) # Crunchyroll seems to be (unnecessarily) urlescaping the underscores in the query parameters of # their AJAX request urls. Mimic that behaviour query = query.replace("_", '%5F') params = ("http", apiServer, '/list_chapters', '', query, '') url = urllib.parse.urlunparse(params) seriesInfo = self.wg.getpage(url) if not seriesInfo: return [] seriesInfo = json.loads(seriesInfo) ret = None for chapter in seriesInfo['chapters']: if chapter['viewable']: if not "locale" in chapter: continue if not 'enUS' in chapter["locale"]: continue if not 'name' in chapter["locale"]['enUS']: continue if chapter['number'] == wantChapNum: ret = (chapter['chapter_id'], chapter["locale"]['enUS']['name']) return ret def getChapterData(self, apiServer, chapterId, sessionId): # http://api-manga.crunchyroll.com/list_chapter?chapter%5Fid=6507&auth=null&session%5Fid=4q5akot51gbglzior4wxdjdqbxzhkwgd # >>> urllib.parse.urlsplit('http://api-manga.crunchyroll.com/list_chapters?series%5Fid=181&user%5Fid=null') # SplitResult(scheme='http', netloc='api-manga.crunchyroll.com', path='/list_chapters', query='series%5Fid=181&user%5Fid=null', fragment='') query = { "chapter_id": chapterId, "session_id": sessionId, "user_id": 'null', "auth": 'null' } query = urllib.parse.urlencode(query) # Crunchyroll seems to be (unnecessarily) urlescaping the underscores in the query parameters of # their AJAX request urls. Mimic that behaviour query = query.replace("_", '%5F') params = ("http", apiServer, '/list_chapter', '', query, '') url = urllib.parse.urlunparse(params) chapterInfo = self.wg.getpage(url) if not chapterInfo: return [] chapterInfo = json.loads(chapterInfo) imageUrls = [] # so there is a field in the json data named 'page_number'. However, # it seems to be almost always set to 0. Yeeeeeah..... # Theres a lot of other shit in the JSON as well. There are # cleaned pages (no typsetting), polygon extents (for client-side typesetting?) # etc... pageno = 1 for page in chapterInfo['pages']: url = page['locale']['enUS']['encrypted_composed_image_url'] if url == None or url == 'null': raise ValueError("Item has null URLs?") imageUrls.append((pageno, url)) pageno += 1 return imageUrls def fetchImageUrls(self, soup): flashConf = soup.find('param', attrs={'name': 'flashvars'}) if not flashConf: return False conf = dict(urllib.parse.parse_qsl(flashConf['value'])) apiServer = conf['server'] chapInfo = self.getChapterId(apiServer, conf['seriesId'], conf['chapterNumber']) if not chapInfo: return False chapterId, chapterName = chapInfo chapImages = self.getChapterData(apiServer, chapterId, conf['session_id']) ret = [] for imNum, url in chapImages: # AFICT, they /only/ use jpeg. # Realistically, I don't care (all internal stuff autodetects), # but it'd be nice to have the correct extensions. Assume jpeg for the moment. 
fname = 'img {num:05d}.jpeg'.format(num=imNum) ret.append((fname, url)) return ret, chapterName, conf['chapterNumber'] def getDownloadInfo(self, linkDict, retag=False): sourcePage = linkDict["sourceUrl"] self.log.info("Retreiving item: %s", sourcePage) try: soup = self.wg.getSoup(sourcePage, addlHeaders={'Referer': self.urlBase}) except: self.log.critical("No download at url %s! SourceUrl = %s", sourcePage, linkDict["sourceUrl"]) raise IOError("Invalid webpage") dlPath, newDir = self.locateOrCreateDirectoryForSeries( linkDict['seriesName']) linkDict['dirPath'] = dlPath if newDir: if not linkDict["flags"]: linkDict["flags"] = '' self.updateDbEntry(sourcePage, flags=" ".join([linkDict["flags"], "haddir"])) self.conn.commit() if not os.path.exists(linkDict["dirPath"]): os.makedirs(linkDict["dirPath"]) else: self.log.info("Folder Path already exists?: %s", linkDict["dirPath"]) self.log.info("Folderpath: %s", linkDict["dirPath"]) #self.log.info(os.path.join()) urls = self.fetchImageUrls(soup) if not urls: return False imageUrls, linkDict["originName"], linkDict["chapterNo"] = urls linkDict["dlLinks"] = imageUrls self.log.info("Found %s images in manga.", len(imageUrls)) self.log.debug("Linkdict = ") for key, value in list(linkDict.items()): self.log.debug(" %s - %s", key, value) return linkDict def getImage(self, imageUrl): # the image URL format seems to be '{hash of some sort}_{creation timestamp}_main' # I checked a few common hash algos, the hash is not a pre/post decryption md5, nor sha1 content = self.wg.getpage( imageUrl, addlHeaders={ 'Referer': ' http://www.crunchyroll.com/swf/MangaViewer.swf?1' }) if not content: raise ValueError("Failed to retreive image from page '%s'!" % imageUrl) self.log.info("retreived file with a size of %0.3f K", len(content) / 1000.0) # "decrypt" the file. By XORing with 0x42. # Yeeeeeah. "Security" content = bytearray(content) for x in range(len(content)): content[x] = content[x] ^ 0x42 content = bytes(content) return content def fetchImages(self, linkDict): images = [] for filename, imgUrl in linkDict["dlLinks"]: images.append((filename, self.getImage(imgUrl))) return images def doDownload(self, linkDict, retag=False): images = self.fetchImages(linkDict) # images = ['wat'] # print(linkDict) # self.log.info(len(content)) if images: linkDict["chapterNo"] = float(linkDict["chapterNo"]) fileN = '{series} - c{chapNo:06.1f} - {sourceName} [crunchyroll].zip'.format( series=linkDict['seriesName'], chapNo=linkDict["chapterNo"], sourceName=linkDict['originName']) fileN = nt.makeFilenameSafe(fileN) # self.log.info("geturl with processing", fileN) wholePath = os.path.join(linkDict["dirPath"], fileN) self.log.info("Complete filepath: %s", wholePath) #Write all downloaded files to the archive. 
arch = zipfile.ZipFile(wholePath, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() self.log.info("Successfully Saved to path: %s", wholePath) if not linkDict["tags"]: linkDict["tags"] = "" dedupState = processDownload.processDownload( linkDict["seriesName"], wholePath, deleteDups=True) self.log.info("Done") if dedupState: self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState) self.updateDbEntry(linkDict["sourceUrl"], dlState=2, downloadPath=linkDict["dirPath"], fileName=fileN, originName=fileN) self.conn.commit() return wholePath else: self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED") self.conn.commit() return False def getLink(self, link): try: self.updateDbEntry(link["sourceUrl"], dlState=1) linkInfo = self.getDownloadInfo(link) if linkInfo: self.doDownload(linkInfo) else: self.updateDbEntry(link["sourceUrl"], dlState=0) except urllib.error.URLError: self.log.error("Failure retreiving content for link %s", link) self.log.error("Traceback: %s", traceback.format_exc())
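# The getImage() method in the CrunchyRoll loader above "decrypts" page images by XORing every
# byte with 0x42. Since XOR with a constant key is its own inverse, one helper both decodes and
# re-encodes a buffer. Minimal standalone sketch of that transform (not part of the class itself):
def xor_decode(data: bytes, key: int = 0x42) -> bytes:
	"""Return 'data' with every byte XORed against 'key' (0x42 for the CrunchyRoll manga images)."""
	return bytes(b ^ key for b in data)

# xor_decode(xor_decode(raw)) == raw, so applying it twice round-trips the original bytes.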
class ContentLoader(ScrapePlugins.RetreivalBase.ScraperBase): loggerPath = "Main.Manga.Mp.Cl" pluginName = "MangaPark Content Retreiver" tableKey = "mp" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") retreivalThreads = 2 def retreiveTodoLinksFromDB(self): self.log.info("Fetching items from db...", ) rows = self.getRowsByValue(dlState=0) self.log.info("Done") if not rows: return items = [] for item in rows: item["retreivalTime"] = time.gmtime(item["retreivalTime"]) items.append(item) self.log.info("Have %s new items to retreive in BtDownloader" % len(items)) items = sorted(items, key=lambda k: k["retreivalTime"], reverse=True) return items def getLinkFile(self, fileUrl): pgctnt, pghandle = self.wg.getpage( fileUrl, returnMultiple=True, addlHeaders={'Referer': "http://manga.cxcscans.com/directory/"}) pageUrl = pghandle.geturl() hName = urllib.parse.urlparse(pageUrl)[2].split("/")[-1] self.log.info( "HName: %s", hName, ) self.log.info("Size = %s", len(pgctnt)) return pgctnt, hName def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link["seriesName"] originFileName = link["originName"] self.updateDbEntry(sourceUrl, dlState=1) self.log.info("Downloading = '%s', '%s'", seriesName, originFileName) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() try: content, headerName = self.getLinkFile(sourceUrl) except: self.log.error("Unrecoverable error retreiving content %s", link) self.log.error("Traceback: %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1) return headerName = urllib.parse.unquote(headerName) fName = "%s - %s" % (originFileName, headerName) fName = nt.makeFilenameSafe(fName) fName, ext = os.path.splitext(fName) fName = "%s [CXC Scans]%s" % (fName, ext) fqFName = os.path.join(dlPath, fName) self.log.info("SaveName = %s", fqFName) loop = 1 while os.path.exists(fqFName): fName, ext = os.path.splitext(fName) fName = "%s (%d)%s" % (fName, loop, ext) fqFName = os.path.join(link["targetDir"], fName) loop += 1 self.log.info("Writing file") filePath, fileName = os.path.split(fqFName) try: with open(fqFName, "wb") as fp: fp.write(content) except TypeError: self.log.error("Failure trying to retreive content from source %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName) return #self.log.info( filePath) dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True) self.log.info("Done") self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState) return def go(self): todo = self.retreiveTodoLinksFromDB() if not runStatus.run: return self.processTodoLinks(todo)
class ContentLoader(ScrapePlugins.RetreivalBase.RetreivalBase): loggerPath = "Main.Manga.Ki.Cl" pluginName = "Kiss Manga Content Retreiver" tableKey = "ki" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") retreivalThreads = 3 itemLimit = 200 def check_recaptcha(self, pgurl, soup=None, markup=None): if markup: soup = webFunctions.as_soup(markup) if not soup: raise RuntimeError( "You have to pass either the raw page markup, or a pre-parsed bs4 soup object!" ) capdiv = soup.find("div", class_='g-recaptcha') if not capdiv: if markup: return markup return soup raise ScrapeExceptions.LimitedException( "Encountered ReCaptcha! Cannot circumvent!") self.log.warning("Found ReCaptcha div. Need to circumvent.") sitekey = capdiv['data-sitekey'] # soup.find("") params = { 'key': settings.captcha_solvers['2captcha']['api_key'], 'method': 'userrecaptcha', 'googlekey': sitekey, 'pageurl': pgurl, 'json': 1, } # self.wg.getJson("https://2captcha.com/in.php", postData=params) # # here we post site key to 2captcha to get captcha ID (and we parse it here too) # captcha_id = s.post("?key={}&method=userrecaptcha&googlekey={}&pageurl={}".format(API_KEY, site_key, url), proxies=proxy).text.split('|')[1] # # then we parse gresponse from 2captcha response # recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id), proxies=proxy).text # print("solving ref captcha...") # while 'CAPCHA_NOT_READY' in recaptcha_answer: # sleep(5) # recaptcha_answer = s.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id), proxies=proxy).text # recaptcha_answer = recaptcha_answer.split('|')[1] # # we make the payload for the post data here, use something like mitmproxy or fiddler to see what is needed # payload = { # 'key': 'value', # 'gresponse': recaptcha_answer # This is the response from 2captcha, which is needed for the post request to go through. # } resolved = { "reUrl": "/Manga/Love-Lab-MIYAHARA-Ruri/Vol-010-Ch-001?id=359632", "g-recaptcha-response": "03AOP2lf5kLccgf5aAkMmzXR8mN6Kv6s76BoqHIv-raSzGCa98HMPMdx0n04ourhM1mBApnesMRbzr2vFa0264mY83SCkL5slCFcC-i3uWJoHIjVhGh0GN4yyswg5-yZpDg1iK882nPuxEeaxb18pOK790x4Z18ib5UOPGU-NoECVb6LS03S3b4fCjWwRDLNF43WhkHDFd7k-Os7ULCgOZe_7kcF9xbKkovCh2uuK0ytD7rhiKnZUUvl1TimGsSaFkSSrQ1C4cxZchVXrz7kIx0r6Qp2hPr2_PW0CAutCkmr9lt9TS5n0ecdVFhdVQBniSB-NZv9QEpbQ8", } # # then send the post request to the url # response = s.post(url, payload, proxies=proxy) def getImage(self, imageUrl, referrer): content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer}) if not content or not handle: raise ValueError("Failed to retreive image from page '%s'!" % referrer) fileN = urllib.parse.unquote( urllib.parse.urlparse(handle.geturl())[2].split("/")[-1]) fileN = bs4.UnicodeDammit(fileN).unicode_markup self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0) if not "." in fileN: info = handle.info() if 'Content-Type' in info: tp = info['Content-Type'] if ";" in tp: tp = tp.split(";")[0] ext = guess_extension(tp) if ext == None: ext = "unknown_ftype" print(info['Content-Type'], ext) fileN += "." + ext else: fileN += ".jpg" # Let magic figure out the files for us (it's probably smarter then kissmanga, anyways.) 
		guessed = magic.from_buffer(content, mime=True)
		ext = guess_extension(guessed)  # was guess_extension(tp); 'tp' is only bound in the Content-Type branch above
		if ext:
			fileN = fileN + ext

		return fileN, content

	def getImageUrls(self, baseUrl):
		pgctnt, filename, mimetype = self.wg.getItemPhantomJS(baseUrl)
		pgctnt = self.check_recaptcha(pgurl=baseUrl, markup=pgctnt)

		linkRe = re.compile(r'lstImages\.push\((wrapKA\(".+?"\))\);')
		links = linkRe.findall(pgctnt)

		pages = []
		for item in links:
			tgt = self.wg.pjs_driver.execute_script("return %s" % item)
			if not tgt.startswith("http"):
				raise ScrapeExceptions.LimitedException("URL Decryption failed!")
			pages.append(tgt)

		self.log.info("Found %s pages", len(pages))

		return pages

	# Don't download items for 12 hours after release,
	# so that other, (better) sources can potentially host
	# the items first.
	def checkDelay(self, inTime):
		return inTime < (time.time() - 60 * 60 * 12)

	def getLink(self, link):
		sourceUrl  = link["sourceUrl"]
		print("Link", link)
		seriesName = link['seriesName']

		try:
			self.log.info("Should retreive url - %s", sourceUrl)
			self.updateDbEntry(sourceUrl, dlState=1)

			imageUrls = self.getImageUrls(sourceUrl)
			if not imageUrls:
				self.log.critical("Failure on retreiving content at %s", sourceUrl)
				self.log.critical("Page not found - 404")
				self.updateDbEntry(sourceUrl, dlState=-1)
				return

			self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, link["originName"], len(imageUrls))

			dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

			if link["flags"] == None:
				link["flags"] = ""

			if newDir:
				self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))

			chapterName = nt.makeFilenameSafe(link["originName"])

			fqFName = os.path.join(dlPath, chapterName + " [KissManga].zip")

			loop = 1
			prefix, ext = os.path.splitext(fqFName)
			while os.path.exists(fqFName):
				fqFName = "%s (%d)%s" % (prefix, loop, ext)
				loop += 1
			self.log.info("Saving to archive = %s", fqFName)

			images = []
			imgCnt = 1
			for imgUrl in imageUrls:
				imageName, imageContent = self.getImage(imgUrl, sourceUrl)
				imageName = "{num:03.0f} - {srcName}".format(num=imgCnt, srcName=imageName)
				imgCnt += 1
				images.append([imageName, imageContent])

				if not runStatus.run:
					self.log.info("Breaking due to exit flag being set")
					self.updateDbEntry(sourceUrl, dlState=0)
					return

			self.log.info("Creating archive with %s images", len(images))

			if not images:
				self.updateDbEntry(sourceUrl, dlState=-1, tags="error-404")
				return

			# Write all downloaded files to the archive.
			arch = zipfile.ZipFile(fqFName, "w")
			for imageName, imageContent in images:
				arch.writestr(imageName, imageContent)
			arch.close()

			dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True, rowId=link['dbId'])

			self.log.info("Done")

			filePath, fileName = os.path.split(fqFName)
			self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
			return

		except SystemExit:
			print("SystemExit!")
			raise

		except Exception:
			self.log.critical("Failure on retreiving content at %s", sourceUrl)
			self.log.critical("Traceback = %s", traceback.format_exc())
			self.updateDbEntry(sourceUrl, dlState=-1)

	def setup(self):
		'''
		poke through cloudflare
		'''
		if not self.wg.stepThroughCloudFlare("http://kissmanga.com", 'KissManga'):
			raise ValueError("Could not access site due to cloudflare protection.")
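# check_recaptcha() above contains a disabled, commented-out sketch of pushing the ReCaptcha
# sitekey through 2captcha's "userrecaptcha" endpoints before it gives up and raises
# LimitedException. A rough, untested outline of that flow, reconstructed from the commented
# block (the use of the requests library, timing, and error handling are assumptions, not
# project code):
import time
import requests

def solve_recaptcha_via_2captcha(api_key, site_key, page_url):
	# Submit the sitekey; the legacy API answers "OK|<captcha id>" on success.
	resp = requests.post("http://2captcha.com/in.php", data={
		'key': api_key,
		'method': 'userrecaptcha',
		'googlekey': site_key,
		'pageurl': page_url,
	})
	captcha_id = resp.text.split('|')[1]

	# Poll until a worker has produced a g-recaptcha-response token ("OK|<token>").
	while True:
		time.sleep(5)
		answer = requests.get("http://2captcha.com/res.php",
		                      params={'key': api_key, 'action': 'get', 'id': captcha_id})
		if 'CAPCHA_NOT_READY' not in answer.text:
			return answer.text.split('|')[1]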
class BuWatchMonitor(TextScrape.NovelMixin.NovelMixin, ScrapePlugins.MonitorDbBase.MonitorDbBase): loggerPath = "Main.Manga.Bu.Watcher" pluginName = "BakaUpdates List Monitor" tableName = "MangaSeries" nameMapTableName = "muNameList" changedTableName = "muItemChanged" itemReleases = "muReleases" baseURL = "http://www.mangaupdates.com/" baseListURL = r"http://www.mangaupdates.com/mylist.html" baseReleasesURL = r"https://www.mangaupdates.com/releases.html" dbName = settings.DATABASE_DB_NAME wgH = webFunctions.WebGetRobust(logPath=loggerPath+".Web") # ----------------------------------------------------------------------------------- # Login Management tools # ----------------------------------------------------------------------------------- def checkLogin(self): checkPage = self.wgH.getpage(self.baseListURL) if "You must be a user to access this page." in checkPage: self.log.info("Whoops, need to get Login cookie") else: self.log.info("Still logged in") return logondict = {"username" : settings.buSettings["login"], "password" : settings.buSettings["passWd"], "act" : "login"} getPage = self.wgH.getpage(r"http://www.mangaupdates.com/login.html", postData=logondict) if "No user found, or error. Try again." in getPage: self.log.error("Login failed!") raise ValueError("Cannot login to MangaUpdates. Is your login/password valid?") elif "You are currently logged in as" in getPage: self.log.info("Logged in successfully!") self.wgH.saveCookies() # ----------------------------------------------------------------------------------- # Management Stuff # ----------------------------------------------------------------------------------- def go(self): self.checkLogin() self.scanRecentlyUpdated() lists = self.getListNames() for listName, listURL in lists.items(): self.updateUserListNamed(listName, listURL) if not runStatus.run: self.log.info( "Breaking due to exit flag being set") break def getListNames(self): self.checkLogin() listDict = {} listDict["Reading"] = r"http://www.mangaupdates.com/mylist.html" # The reading list is not specifically named. pageCtnt = self.wgH.getpage(self.baseListURL) soup = bs4.BeautifulSoup(pageCtnt) add_seriesSegment = soup.find("div", id="add_series") listList = add_seriesSegment.find_previous_sibling("p", class_="text") for item in listList("a"): if "mylist.html" in item["href"] and not "act=edit" in item["href"]: # We don't want the "edit lists" list option. 
listDict[item.text] = item["href"] self.log.info("Retrieved %d lists", len(listDict)) for key, value in listDict.items(): self.log.debug("List name: %s, URL: %s", value, key) return listDict # ----------------------------------------------------------------------------------- # Series List scraping # ----------------------------------------------------------------------------------- def extractRow(self, row, listName): nameSegment = row.find("td", class_="lcol_nopri") if nameSegment: currentChapter = -1 link = nameSegment.find("a")["href"] mangaName = nameSegment.find("a").string urlParsed = urllib.parse.urlparse(link) if nameSegment.find("span"): chapInfo = nameSegment.find("span").string currentChapter = toInt(chapInfo) readSegment = row.find("td", class_=re.compile("lcol4")).find("a", title="Increment Chapter") if readSegment: readChapter = toInt(readSegment.string) elif listName == "Complete": readChapter = -2 else: readChapter = -1 # Update the novel information (if it exists) self.updateNovelAvailable(mangaName, currentChapter) self.updateNovelRead(mangaName, readChapter) seriesID = toInt(urlParsed.query) listName = listName.replace("\u00A0"," ") # self.log.debug("Item info = seriesID=%s, currentChapter=%s, readChapter=%s, mangaName=%s, listName=%s", seriesID, currentChapter, readChapter, mangaName, listName) # Try to match new item by both ID and name. haveRow = self.getRowsByValue(buId=seriesID) haveRow2 = self.getRowsByValue(buName=mangaName) if not haveRow: haveRow = haveRow2 if haveRow and haveRow2: if haveRow[0]["buId"] != haveRow2[0]["buId"]: print("WAT") print(haveRow[0]["buId"]) print(haveRow2[0]["buId"]) if haveRow: # print("HaveRow = ", haveRow) haveRow = haveRow.pop() self.updateDbEntry(haveRow["dbId"], commit=False, buName=mangaName, buList=listName, availProgress=currentChapter, readingProgress=readChapter, buId=seriesID) else: # ["mtList", "buList", "mtName", "mdId", "mtTags", "buName", "buId", "buTags", "readingProgress", "availProgress", "rating", "lastChanged"] self.insertIntoDb(commit=False, buName=mangaName, buList=listName, availProgress=currentChapter, readingProgress=readChapter, buId=seriesID, lastChanged=0, lastChecked=0, itemAdded=time.time()) return 1 return 0 def updateUserListNamed(self, listName, listURL): pageCtnt = self.wgH.getpage(listURL) soup = bs4.BeautifulSoup(pageCtnt) itemTable = soup.find("table", id="list_table") itemCount = 0 if not itemTable: self.log.warn("Could not find table?") self.log.warn("On page '%s'", listURL) return for row in itemTable.find_all("tr"): itemCount += self.extractRow(row, listName) listTotalNo = toInt(soup.find("div", class_="low_col1").text) if itemCount != listTotalNo: self.log.error("Invalid list reported length! 
Items from page: %d, found items %d", listTotalNo, itemCount) self.conn.commit() self.log.info("Properly processed all items in list!") # def scanRecentlyUpdated(self): ONE_DAY = 60*60*24 releases = self.wgH.getpage(self.baseReleasesURL) soup = bs4.BeautifulSoup(releases) content = soup.find("td", {"id": "main_content"}) titles = content.find_all("p", class_="titlesmall") for title in titles: date = title.get_text() date = dateutil.parser.parse(date, fuzzy=True) table = title.find_next_sibling("div").table for row in table.find_all("tr"): link = row.find("a", title="Series Info") # Need to skip rows with no links, (they're the table header) if link: mId = link["href"].split("=")[-1] haveRow = self.getRowByValue(buId=mId) if haveRow: checked = self.getLastCheckedFromId(mId) if checked + ONE_DAY < time.time(): self.log.info("Need to check item for id '%s'", mId) self.updateLastCheckedFromId(mId, 0) # Set last checked to zero, to force the next run to update the item # print("Checked, ", checked+ONE_DAY, "time", time.time()) # print("row", self.getRowByValue(buId=mId)) else: name = link.get_text() self.insertBareNameItems([(name, mId)]) self.log.info("New series! '%s', id '%s'", name, mId) # ----------------------------------------------------------------------------------- # General MangaUpdate mirroring tool (enqueues ALL the manga items!) # ----------------------------------------------------------------------------------- def getSeriesFromPage(self, soup): itemTable = soup.find("table", class_="series_rows_table") rows = [] for row in itemTable.find_all("tr"): if not row.find("td", class_="text"): continue tds = row.find_all("td") if len(tds) != 4: continue title, dummy_genre, dummy_year, dummy_rating = tds mId = title.a["href"].replace('https://www.mangaupdates.com/series.html?id=', '') # print("title", title.get_text(), mId) try: int(mId) rows.append((title.get_text(), mId)) except ValueError: self.log.critical("Could not extract ID? TitleTD = %s", title) return rows # TODO: Schedule this occationally def getAllManga(self): urlFormat = 'https://www.mangaupdates.com/series.html?page={page}&perpage=100' self.log.info("MU Updater scanning MangaUpdates to get all available manga.") run = 456 while run: url = urlFormat.format(page=run) run += 1 soup = self.wgH.getSoup(url) series = self.getSeriesFromPage(soup) if series: self.log.info("Inserting %s items into name DB", len(series)) self.insertBareNameItems(series) if len(series) == 0: self.log.info("No items found. At the end of the series list?") run = 0 if not runStatus.run: self.log.info( "Breaking due to exit flag being set") break self.log.info("Completed scanning all manga items.")
class FeedLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase): loggerPath = "Main.Manga.Mp.Fl" pluginName = "MangaPark Link Retreiver" tableKey = "mp" dbName = settings.DATABASE_DB_NAME wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") tableName = "MangaItems" urlBase = "http://mangapark.com/" feedUrl = "http://mangapark.com/latest" def checkMatureAgree(self, page, url): if "This series contains mature contents" in page: self.log.info("Need to step through mature agreement page.") page = self.wg.getpage(url, postData={"adult": "true"}) return page def getItemPages(self, info): url, series = info # print("Should get item for ", url) page = self.wg.getpage(url) page = self.checkMatureAgree(page, url) soup = bs4.BeautifulSoup(page, "lxml") main = soup.find("section", class_='manga') series = main.find("div", class_="hd") container = soup.find("div", class_="book-list") seriesName = series.get_text().strip() if seriesName.endswith(" Manga"): seriesName = seriesName[:-1 * len(" Manga")] segmentDivs = container.find_all("div", class_="stream", recursive=False) ret = [] for segment in segmentDivs: chaps = segment.find_all("li", id=re.compile(r"b-\d+")) for chap in chaps: dlLink = chap.find("a", class_="ch")["href"] dlTitle = chap.find("span").get_text().strip() dlTitle = dlTitle.replace( ":", " -") # Can't have colons in filenames # print("dlLink", dlLink, dlTitle) item = {} date = dateutil.parser.parse(chap.i.get_text(), fuzzy=True) item["originName"] = "{series} - {file}".format( series=seriesName, file=dlTitle) item["sourceUrl"] = dlLink item["seriesName"] = seriesName item["retreivalTime"] = calendar.timegm(date.timetuple()) ret.append(item) return ret def getSeriesUrls(self): ret = [] soup = self.wg.getSoup(self.feedUrl) content = soup.find('div', class_='ls1') divs = content.find_all("div", class_="item") for div in divs: # First a in the div is the title image url = div.a["href"] url = urllib.parse.urljoin(self.urlBase, url) text = div.a['title'] ret.append((url, text)) return ret def getAllItems(self): # for item in items: # self.log.info( item) # self.log.info("Loading Mc Items") ret = [] seriesPages = self.getSeriesUrls() for item in seriesPages: itemList = self.getItemPages(item) for item in itemList: ret.append(item) if not runStatus.run: self.log.info("Breaking due to exit flag being set") break self.log.info("Found %s total items", len(ret)) return ret def go(self): self.resetStuckItems() self.log.info("Getting feed items") feedItems = self.getAllItems() self.log.info("Processing feed Items") self.processLinksIntoDB(feedItems) self.log.info("Complete")
class PururinDbLoader(ScrapePlugins.RetreivalDbBase.ScraperDbBase):

	dbName     = settings.DATABASE_DB_NAME
	loggerPath = "Main.Manga.Pururin.Fl"
	pluginName = "Pururin Link Retreiver"
	tableKey   = "pu"
	urlBase    = "http://pururin.com/"
	wg         = webFunctions.WebGetRobust(logPath=loggerPath + ".Web")
	tableName  = "HentaiItems"

	def loadFeed(self, pageOverride=None):
		self.log.info("Retreiving feed content...")
		if not pageOverride:
			pageOverride = 1
		try:
			# I really don't get the logic behind Pururin's path scheme.
			if pageOverride > 1:
				urlPath = '/browse/0/1{num1}/{num2}.html'.format(num1=pageOverride - 1, num2=pageOverride)
				pageUrl = urllib.parse.urljoin(self.urlBase, urlPath)
			else:
				# First page is just the bare URL. It /looks/ like they're blocking the root page by direct path.
				pageUrl = self.urlBase
			print("Fetching page at", pageUrl)
			page = self.wg.getpage(pageUrl)
		except urllib.error.URLError:
			self.log.critical("Could not get page from Pururin!")
			self.log.critical(traceback.format_exc())
			return ""
		return page

	def parseLinkLi(self, linkLi):
		ret = {}
		# Messy hack to replace <br> tags with ' / ', rather than just removing them.
		ret["dlName"]  = " / ".join(linkLi.h2.strings)
		ret["pageUrl"] = urllib.parse.urljoin(self.urlBase, linkLi.a["href"])
		return ret

	def getFeed(self, pageOverride=None):
		# for item in items:
		# 	self.log.info(item)

		page = self.loadFeed(pageOverride)
		soup = bs4.BeautifulSoup(page)

		mainSection = soup.find("ul", class_="gallery-list")
		doujinLink = mainSection.find_all("li", class_="gallery-block")

		ret = []
		for linkLi in doujinLink:
			tmp = self.parseLinkLi(linkLi)
			ret.append(tmp)

		return ret

	def processLinksIntoDB(self, linksDict):
		self.log.info("Inserting...")
		newItemCount = 0
		for link in linksDict:
			row = self.getRowsByValue(sourceUrl=link["pageUrl"])
			if not row:
				curTime = time.time()
				self.insertIntoDb(retreivalTime=curTime,
									sourceUrl=link["pageUrl"],
									originName=link["dlName"],
									dlState=0)
				# cur.execute('INSERT INTO fufufuu VALUES(?, ?, ?, "", ?, ?, "", ?);',(link["date"], 0, 0, link["dlLink"], link["itemTags"], link["dlName"]))
				newItemCount += 1  # previously never incremented, so the function always reported 0 new items
				self.log.info("New item: %s", (curTime, link["pageUrl"], link["dlName"]))
		self.log.info("Done")

		self.log.info("Committing...")
		self.conn.commit()
		self.log.info("Committed")

		return newItemCount

	def go(self):
		self.resetStuckItems()
		dat = self.getFeed()
		self.processLinksIntoDB(dat)
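# Worked example of the loadFeed() path scheme above: page 1 fetches the bare urlBase, while
# page N (N > 1) maps to '/browse/0/1{N-1}/{N}.html', e.g.
#   pageOverride = 2  ->  http://pururin.com/browse/0/11/2.html
#   pageOverride = 3  ->  http://pururin.com/browse/0/12/3.html
#   pageOverride = 10 ->  http://pururin.com/browse/0/19/10.html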
class McContentLoader(ScrapePlugins.RetreivalBase.ScraperBase): loggerPath = "Main.Manga.Mc.Cl" pluginName = "MangaCow Content Retreiver" tableKey = "mc" dbName = settings.DATABASE_DB_NAME tableName = "MangaItems" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") retreivalThreads = 4 def getImage(self, imageUrl, referrer): content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer}) if not content or not handle: raise ValueError("Failed to retreive image from page '%s'!" % referrer) fileN = urllib.parse.unquote( urllib.parse.urlparse(handle.geturl())[2].split("/")[-1]) fileN = bs4.UnicodeDammit(fileN).unicode_markup self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0) return fileN, content def getImageUrls(self, baseUrl): pageCtnt = self.wg.getpage(baseUrl) soup = bs4.BeautifulSoup(pageCtnt) selector = soup.find("select", class_="cbo_wpm_pag") if not selector: raise ValueError("Unable to find contained images on page '%s'" % baseUrl) pageNumbers = [] for value in selector.find_all("option"): pageNumbers.append(int(value.get_text())) if not pageNumbers: raise ValueError("Unable to find contained images on page '%s'" % baseUrl) pageUrls = [] for pageNo in pageNumbers: pageUrls.append("{baseUrl}{num}/".format(baseUrl=baseUrl, num=pageNo)) # print("PageUrls", pageUrls) imageUrls = [] for pageUrl in pageUrls: pageCtnt = self.wg.getpage(pageUrl) soup = bs4.BeautifulSoup(pageCtnt) imageContainer = soup.find("div", class_="prw") url = imageContainer.img["src"] # print("Urls - ", (url, pageUrl)) imageUrls.append((url, pageUrl)) return imageUrls def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link["seriesName"] chapterVol = link["originName"] try: self.log.info("Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) imageUrls = self.getImageUrls(sourceUrl) if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s'", seriesName, chapterVol) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() chapterName = nt.makeFilenameSafe(chapterVol) fqFName = os.path.join(dlPath, chapterName + "[MangaCow].zip") loop = 1 while os.path.exists(fqFName): fqFName, ext = os.path.splitext(fqFName) fqFName = "%s (%d)%s" % (fqFName, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] for imgUrl, referrerUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, referrerUrl) images.append([imageName, imageContent]) if not runStatus.run: self.log.info("Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404") return #Write all downloaded files to the archive. 
arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True, phashThresh=6) self.log.info("Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1)
class DbLoader(ScrapePlugins.LoaderBase.LoaderBase): dbName = settings.DATABASE_DB_NAME loggerPath = "Main.Manga.Hitomi.Fl" pluginName = "Hitomi Link Retreiver" tableKey = "hit" urlBase = "https://hitomi.la/" wg = webFunctions.WebGetRobust(logPath=loggerPath + ".Web") tableName = "HentaiItems" def loadFeed(self, pageOverride=None): self.log.info("Retreiving feed content...", ) if not pageOverride: pageOverride = 1 try: urlPath = '/index-all-{num}.html'.format(num=pageOverride) sourceUrl = urllib.parse.urljoin(self.urlBase, urlPath) page = self.wg.getSoup(sourceUrl) except urllib.error.URLError: self.log.critical("Could not get page from Hitomi!") self.log.critical(traceback.format_exc()) return "" return page def parseLinkDiv(self, linkdiv): if not linkdiv.h1: return date = linkdiv.find("p", class_="date") if not date: return ret = {} for row in linkdiv.find_all("tr"): if not len(row("td")) == 2: continue param, val = row("td") param = param.get_text().strip() val = val.get_text().strip() if param.lower() == "language": # Only scrape english TLs and japanese language content. # This'll probably miss some other non-japanese content, # but they don't seem to have a "translated" tag. if val.lower() not in ['english']: self.log.info("Skipping item due to language being %s.", val) return None if param.lower() == "type": ret['seriesName'] = val.title() # Judge me if param.lower() == "tags": if "males only" in val.lower( ) and not "females only" in val.lower(): self.log.info( "Skipping item due to tag 'males only' (%s).", val.replace("\n", " ")) return None ret["originName"] = linkdiv.h1.get_text().strip() ret["sourceUrl"] = urllib.parse.urljoin(self.urlBase, linkdiv.h1.a["href"]) pdate = parser.parse(date.get_text()) ret["retreivalTime"] = calendar.timegm(pdate.utctimetuple()) return ret def getFeed(self, pageOverride=[None]): # for item in items: # self.log.info(item) # # self.wg.stepThroughCloudFlare("https://hitomi.la/", titleContains="Hitomi.la") ret = [] for x in pageOverride: soup = self.loadFeed(x) mainSection = soup.find("div", class_="gallery-content") doujinLink = mainSection.find_all( "div", class_=re.compile("(cg|dj|manga|acg)"), recursive=False) for linkLi in doujinLink: tmp = self.parseLinkDiv(linkLi) if tmp: ret.append(tmp) return ret
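# getFeed() in the Hitomi loader above takes pageOverride as an iterable of page numbers rather
# than a single value, so a deeper backfill can be requested by passing an explicit range.
# Illustrative usage only; how the loader is instantiated and how its results are stored depends
# on the surrounding scraper framework:
#
#   loader = DbLoader()
#   new_items = loader.getFeed(pageOverride=range(1, 21))   # walks /index-all-1.html .. /index-all-20.html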