def scanSingleDir(self, dirPath):
    self.log.info("Dir %s", dirPath)
    items = os.listdir(dirPath)
    items.sort()
    for item in items:
        item = os.path.join(dirPath, item)
        if os.path.isfile(item):
            fPath, fName = os.path.split(item)
            guessName = nt.guessSeriesFromFilename(fName)
            dirName = fPath.strip("/").split("/")[-1]
            guess2 = nt.guessSeriesFromFilename(dirName)

            dist = lv.distance(guessName, guess2)
            # Assumption: The filename probably has shit tacked onto it.
            # Therefore, the allowable edit distance delta is the extent to
            # which the filename is longer than the dirname.
            normed = dist - (len(guessName) - len(guess2))

            if normed > 0:
                self.log.warning("Wat: %s", (normed, item, guessName, guess2))
            elif normed < 0:
                self.log.error("Wat: %s", (normed, item, guessName, guess2))
            else:
                if guess2 in nt.dirNameProxy and nt.dirNameProxy[guess2]["fqPath"]:
                    itemInfo = nt.dirNameProxy[guess2]
                    if itemInfo["fqPath"] != dirPath:
                        dstDir = itemInfo["fqPath"]
                        print("Move file '%s' from:" % fName)
                        print("    Src = '%s'" % fPath)
                        print("    Dst = '%s'" % dstDir)
                        dstPath = os.path.join(dstDir, fName)
                        try:
                            shutil.move(item, dstPath)
                            # Set pron to True, to prevent accidental uploading.
                            processDownload.processDownload(guess2, dstPath, deleteDups=True, includePHash=True, pron=True, crossReference=False)
                        except KeyboardInterrupt:
                            # Undo the move before re-raising, so an interrupted run leaves the file where it was.
                            shutil.move(dstPath, item)
                            raise
                else:
                    print("No match: ", fName)
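# A quick, hedged illustration of the normalized edit-distance check above,
# standalone and not part of the scanner. It assumes `lv` is the
# python-Levenshtein module and that nt.guessSeriesFromFilename() has already
# stripped release tags. The idea: a filename may differ from its directory
# name by exactly as many edits as it is longer (extra appended junk is free),
# but any substitution pushes `normed` above zero and gets flagged.
def _normedDistanceDemo():
    import Levenshtein as lv
    dirGuess  = "Some Series"
    fileGuess = "Some Series c012"           # longer, but only by an appended chunk
    normed = lv.distance(fileGuess, dirGuess) - (len(fileGuess) - len(dirGuess))
    assert normed == 0                       # pure suffix, treated as a match
    fileGuess = "Sume Series"                # same length, one substitution
    normed = lv.distance(fileGuess, dirGuess) - (len(fileGuess) - len(dirGuess))
    assert normed == 1                       # substitution, logged as "Wat"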
def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link["seriesName"] originFileName = link["originName"] self.updateDbEntry(sourceUrl, dlState=1) self.log.info("Downloading = '%s', '%s'", seriesName, originFileName) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() try: content, headerName = self.getLinkFile(sourceUrl) except: self.log.error("Unrecoverable error retreiving content %s", link) self.log.error("Traceback: %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1) return headerName = urllib.parse.unquote(headerName) fName = "%s - %s" % (originFileName, headerName) fName = nt.makeFilenameSafe(fName) fName, ext = os.path.splitext(fName) fName = "%s [CXC Scans]%s" % (fName, ext) fqFName = os.path.join(dlPath, fName) self.log.info("SaveName = %s", fqFName) loop = 1 while os.path.exists(fqFName): fName, ext = os.path.splitext(fName) fName = "%s (%d)%s" % (fName, loop, ext) fqFName = os.path.join(link["targetDir"], fName) loop += 1 self.log.info("Writing file") filePath, fileName = os.path.split(fqFName) try: with open(fqFName, "wb") as fp: fp.write(content) except TypeError: self.log.error("Failure trying to retreive content from source %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName) return # self.log.info( filePath) dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True) self.log.info("Done") self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState) return
def getLink(self, linkDict):
    try:
        linkDict = self.getDownloadInfo(linkDict)
        images = self.getImages(linkDict)
        title  = linkDict['title']
        artist = linkDict['artist']
    except webFunctions.ContentError:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-2, downloadPath="ERROR", fileName="ERROR: FAILED")
        return False

    if images and title:
        fileN = title + " " + artist + ".zip"
        fileN = nt.makeFilenameSafe(fileN)

        wholePath = os.path.join(linkDict["dirPath"], fileN)
        wholePath = self.insertCountIfFilenameExists(wholePath)
        self.log.info("Complete filepath: %s", wholePath)

        # Write all downloaded files to the archive.
        try:
            arch = zipfile.ZipFile(wholePath, "w")
        except OSError:
            # Fall back to an ASCII-only filename if the filesystem rejects the full title.
            title = title.encode('ascii', 'ignore').decode('ascii')
            fileN = title + ".zip"
            fileN = nt.makeFilenameSafe(fileN)
            wholePath = os.path.join(linkDict["dirPath"], fileN)
            arch = zipfile.ZipFile(wholePath, "w")

        for imageName, imageContent in images:
            arch.writestr(imageName, imageContent)
        arch.close()

        self.log.info("Successfully Saved to path: %s", wholePath)

        self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True, rowId=linkDict['dbId'])

        self.log.info("Done")

        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

        self.updateDbEntry(linkDict["sourceUrl"], dlState=2)

        return wholePath
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        return False
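# insertCountIfFilenameExists() is called above but defined elsewhere in the
# codebase. A minimal sketch of what it presumably does, judging from the
# "name (n).ext" collision loops used inline in the surrounding functions;
# this reimplementation is hypothetical, not the project's actual helper:
import os

def insertCountIfFilenameExists(wholePath):
    base, ext = os.path.splitext(wholePath)
    loop = 1
    while os.path.exists(wholePath):
        # Insert " (n)" before the extension until the path is free.
        wholePath = "%s (%d)%s" % (base, loop, ext)
        loop += 1
    return wholePath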
def getLink(self, link):
    sourceUrl, originFileName = link["sourceUrl"], link["originName"]

    self.log.info("Should retrieve: %s, url - %s", originFileName, sourceUrl)
    self.updateDbEntry(sourceUrl, dlState=1)
    self.conn.commit()

    fileUrl = self.getDownloadUrl(sourceUrl)
    if fileUrl is None:
        self.log.warning("Could not find url!")
        self.deleteRowsByValue(sourceUrl=sourceUrl)
        return

    try:
        content, hName = self.getLinkFile(fileUrl, sourceUrl)
    except Exception:
        self.log.error("Unrecoverable error retrieving content %s", link)
        self.log.error("Traceback: %s", traceback.format_exc())
        self.updateDbEntry(sourceUrl, dlState=-1)
        return

    # And fix %xx crap
    hName = urllib.parse.unquote(hName)

    fName = "%s - %s" % (originFileName, hName)
    fName = nt.makeFilenameSafe(fName)

    fqFName = os.path.join(link["targetDir"], fName)
    self.log.info("SaveName = %s", fqFName)

    loop = 1
    while os.path.exists(fqFName):
        fName = "%s - (%d) - %s" % (originFileName, loop, hName)
        fqFName = os.path.join(link["targetDir"], fName)
        loop += 1

    self.log.info("Writing file")

    filePath, fileName = os.path.split(fqFName)

    try:
        with open(fqFName, "wb") as fp:
            fp.write(content)
    except TypeError:
        self.log.error("Failure trying to retrieve content from source %s", sourceUrl)
        return

    dedupState = processDownload.processDownload(link["seriesName"], fqFName, deleteDups=True, includePHash=True)
    self.log.info("Done")

    self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
    return
def doDownload(self, linkDict, retag=False):
    downloadUrl = self.getDownloadUrl(linkDict["dlPage"], linkDict["sourceUrl"])
    if downloadUrl:
        fCont, fName = self.wg.getFileAndName(downloadUrl)

        if linkDict["originName"] in fName:
            fileN = fName
        else:
            fileN = "%s - %s.zip" % (linkDict["originName"], fName)
            fileN = fileN.replace(".zip .zip", ".zip")
        fileN = nt.makeFilenameSafe(fileN)

        chop = len(fileN) - 4
        wholePath = "ERROR"
        while True:
            try:
                fileN = fileN[:chop] + fileN[-4:]
                wholePath = os.path.join(linkDict["dirPath"], fileN)
                self.log.info("Complete filepath: %s", wholePath)

                # Write the downloaded file to disk.
                with open(wholePath, "wb") as fp:
                    fp.write(fCont)
                self.log.info("Successfully Saved to path: %s", wholePath)
                break
            except IOError:
                # Most likely the filename is too long for the filesystem; shorten and retry.
                chop = chop - 1
                self.log.warning("Truncating file length to %s characters.", chop)

        if not linkDict["tags"]:
            linkDict["tags"] = ""

        self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, pron=True)

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

        self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
        self.conn.commit()
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        self.conn.commit()
        return False
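# The IOError/chop retry above (reused in several functions below) works around
# filesystem filename-length limits: when open() rejects an over-long name,
# typically ENAMETOOLONG against a 255-byte limit, the name is shortened one
# character at a time while the 4-character extension is preserved. A hedged
# standalone sketch of the same pattern; `writeTruncatingName` and its `floor`
# guard are illustrative, not part of this codebase:
import errno
import os

def writeTruncatingName(dirPath, fileN, content, floor=64):
    chop = len(fileN) - 4
    while True:
        candidate = fileN[:chop] + fileN[-4:]   # keep the ".zip"-style suffix
        try:
            with open(os.path.join(dirPath, candidate), "wb") as fp:
                fp.write(content)
            return candidate
        except OSError as e:
            # Only retry on name-too-long errors, and stop shrinking eventually.
            if e.errno != errno.ENAMETOOLONG or chop <= floor:
                raise
            chop -= 1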
def fetchLinkList(self, itemList):
    dbInt = utilities.EmptyRetreivalDb.ScraperDbTool()
    try:
        for item in itemList:
            srcStr = 'import-{hash}'.format(hash=hashlib.md5(item.encode("utf-8")).hexdigest())

            itemtags = self.extractTags(item)
            # extractTags() can apparently return the literal string "None" as well as None.
            if itemtags == "None" or itemtags is None:
                itemtags = ''

            fPath = os.path.join(settings.djMoeDir, "imported")
            if not os.path.exists(fPath):
                os.makedirs(fPath)

            srcPath = os.path.join(self.sourcePath, item)
            dstPath = os.path.join(fPath, item)

            if os.path.exists(dstPath):
                raise ValueError("Destination path already exists? = '%s'" % dstPath)

            shutil.move(srcPath, dstPath)

            dbInt.insertIntoDb(retreivalTime=200,
                               sourceUrl=srcStr,
                               originName=item,
                               dlState=2,
                               downloadPath=fPath,
                               fileName=item,
                               seriesName="imported")

            dedupState = processDownload.processDownload("imported", dstPath, pron=True, deleteDups=True)

            # Note: this join behaves as intended only if extractTags() returns an
            # iterable of tag strings; a plain string would be spliced apart
            # character by character.
            tags = dedupState + ' ' + ' '.join(itemtags)
            tags = tags.strip()

            if dedupState:
                dbInt.addTags(sourceUrl=srcStr, tags=tags)

            self.log.info("Done")

            if not runStatus.run:
                self.log.info("Breaking due to exit flag being set")
                break
    except Exception:
        self.log.critical("Exception!")
        traceback.print_exc()
        self.log.critical(traceback.format_exc())
def getFile(self, file_data):
    row = self.getRowsByValue(sourceUrl=file_data["baseUrl"], limitByKey=False)
    if row and row[0]['dlState'] != 0:
        return
    if not row:
        self.insertIntoDb(retreivalTime=time.time(),
                          sourceUrl=file_data["baseUrl"],
                          originName=file_data["title"],
                          dlState=1,
                          seriesName=file_data["title"])

    image_links = self.getFileInfo(file_data)

    images = []
    for imagen, imageurl in image_links:
        imdat = self.get_image(imageurl, file_data['xor_key'])
        images.append((imagen, imdat))

    fileN = '{series} - c{chapNo:03.0f} [MangaBox].zip'.format(series=file_data['title'], chapNo=file_data['chapter'])
    fileN = nt.makeFilenameSafe(fileN)

    dlPath, newDir = self.locateOrCreateDirectoryForSeries(file_data["title"])
    wholePath = os.path.join(dlPath, fileN)

    if newDir:
        self.updateDbEntry(file_data["baseUrl"], flags="haddir")
        self.conn.commit()

    arch = zipfile.ZipFile(wholePath, "w")
    for imageName, imageContent in images:
        arch.writestr(imageName, imageContent)
    arch.close()

    self.log.info("Successfully Saved to path: %s", wholePath)

    dedupState = processDownload.processDownload(file_data["title"], wholePath, deleteDups=True)
    if dedupState:
        self.addTags(sourceUrl=file_data["baseUrl"], tags=dedupState)

    self.updateDbEntry(file_data["baseUrl"], dlState=2, downloadPath=dlPath, fileName=fileN, originName=fileN)
    self.conn.commit()
    self.log.info("Done")
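# self.get_image(imageurl, file_data['xor_key']) is defined elsewhere in the
# scraper; the xor_key argument suggests the site serves byte-wise
# XOR-obfuscated images. A hedged sketch of what such a helper plausibly does;
# the fetch interface (`wg.getpage` returning bytes) and the key handling are
# assumptions, not the project's confirmed implementation:
def get_image(wg, imageurl, xor_key):
    raw = wg.getpage(imageurl)                      # obfuscated image bytes
    key = xor_key if isinstance(xor_key, bytes) else str(xor_key).encode("utf-8")
    # XOR every byte against the repeating key to recover the real image data.
    return bytes(b ^ key[i % len(key)] for i, b in enumerate(raw))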
def doDownload(self, linkDict, retag=False):
    images = self.fetchImages(linkDict)

    if images:
        linkDict["chapterNo"] = float(linkDict["chapterNo"])
        fileN = '{series} - c{chapNo:06.1f} - {sourceName} [crunchyroll].zip'.format(
            series=linkDict['seriesName'],
            chapNo=linkDict["chapterNo"],
            sourceName=linkDict['originName'])
        fileN = nt.makeFilenameSafe(fileN)

        wholePath = os.path.join(linkDict["dirPath"], fileN)
        self.log.info("Complete filepath: %s", wholePath)

        # Write all downloaded files to the archive.
        arch = zipfile.ZipFile(wholePath, "w")
        for imageName, imageContent in images:
            arch.writestr(imageName, imageContent)
        arch.close()

        self.log.info("Successfully Saved to path: %s", wholePath)

        if not linkDict["tags"]:
            linkDict["tags"] = ""

        dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, deleteDups=True)
        self.log.info("Done")

        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

        self.updateDbEntry(linkDict["sourceUrl"], dlState=2, downloadPath=linkDict["dirPath"], fileName=fileN, originName=fileN)
        self.conn.commit()
        return wholePath
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        self.conn.commit()
        return False
def doDownload(self, linkDict, retag=False):
    images = self.fetchImages(linkDict)

    if images:
        fileN = linkDict['originName'] + ".zip"
        fileN = nt.makeFilenameSafe(fileN)

        wholePath = os.path.join(linkDict["dirPath"], fileN)
        self.log.info("Complete filepath: %s", wholePath)

        # Write all downloaded files to the archive.
        arch = zipfile.ZipFile(wholePath, "w")
        for imageName, imageContent in images:
            arch.writestr(imageName, imageContent)
        arch.close()
        self.log.info("Successfully Saved to path: %s", wholePath)

        if not linkDict["tags"]:
            linkDict["tags"] = ""

        self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True)

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

        self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
        self.conn.commit()
        return wholePath
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        self.conn.commit()
        return False
def xdcc_receive_finish(self):
    # This is the completion handler, so log the transfer as finished, not starting.
    self.log.info("XDCC Transfer finished!")
    self.changeState("xdcc finished")

    dedupState = processDownload.processDownload(self.currentItem["seriesName"], self.currentItem["downloadPath"], deleteDups=True)
    self.log.info("Done")
    self.db.addTags(dbId=self.currentItem["dbId"], tags=dedupState)

    if dedupState != "damaged":
        self.db.updateDbEntry(self.currentItem["sourceUrl"], dlState=2)
    else:
        self.db.updateDbEntry(self.currentItem["sourceUrl"], dlState=-10)
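# dlState values as used across these downloaders (inferred from usage in this
# file; no central enum is visible here, so treat this as a reference note
# rather than an authoritative mapping):
#    0  queued / untouched (also restored when the exit flag aborts a fetch)
#    1  download in progress
#    2  downloaded successfully
#   -1  generic retrieval failure
#   -2  content error (webFunctions.ContentError)
#   -4  save failure (TypeError on write); also used for DMCA takedowns
#   -5  video item, skipped          -6  game item, skipped
#   -7  removed or paywalled        -10  archive flagged as damaged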
def doDownload(self, seriesName, dlurl, chapter_name):
    row = self.getRowsByValue(sourceUrl=dlurl, limitByKey=False)
    if row and row[0]['dlState'] != 0:
        return
    if not row:
        self.insertIntoDb(retreivalTime=time.time(),
                          sourceUrl=dlurl,
                          originName=seriesName,
                          dlState=1,
                          seriesName=seriesName)

    fctnt, fname = self.wg.getFileAndName(dlurl)

    fileN = '{series} - {chap} [YoManga].zip'.format(series=seriesName, chap=chapter_name)
    fileN = nt.makeFilenameSafe(fileN)

    dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)
    wholePath = os.path.join(dlPath, fileN)

    self.log.info("Source name: %s", fname)
    self.log.info("Generated name: %s", fileN)

    if newDir:
        self.updateDbEntry(dlurl, flags="haddir")
        self.conn.commit()

    with open(wholePath, "wb") as fp:
        fp.write(fctnt)

    self.log.info("Successfully Saved to path: %s", wholePath)

    dedupState = processDownload.processDownload(seriesName, wholePath, deleteDups=True)
    if dedupState:
        self.addTags(sourceUrl=dlurl, tags=dedupState)

    self.updateDbEntry(dlurl, dlState=2, downloadPath=dlPath, fileName=fileN, originName=fileN)
    self.conn.commit()
def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link["seriesName"] chapterVol = link["originName"] try: self.log.info( "Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) imageUrls = self.getImageUrls(sourceUrl) if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s'", seriesName, chapterVol) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() chapterName = nt.makeFilenameSafe(chapterVol) fqFName = os.path.join(dlPath, chapterName+"[MangaCow].zip") loop = 1 while os.path.exists(fqFName): fqFName, ext = os.path.splitext(fqFName) fqFName = "%s (%d)%s" % (fqFName, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] for imgUrl, referrerUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, referrerUrl) images.append([imageName, imageContent]) if not runStatus.run: self.log.info( "Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True, phashThresh=6) self.log.info( "Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1)
def doDownload(self, linkDict, link, retag=False):
    downloadUrl = self.getDownloadUrl(linkDict['dlPage'], linkDict["sourceUrl"])
    if downloadUrl:
        fCont, fName = self.wg.getFileAndName(downloadUrl)

        if linkDict['originName'] in fName:
            fileN = fName
        else:
            fileN = '%s - %s.zip' % (linkDict['originName'], fName)
            fileN = fileN.replace('.zip .zip', '.zip')
        fileN = nt.makeFilenameSafe(fileN)

        chop = len(fileN) - 4
        wholePath = "ERROR"
        while True:
            try:
                fileN = fileN[:chop] + fileN[-4:]
                wholePath = os.path.join(linkDict["dirPath"], fileN)
                wholePath = self.insertCountIfFilenameExists(wholePath)
                self.log.info("Complete filepath: %s", wholePath)

                # Write the downloaded file to disk.
                with open(wholePath, "wb") as fp:
                    fp.write(fCont)
                fileN = os.path.split(wholePath)[-1]
                self.log.info("Successfully Saved to path: %s", wholePath)
                break
            except IOError:
                # Most likely the filename is too long for the filesystem; shorten and retry.
                chop = chop - 1
                self.log.warning("Truncating file length to %s characters.", chop)

        if not linkDict["tags"]:
            linkDict["tags"] = ""

        self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, pron=True, rowId=link['dbId'])

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)
        self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        return False
def doDownload(self, linkDict, retag=False):
    images = self.fetchImages(linkDict)

    if images:
        fileN = linkDict['originName'] + ".zip"
        fileN = nt.makeFilenameSafe(fileN)

        chop = len(fileN) - 4
        wholePath = "ERROR"
        while True:
            try:
                fileN = fileN[:chop] + fileN[-4:]
                wholePath = os.path.join(linkDict["dirPath"], fileN)
                self.log.info("Complete filepath: %s", wholePath)

                # Write all downloaded files to the archive.
                arch = zipfile.ZipFile(wholePath, "w")
                for imageName, imageContent in images:
                    arch.writestr(imageName, imageContent)
                arch.close()
                self.log.info("Successfully Saved to path: %s", wholePath)
                break
            except IOError:
                # Most likely the filename is too long for the filesystem; shorten and retry.
                chop = chop - 1
                self.log.warning("Truncating file length to %s characters.", chop)

        if not linkDict["tags"]:
            linkDict["tags"] = ""

        self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, pron=True)

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

        self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
        self.conn.commit()
        return wholePath
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        self.conn.commit()
        return False
def getLink(self, link): sourceUrl = link["sourceUrl"] print("Link", link) try: self.log.info( "Should retreive url - %s", sourceUrl) # self.updateDbEntry(sourceUrl, dlState=1) imageUrls, meta = self.getImageUrlsInfo(sourceUrl) self.updateDbEntry(sourceUrl, seriesName=meta['series_name']) seriesName = meta['series_name'] link["originName"] = meta['rel_name'] if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") # self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, link["originName"], len(imageUrls)) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: # self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() chapterName = nt.makeFilenameSafe(link["originName"]) fqFName = os.path.join(dlPath, chapterName+" [Comic Zenon].zip") loop = 1 prefix, ext = os.path.splitext(fqFName) while os.path.exists(fqFName): fqFName = "%s (%d)%s" % (prefix, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] imgCnt = 1 for imgUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, sourceUrl) imageName = "{num:03.0f} - {srcName}".format(num=imgCnt, srcName=imageName) imgCnt += 1 images.append([imageName, imageContent]) if not runStatus.run: self.log.info( "Breaking due to exit flag being set") # self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: # self.updateDbEntry(sourceUrl, dlState=-1, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True) self.log.info( "Done") filePath, fileName = os.path.split(fqFName) # self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) # self.updateDbEntry(sourceUrl, dlState=-1) raise
def getLink(self, link): sourceUrl = link["sourceUrl"] try: self.log.info("Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) seriesName, chapterVol, imageUrls = self.getContainerPages( sourceUrl) if not seriesName and not chapterVol and not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s'", seriesName, chapterVol) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) chapterNameRaw = " - ".join((seriesName, chapterVol)) chapterName = nt.makeFilenameSafe(chapterNameRaw) fqFName = os.path.join(dlPath, chapterName + " [batoto].zip") loop = 1 while os.path.exists(fqFName): fName = "%s [batoto] - (%d).zip" % (chapterName, loop) fqFName = os.path.join(dlPath, fName) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] for imgUrl in imageUrls: self.log.info("Fetching content for item: %s", imgUrl) imageName, imageContent = self.getImage( imgUrl, "http://bato.to/reader") images.append([imageName, imageContent]) if not runStatus.run: self.log.info("Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterNameRaw, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=False, rowId=link['dbId']) self.log.info("Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterNameRaw, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1)
def doDownload(self, linkDict):
    images = []

    containerUrl = linkDict["sourceUrl"] + "/read"

    if "http://www.fakku.net/videos/" in containerUrl:
        self.log.warning("Cannot download video items.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-5, downloadPath="Video", fileName="ERROR: Video", lastUpdate=time.time())
        return False

    if "http://www.fakku.net/games/" in containerUrl:
        self.log.warning("Cannot download game items.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-6, downloadPath="Game", fileName="ERROR: Game", lastUpdate=time.time())
        return False

    try:
        imagePage = self.wg.getpage(containerUrl, addlHeaders={'Referer': linkDict["sourceUrl"]})
    except urllib.error.URLError:
        self.log.warning("Failure to retrieve base page!")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR", lastUpdate=time.time())
        return False

    if "This content has been disabled due to a DMCA takedown notice, it is no longer available to download or read online in your region." in imagePage:
        self.log.warning("Assholes have DMCAed this item. Not available anymore.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-4, downloadPath="DMCA", fileName="ERROR: DMCAed", lastUpdate=time.time())
        return False

    if "Content does not exist." in imagePage:
        self.log.warning("Page removed?")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-7, downloadPath="REMOVED", fileName="ERROR: File removed", lastUpdate=time.time())
        return False

    # F**k you Fakku, don't include pay-content in your free gallery system.
    if "You must purchase this book in order to read it." in imagePage:
        self.log.warning("Item is paywalled.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-7, downloadPath="REMOVED", fileName="ERROR: Paywalled.", lastUpdate=time.time())
        return False

    # So...... Fakku's reader is completely javascript driven. No (easily) parseable shit here.
    # Therefore: WE DESCEND TO THE LEVEL OF REGEXBOMINATIONS!
    pathFormatterRe = re.compile(r"return '(//t\.fakku\.net/images/.+/.+/.+?/images/)' \+ x \+ '(\.jpg|\.gif|\.png)';", re.IGNORECASE)

    # We need to know how many images there are, but there is no convenient way to access this information.
    # The fakku code internally uses the length of the thumbnail array for the number of images, so
    # we extract that array, parse it (since it's javascript, variables are JSON, after all), and
    # just look at the length ourselves as well.
    thumbsListRe = re.compile(r"window\.params\.thumbs = (\[.+?\]);", re.IGNORECASE)

    thumbs = thumbsListRe.search(imagePage)
    pathFormatter = pathFormatterRe.search(imagePage)

    if not thumbs:
        self.log.error("Could not find thumbnail array on page!")
        self.log.error("URL: '%s'", containerUrl)
    if not pathFormatter:
        self.log.error("Could not find pathformatter on page!")
        self.log.error("URL: '%s'", containerUrl)
    if not thumbs or not pathFormatter:
        # Without both matches the URL list cannot be built; bail out rather
        # than crash on the .group() calls below.
        return False

    items = json.loads(thumbs.group(1))

    prefix, postfix = pathFormatter.group(1), pathFormatter.group(2)
    print("pathFormatter = ", prefix, postfix)

    imageUrls = []
    for x in range(len(items)):
        item = '{prefix}{num:03d}{postfix}'.format(prefix=prefix, num=x + 1, postfix=postfix)
        imageUrls.append(item)

    images = []
    try:
        for imageUrl in imageUrls:
            imagePath = urllib.parse.urlsplit(imageUrl)[2]
            imageFileName = imagePath.split("/")[-1]
            if imageUrl.startswith("//"):
                imageUrl = "https:" + imageUrl
            imageData = self.wg.getpage(imageUrl, addlHeaders={'Referer': containerUrl})
            images.append((imageFileName, imageData))
    except urllib.error.URLError:
        self.log.error("Failure retrieving item images.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: Could not retrieve images!", lastUpdate=time.time())
        self.conn.commit()
        return False

    if images:
        fileN = linkDict["originName"] + ".zip"
        fileN = nt.makeFilenameSafe(fileN)

        wholePath = os.path.join(linkDict["dirPath"], fileN)
        self.log.info("Complete filepath: %s", wholePath)

        # Write all downloaded files to the archive.
        arch = zipfile.ZipFile(wholePath, "w")
        for imageName, imageContent in images:
            arch.writestr(imageName, imageContent)
        arch.close()

        self.log.info("Successfully Saved to path: %s", wholePath)

        if not linkDict["tags"]:
            linkDict["tags"] = ""

        self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True)

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

        self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
        self.conn.commit()
        return wholePath
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED", lastUpdate=time.time())
        self.conn.commit()
        return False
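# A hedged demo of the two regexes above against a fabricated page fragment.
# The JS shown here is illustrative only, not captured from fakku.net:
import json
import re

_samplePage = """
window.params.thumbs = ["/t/001.thumb.jpg", "/t/002.thumb.jpg", "/t/003.thumb.jpg"];
function imgpath(x) { return '//t.fakku.net/images/manga/s/sample/images/' + x + '.jpg'; }
"""
_thumbs = re.search(r"window\.params\.thumbs = (\[.+?\]);", _samplePage).group(1)
_fmt = re.search(r"return '(//t\.fakku\.net/images/.+/.+/.+?/images/)' \+ x \+ '(\.jpg|\.gif|\.png)';", _samplePage)
assert len(json.loads(_thumbs)) == 3        # page count taken from thumbnail count
assert _fmt.group(1).endswith("/images/")   # prefix + zero-padded index + extension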
def doDownload(self, linkDict):
    images = []
    title = None
    nextPage = linkDict["dlLink"]

    while nextPage:
        gatewayPage = self.wg.getpage(nextPage, addlHeaders={'Referer': linkDict["sourceUrl"]})
        soup = bs4.BeautifulSoup(gatewayPage, "lxml")

        titleCont = soup.find("div", class_="image-menu")
        title = titleCont.h1.get_text()
        title = title.replace("Reading ", "")
        title, dummy = title.rsplit(" Page ", 1)
        title = title.strip()

        imageUrl = soup.find("img", class_="b")
        imageUrl = urllib.parse.urljoin(self.urlBase, imageUrl["src"])
        imagePath = urllib.parse.urlsplit(imageUrl)[2]
        imageFileName = imagePath.split("/")[-1]

        imageData = self.wg.getpage(imageUrl, addlHeaders={'Referer': nextPage})
        images.append((imageFileName, imageData))

        # Find next page
        nextPageLink = soup.find("a", class_="link-next")
        if not nextPageLink:
            nextPage = None
        elif nextPageLink["href"].startswith("/finish/"):
            # Break on the last image.
            nextPage = None
        else:
            nextPage = urllib.parse.urljoin(self.urlBase, nextPageLink["href"])

    if images and title:
        fileN = title + ".zip"
        fileN = nt.makeFilenameSafe(fileN)

        wholePath = os.path.join(linkDict["dirPath"], fileN)
        self.log.info("Complete filepath: %s", wholePath)

        # Write all downloaded files to the archive.
        try:
            arch = zipfile.ZipFile(wholePath, "w")
        except OSError:
            # Fall back to an ASCII-only filename if the filesystem rejects the full title.
            title = title.encode('ascii', 'ignore').decode('ascii')
            fileN = title + ".zip"
            fileN = nt.makeFilenameSafe(fileN)
            wholePath = os.path.join(linkDict["dirPath"], fileN)
            arch = zipfile.ZipFile(wholePath, "w")

        for imageName, imageContent in images:
            arch.writestr(imageName, imageContent)
        arch.close()

        self.log.info("Successfully Saved to path: %s", wholePath)

        self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True)

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

        self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
        self.conn.commit()
        return wholePath
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        self.conn.commit()
        return False
def getLink(self, link): sourceUrl = link["sourceUrl"] print("Link", link) seriesName = link['seriesName'] try: self.log.info( "Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) seriesName = nt.getCanonicalMangaUpdatesName(seriesName) self.log.info("Downloading = '%s', '%s'", seriesName, link["originName"]) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() chapterName = nt.makeFilenameSafe(link["originName"]) fqFName = os.path.join(dlPath, chapterName+" [MangaHere].zip") loop = 1 prefix, ext = os.path.splitext(fqFName) while os.path.exists(fqFName): fqFName = "%s (%d)%s" % (prefix, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = self.proceduralGetImages(sourceUrl) self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True) self.log.info( "Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1) raise
def getLink(self, link): seriesName = link["seriesName"] seriesName = seriesName.replace("[", "(").replace("]", "(") safeBaseName = nt.makeFilenameSafe(link["seriesName"]) if seriesName in nt.dirNameProxy: self.log.info( "Have target dir for '%s' Dir = '%s'", seriesName, nt.dirNameProxy[seriesName]['fqPath']) link["targetDir"] = nt.dirNameProxy[seriesName]["fqPath"] else: self.log.info( "Don't have target dir for: %s Using default for: %s, full name = %s", seriesName, link["seriesName"], link["originName"]) targetDir = os.path.join(settings.mkSettings["dirs"]['mDlDir'], safeBaseName) if not os.path.exists(targetDir): try: os.makedirs(targetDir) link["targetDir"] = targetDir self.updateDbEntry(link["sourceUrl"],flags=" ".join([link["flags"], "newdir"])) self.conn.commit() self.conn.commit() except OSError: self.log.critical("Directory creation failed?") self.log.critical(traceback.format_exc()) else: self.log.warning("Directory not found in dir-dict, but it exists!") self.log.warning("Directory-Path: %s", targetDir) link["targetDir"] = targetDir self.updateDbEntry(link["sourceUrl"],flags=" ".join([link["flags"], "haddir"])) self.conn.commit() sourceUrl, originFileName = link["sourceUrl"], link["originName"] self.log.info( "Should retreive: %s, url - %s", originFileName, sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) self.conn.commit() try: content, hName = self.getLinkFile(sourceUrl) except: self.log.error("Unrecoverable error retreiving content %s", link) self.log.error("Traceback: %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1) return # print("Content type = ", type(content)) # And fix %xx crap hName = urllib.parse.unquote(hName) fName = "%s - %s" % (originFileName, hName) fName = nt.makeFilenameSafe(fName) fqFName = os.path.join(link["targetDir"], fName) self.log.info( "SaveName = %s", fqFName) loop = 1 while os.path.exists(fqFName): fName = "%s - (%d) - %s" % (originFileName, loop, hName) fqFName = os.path.join(link["targetDir"], fName) loop += 1 self.log.info( "Writing file") filePath, fileName = os.path.split(fqFName) try: chop = len(fileName)-4 wholePath = "ERROR" while 1: try: fileName = fileName[:chop]+fileName[-4:] # self.log.info("geturl with processing", fileName) wholePath = os.path.join(filePath, fileName) self.log.info("Complete filepath: %s", wholePath) #Write all downloaded files to the archive. with open(wholePath, "wb") as fp: fp.write(content) self.log.info("Successfully Saved to path: %s", wholePath) break except IOError: chop = chop - 1 if chop < 200: raise RuntimeError("Don't know what's going on, but a file truncated too far!") self.log.warn("Truncating file length to %s characters.", chop) except TypeError: self.log.error("Failure trying to retreive content from source %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName) return #self.log.info( filePath) ext = os.path.splitext(fileName)[-1] imageExts = ["jpg", "png", "bmp"] if not any([ext.endswith(ex) for ex in imageExts]): # We don't want to upload the file we just downloaded, so specify doUpload as false. dedupState = processDownload.processDownload(False, fqFName, deleteDups=True, doUpload=False) else: dedupState = "" self.log.info( "Done") self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState) return
def getLink(self, link): sourceUrl = link["sourceUrl"] print("Link", link) seriesName = link['seriesName'] try: self.log.info("Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) imageUrls = self.getImageUrls(sourceUrl) if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Page not found - 404") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s' ('%s images)", seriesName, link["originName"], len(imageUrls)) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) chapterName = nt.makeFilenameSafe(link["originName"]) fqFName = os.path.join(dlPath, chapterName + " [KissManga].zip") loop = 1 prefix, ext = os.path.splitext(fqFName) while os.path.exists(fqFName): fqFName = "%s (%d)%s" % (prefix, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] imgCnt = 1 for imgUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, sourceUrl) imageName = "{num:03.0f} - {srcName}".format(num=imgCnt, srcName=imageName) imgCnt += 1 images.append([imageName, imageContent]) if not runStatus.run: self.log.info("Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True, rowId=link['dbId']) self.log.info("Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState) return except SystemExit: print("SystemExit!") raise except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1)
def doDownload(self, linkDict, link):
    contentUrl = urllib.parse.urljoin(self.urlBase, "zipf.php?token={token}&hash={hash}".format(token=linkDict["contentId"], hash=linkDict["dlToken"]))

    print("Fetching: ", contentUrl, " Referer ", linkDict["sourceUrl"])
    content, handle = self.wg.getpage(contentUrl, returnMultiple=True, addlHeaders={'Referer': linkDict["sourceUrl"], "Host": "doujins.com"})

    if handle:
        urlFileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
        urlFileN = bs4.UnicodeDammit(urlFileN).unicode_markup

        # DjMoe is apparently returning "zipf.php" for ALL filenames.
        # Blargh
        if urlFileN == "zipf.php":
            urlFileN = ".zip"
            fileN = "%s%s" % (linkDict["originName"], urlFileN)
        else:
            self.log.error("Unknown file extension?")
            self.log.error("Dict filename = %s", linkDict["originName"])
            self.log.error("URL filename = %s", urlFileN)
            fileN = "%s - %s" % (linkDict["originName"], urlFileN)

        fileN = nt.makeFilenameSafe(fileN)

        wholePath = os.path.join(linkDict["dirPath"], fileN)
        wholePath = self.insertCountIfFilenameExists(wholePath)
        self.log.info("Complete filepath: %s", wholePath)

        with open(wholePath, "wb") as fp:
            fp.write(content)

        self.log.info("Successfully Saved to path: %s", wholePath)

        self.updateDbEntry(linkDict["contentId"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True, rowId=link['dbId'])

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["contentId"], tags=dedupState)

        self.updateDbEntry(linkDict["contentId"], dlState=2)
        return wholePath
    else:
        self.updateDbEntry(linkDict["contentId"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        return False
def getLink(self, link): sourceUrl = link["sourceUrl"] print("Link", link) seriesName = link['seriesName'] try: self.log.info("Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) seriesName = nt.getCanonicalMangaUpdatesName(seriesName) self.log.info("Downloading = '%s', '%s'", seriesName, link["originName"]) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() chapterName = nt.makeFilenameSafe(link["originName"]) fqFName = os.path.join(dlPath, chapterName + " [MangaHere].zip") loop = 1 prefix, ext = os.path.splitext(fqFName) while os.path.exists(fqFName): fqFName = "%s (%d)%s" % (prefix, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = self.proceduralGetImages(sourceUrl) self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True) self.log.info("Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1) raise
def doDownload(self, linkDict):
    contentUrl = urllib.parse.urljoin(self.urlBase, "/zip.php?token=%s" % linkDict["contentId"])

    content, handle = self.wg.getpage(contentUrl, returnMultiple=True, addlHeaders={'Referer': linkDict["sourceUrl"]})

    if handle:
        urlFileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
        urlFileN = bs4.UnicodeDammit(urlFileN).unicode_markup

        # DjMoe is apparently returning "zip.php" for ALL filenames.
        # Blargh
        if urlFileN == "zip.php":
            urlFileN = ".zip"
            fileN = "%s%s" % (linkDict["originName"], urlFileN)
        else:
            self.log.error("Unknown file extension?")
            self.log.error("Dict filename = %s", linkDict["originName"])
            self.log.error("URL filename = %s", urlFileN)
            fileN = "%s - %s" % (linkDict["originName"], urlFileN)

        fileN = nt.makeFilenameSafe(fileN)

        wholePath = os.path.join(linkDict["dirPath"], fileN)
        self.log.info("Complete filepath: %s", wholePath)

        with open(wholePath, "wb") as fp:
            fp.write(content)

        self.log.info("Successfully Saved to path: %s", wholePath)

        self.updateDbEntry(linkDict["contentId"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True)

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["contentId"], tags=dedupState)

        self.updateDbEntry(linkDict["contentId"], dlState=2)
        self.conn.commit()
    else:
        self.updateDbEntry(linkDict["contentId"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
        self.conn.commit()
def doDownload(self, linkDict):
    images = []

    containerUrl = linkDict["sourceUrl"] + "/read"

    if "http://www.fakku.net/videos/" in containerUrl:
        self.log.warning("Cannot download video items.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-5, downloadPath="Video", fileName="ERROR: Video", lastUpdate=time.time())
        return False

    if "http://www.fakku.net/games/" in containerUrl:
        self.log.warning("Cannot download game items.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-6, downloadPath="Game", fileName="ERROR: Game", lastUpdate=time.time())
        return False

    try:
        imagePage = self.wg.getpage(containerUrl, addlHeaders={'Referer': linkDict["sourceUrl"]})
    except urllib.error.URLError:
        self.log.warning("Failure to retrieve base page!")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR", lastUpdate=time.time())
        return False

    if "This content has been disabled due to a DMCA takedown notice, it is no longer available to download or read online in your region." in imagePage:
        self.log.warning("Assholes have DMCAed this item. Not available anymore.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-4, downloadPath="DMCA", fileName="ERROR: DMCAed", lastUpdate=time.time())
        return False

    if "Content does not exist." in imagePage:
        self.log.warning("Page removed?")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-7, downloadPath="REMOVED", fileName="ERROR: File removed", lastUpdate=time.time())
        return False

    # F**k you Fakku, don't include pay-content in your free gallery system.
    if "You must purchase this book in order to read it." in imagePage:
        self.log.warning("Item is paywalled.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-7, downloadPath="REMOVED", fileName="ERROR: Paywalled.", lastUpdate=time.time())
        return False

    # F**k you Fakku, don't include pay-content in your free gallery system.
    if "Enter your account information below." in imagePage:
        self.log.warning("Subscription bullshit?")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-7, downloadPath="REMOVED", fileName="ERROR: Paywalled.", lastUpdate=time.time())
        return False

    # So...... Fakku's reader is completely javascript driven. No (easily) parseable shit here.
    # Therefore: WE DESCEND TO THE LEVEL OF REGEXBOMINATIONS!
    pathFormatterRe = re.compile(r"return '(https://t\.fakku\.net/images/.+/.+/.+?/images/)' \+ x \+ '(\.jpg|\.gif|\.png)';", re.IGNORECASE)

    # We need to know how many images there are, but there is no convenient way to access this information.
    # The fakku code internally uses the length of the thumbnail array for the number of images, so
    # we extract that array, parse it (since it's javascript, variables are JSON, after all), and
    # just look at the length ourselves as well.
    thumbsListRe = re.compile(r"window\.params\.thumbs = (\[.+?\]);", re.IGNORECASE)

    thumbs = thumbsListRe.search(imagePage)
    pathFormatter = pathFormatterRe.search(imagePage)

    if not thumbs:
        self.log.error("Could not find thumbnail array on page!")
        self.log.error("URL: '%s'", containerUrl)
    if not pathFormatter:
        self.log.error("Could not find pathformatter on page!")
        self.log.error("URL: '%s'", containerUrl)
    if not thumbs or not pathFormatter:
        # Without both matches the URL list cannot be built; bail out rather
        # than crash on the .group() calls below.
        return False

    items = json.loads(thumbs.group(1))

    prefix, postfix = pathFormatter.group(1), pathFormatter.group(2)
    print("pathFormatter = ", prefix, postfix)

    imageUrls = []
    for x in range(len(items)):
        item = '{prefix}{num:03d}{postfix}'.format(prefix=prefix, num=x + 1, postfix=postfix)
        imageUrls.append(item)

    images = []
    try:
        for imageUrl in imageUrls:
            imagePath = urllib.parse.urlsplit(imageUrl)[2]
            imageFileName = imagePath.split("/")[-1]
            if imageUrl.startswith("//"):
                imageUrl = "https:" + imageUrl
            imageData = self.wg.getpage(imageUrl, addlHeaders={'Referer': containerUrl})
            images.append((imageFileName, imageData))
    except urllib.error.URLError:
        self.log.error("Failure retrieving item images.")
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: Could not retrieve images!", lastUpdate=time.time())
        self.conn.commit()
        return False

    if images:
        fileN = linkDict["originName"] + ".zip"
        fileN = nt.makeFilenameSafe(fileN)

        wholePath = os.path.join(linkDict["dirPath"], fileN)
        self.log.info("Complete filepath: %s", wholePath)

        # Write all downloaded files to the archive.
        arch = zipfile.ZipFile(wholePath, "w")
        for imageName, imageContent in images:
            arch.writestr(imageName, imageContent)
        arch.close()

        self.log.info("Successfully Saved to path: %s", wholePath)

        if not linkDict["tags"]:
            linkDict["tags"] = ""

        self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

        # Deduper uses the path info for relinking, so we have to dedup the item
        # after updating the downloadPath and fileName.
        dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True)

        self.log.info("Done")
        if dedupState:
            self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

        self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
        self.conn.commit()
        return wholePath
    else:
        self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED", lastUpdate=time.time())
        self.conn.commit()
        return False
def getLink(self, link): sourceUrl = link["sourceUrl"] seriesName = link["seriesName"] chapterVol = link["originName"] sourceUrl = sourceUrl.encode("ascii").decode('ascii') # print("Item:", link) try: self.log.info( "Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) chapterVol, imageUrls = self.getImageUrls(sourceUrl) if not imageUrls: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("No images found on page!") self.updateDbEntry(sourceUrl, dlState=-1) return self.log.info("Downloading = '%s', '%s'", seriesName, chapterVol) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]), originName=chapterVol) # self.conn.commit() self.updateDbEntry(sourceUrl, originName=chapterVol) chapterName = nt.makeFilenameSafe(chapterVol) fqFName = os.path.join(dlPath, chapterName+"[Sura's Place].zip") loop = 1 while os.path.exists(fqFName): fqFName, ext = os.path.splitext(fqFName) fqFName = "%s (%d)%s" % (fqFName, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = [] imgCnt = 1 for imgUrl, referrerUrl in imageUrls: imageName, imageContent = self.getImage(imgUrl, referrerUrl) imageName = "{num:03.0f} - {srcName}".format(num=imgCnt, srcName=imageName) imgCnt += 1 images.append([imageName, imageContent]) if not runStatus.run: self.log.info( "Breaking due to exit flag being set") self.updateDbEntry(sourceUrl, dlState=0) return self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True) self.log.info( "Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1)
def doDownload(self, linkDict, link, retag=False):

	images = self.fetchImages(linkDict)

	if images:
		fileN = linkDict["originName"] + ".zip"
		fileN = nt.makeFilenameSafe(fileN)

		# Shave characters off the filename stem (keeping the ".zip"
		# extension) until the filesystem accepts the path.
		chop = len(fileN) - 4
		wholePath = "ERROR"
		while True:
			try:
				fileN = fileN[:chop] + fileN[-4:]
				wholePath = os.path.join(linkDict["dirPath"], fileN)
				wholePath = self.insertCountIfFilenameExists(wholePath)
				self.log.info("Complete filepath: %s", wholePath)

				# Write all downloaded files to the archive.
				arch = zipfile.ZipFile(wholePath, "w")
				for imageName, imageContent in images:
					arch.writestr(imageName, imageContent)
				arch.close()

				self.log.info("Successfully saved to path: %s", wholePath)
				break
			except IOError:
				chop = chop - 1
				if chop <= 0:
					# Nothing left to truncate; something else is wrong.
					raise
				self.log.warning("Truncating file length to %s characters.", chop)

		if not linkDict["tags"]:
			linkDict["tags"] = ""

		self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

		# The deduper uses the path info for relinking, so dedup the item only
		# after the downloadPath and fileName have been committed to the DB.
		dedupState = processDownload.processDownload(linkDict["seriesName"], wholePath, pron=True, rowId=link["dbId"])

		self.log.info("Done")
		if dedupState:
			self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

		self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
		return wholePath
	else:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		return False
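# doDownload() above recovers from over-long paths by shaving characters off
# the filename stem until the write succeeds. A minimal standalone sketch of
# the same technique, assuming the IOError is the filesystem rejecting the
# name length (saveWithTruncation is a hypothetical name):

import os

def saveWithTruncation(dirPath, fileName, data, minStemLen=16):
	stem, ext = os.path.splitext(fileName)
	while True:
		fqPath = os.path.join(dirPath, stem + ext)
		try:
			with open(fqPath, "wb") as fp:
				fp.write(data)
			return fqPath
		except IOError:
			if len(stem) <= minStemLen:
				# Refuse to truncate into uselessness; something else is wrong.
				raise
			# Drop one character from the stem and retry.
			stem = stem[:-1]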
def getLink(self, link):
	seriesName = link["seriesName"]
	# Normalize square brackets to parens for directory matching.
	seriesName = seriesName.replace("[", "(").replace("]", ")")
	safeBaseName = nt.makeFilenameSafe(link["seriesName"])

	if seriesName in nt.dirNameProxy:
		self.log.info("Have target dir for '%s' Dir = '%s'", seriesName, nt.dirNameProxy[seriesName]["fqPath"])
		link["targetDir"] = nt.dirNameProxy[seriesName]["fqPath"]
	else:
		self.log.info("Don't have target dir for: %s Using default for: %s, full name = %s", seriesName, link["seriesName"], link["originName"])
		targetDir = os.path.join(settings.jzSettings["dirs"]["mDlDir"], safeBaseName)
		if not os.path.exists(targetDir):
			try:
				os.makedirs(targetDir)
				link["targetDir"] = targetDir
				self.updateDbEntry(link["sourceUrl"], flags=" ".join([link["flags"], "newdir"]))
				self.conn.commit()
			except OSError:
				self.log.critical("Directory creation failed?")
				self.log.critical(traceback.format_exc())
		else:
			self.log.warning("Directory not found in dir-dict, but it exists!")
			self.log.warning("Directory-Path: %s", targetDir)
			link["targetDir"] = targetDir
			self.updateDbEntry(link["sourceUrl"], flags=" ".join([link["flags"], "haddir"]))
			self.conn.commit()

	sourceUrl, originFileName = link["sourceUrl"], link["originName"]

	self.log.info("Should retrieve: %s, url - %s", originFileName, sourceUrl)
	self.updateDbEntry(sourceUrl, dlState=1)
	self.conn.commit()

	try:
		content, hName = self.getLinkFile(sourceUrl)
	except Exception:
		self.log.error("Unrecoverable error retrieving content %s", link)
		self.log.error("Traceback: %s", traceback.format_exc())
		self.updateDbEntry(sourceUrl, dlState=-1)
		return

	# And fix %xx escaping in the header-supplied name.
	hName = urllib.parse.unquote(hName)

	fName = "%s - %s" % (originFileName, hName)
	fName = nt.makeFilenameSafe(fName)

	fqFName = os.path.join(link["targetDir"], fName)
	self.log.info("SaveName = %s", fqFName)

	loop = 1
	while os.path.exists(fqFName):
		fName = "%s - (%d) - %s" % (originFileName, loop, hName)
		fqFName = os.path.join(link["targetDir"], fName)
		loop += 1
	self.log.info("Writing file")

	filePath, fileName = os.path.split(fqFName)

	try:
		with open(fqFName, "wb") as fp:
			fp.write(content)
	except TypeError:
		self.log.error("Failure trying to retrieve content from source %s", sourceUrl)
		self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName)
		return

	# Only dedup archives; plain image files are stored as-is.
	ext = os.path.splitext(fileName)[-1]
	imageExts = ["jpg", "png", "bmp"]
	if not any(ext.endswith(ex) for ex in imageExts):
		dedupState = processDownload.processDownload(False, fqFName, deleteDups=True)
	else:
		dedupState = ""

	self.log.info("Done")
	self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
	return
def getLink(self, link):
	sourceUrl, originFileName = link["sourceUrl"], link["originName"]

	self.log.info("Should retrieve: %s, url - %s", originFileName, sourceUrl)
	self.updateDbEntry(sourceUrl, dlState=1)
	self.conn.commit()

	fileUrl = self.getDownloadUrl(sourceUrl)
	if fileUrl is None:
		self.log.warning("Could not find url!")
		self.deleteRowsByValue(sourceUrl=sourceUrl)
		return

	try:
		content, hName = self.getLinkFile(fileUrl, sourceUrl)
	except Exception:
		self.log.error("Unrecoverable error retrieving content %s", link)
		self.log.error("Traceback: %s", traceback.format_exc())
		self.updateDbEntry(sourceUrl, dlState=-1)
		return

	# And fix %xx escaping in the header-supplied name.
	hName = urllib.parse.unquote(hName)

	fName = "%s - %s" % (originFileName, hName)
	fName = nt.makeFilenameSafe(fName)

	fqFName = os.path.join(link["targetDir"], fName)
	self.log.info("SaveName = %s", fqFName)

	loop = 1
	while os.path.exists(fqFName):
		fName = "%s - (%d) - %s" % (originFileName, loop, hName)
		fqFName = os.path.join(link["targetDir"], fName)
		loop += 1
	self.log.info("Writing file")

	filePath, fileName = os.path.split(fqFName)

	try:
		with open(fqFName, "wb") as fp:
			fp.write(content)
	except TypeError:
		self.log.error("Failure trying to retrieve content from source %s", sourceUrl)
		# Flag the failure so the row doesn't sit in the "downloading" state forever.
		self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName)
		return

	dedupState = processDownload.processDownload(link["seriesName"], fqFName, deleteDups=True, includePHash=True)
	self.log.info("Done")
	self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
	return
def getLink(self, link):
	sourceUrl      = link["sourceUrl"]
	seriesName     = link["seriesName"]
	originFileName = link["originName"]

	self.updateDbEntry(sourceUrl, dlState=1)
	self.log.info("Downloading = '%s', '%s'", seriesName, originFileName)
	dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

	if link["flags"] is None:
		link["flags"] = ""

	if newDir:
		self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
		self.conn.commit()

	try:
		content, headerName = self.getLinkFile(sourceUrl)
	except Exception:
		self.log.error("Unrecoverable error retrieving content %s", link)
		self.log.error("Traceback: %s", traceback.format_exc())
		self.updateDbEntry(sourceUrl, dlState=-1)
		return

	headerName = urllib.parse.unquote(headerName)

	fName = "%s - %s" % (originFileName, headerName)
	fName = nt.makeFilenameSafe(fName)

	fName, ext = os.path.splitext(fName)
	fName = "%s [CXC Scans]%s" % (fName, ext)

	fqFName = os.path.join(dlPath, fName)
	self.log.info("SaveName = %s", fqFName)

	loop = 1
	base, ext = os.path.splitext(fName)
	while os.path.exists(fqFName):
		# Rename against the original base so " (n)" suffixes don't accumulate,
		# and rejoin against dlPath; this function never sets link["targetDir"].
		fName = "%s (%d)%s" % (base, loop, ext)
		fqFName = os.path.join(dlPath, fName)
		loop += 1
	self.log.info("Writing file")

	filePath, fileName = os.path.split(fqFName)

	try:
		with open(fqFName, "wb") as fp:
			fp.write(content)
	except TypeError:
		self.log.error("Failure trying to retrieve content from source %s", sourceUrl)
		self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName)
		return

	dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True)
	self.log.info("Done")
	self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
	return
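# dlState values, as used by the downloaders in this file (inferred from
# usage here; the authoritative definitions live elsewhere in the codebase):
#    1 = download in progress
#    2 = download complete, row finalized
#    0 = reset for retry (set when the exit flag interrupts a fetch)
#   -1 = unrecoverable fetch error / page not found
#   -4 = retrieved content could not be written to disk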
def getLink(self, link):
	seriesName = link["seriesName"]
	# Normalize square brackets to parens for directory matching.
	seriesName = seriesName.replace("[", "(").replace("]", ")")
	safeBaseName = nt.makeFilenameSafe(link["seriesName"])

	if seriesName in nt.dirNameProxy:
		self.log.info("Have target dir for '%s' Dir = '%s'", seriesName, nt.dirNameProxy[seriesName]["fqPath"])
		link["targetDir"] = nt.dirNameProxy[seriesName]["fqPath"]
	else:
		self.log.info("Don't have target dir for: %s Using default for: %s, full name = %s", seriesName, link["seriesName"], link["originName"])
		targetDir = os.path.join(settings.jzSettings["dirs"]["mDlDir"], safeBaseName)
		if not os.path.exists(targetDir):
			try:
				os.makedirs(targetDir)
				link["targetDir"] = targetDir
				self.updateDbEntry(link["sourceUrl"], flags=" ".join([link["flags"], "newdir"]))
				self.conn.commit()
			except OSError:
				self.log.critical("Directory creation failed?")
				self.log.critical(traceback.format_exc())
		else:
			self.log.warning("Directory not found in dir-dict, but it exists!")
			self.log.warning("Directory-Path: %s", targetDir)
			link["targetDir"] = targetDir
			self.updateDbEntry(link["sourceUrl"], flags=" ".join([link["flags"], "haddir"]))
			self.conn.commit()

	sourceUrl, originFileName = link["sourceUrl"], link["originName"]

	self.log.info("Should retrieve: %s, url - %s", originFileName, sourceUrl)
	self.updateDbEntry(sourceUrl, dlState=1)
	self.conn.commit()

	try:
		content, hName = self.getLinkFile(sourceUrl)
	except Exception:
		self.log.error("Unrecoverable error retrieving content %s", link)
		self.log.error("Traceback: %s", traceback.format_exc())
		self.updateDbEntry(sourceUrl, dlState=-1)
		return

	# And fix %xx escaping in the header-supplied name.
	hName = urllib.parse.unquote(hName)

	fName = "%s - %s" % (originFileName, hName)
	fName = nt.makeFilenameSafe(fName)

	fqFName = os.path.join(link["targetDir"], fName)
	self.log.info("SaveName = %s", fqFName)

	loop = 1
	while os.path.exists(fqFName):
		fName = "%s - (%d) - %s" % (originFileName, loop, hName)
		fqFName = os.path.join(link["targetDir"], fName)
		loop += 1
	self.log.info("Writing file")

	filePath, fileName = os.path.split(fqFName)

	try:
		with open(fqFName, "wb") as fp:
			fp.write(content)
	except TypeError:
		self.log.error("Failure trying to retrieve content from source %s", sourceUrl)
		self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName)
		return

	# Only dedup archives; plain image files are stored as-is.
	ext = os.path.splitext(fileName)[-1]
	imageExts = ["jpg", "png", "bmp"]
	if not any(ext.endswith(ex) for ex in imageExts):
		dedupState = processDownload.processDownload(False, fqFName, deleteDups=True)
	else:
		dedupState = ""

	self.log.info("Done")
	self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
	return
def getLink(self, link):
	sourceUrl  = link["sourceUrl"]
	seriesName = link["seriesName"]
	chapterVol = link["originName"]

	try:
		self.log.info("Should retrieve url - %s", sourceUrl)
		self.updateDbEntry(sourceUrl, dlState=1)

		imageUrls = self.getImageUrls(sourceUrl)
		if not imageUrls:
			self.log.critical("Failure retrieving content at %s", sourceUrl)
			self.log.critical("Page not found - 404")
			self.updateDbEntry(sourceUrl, dlState=-1)
			return

		self.log.info("Downloading = '%s', '%s' (%s images)", seriesName, chapterVol, len(imageUrls))
		dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

		if link["flags"] is None:
			link["flags"] = ""

		if newDir:
			self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
			self.conn.commit()

		chapterName = nt.makeFilenameSafe(chapterVol)

		fqFName = os.path.join(dlPath, chapterName + "[" + self.groupName + "].zip")

		loop = 1
		base, ext = os.path.splitext(fqFName)
		while os.path.exists(fqFName):
			# Rename against the original base so " (n)" suffixes don't accumulate.
			fqFName = "%s (%d)%s" % (base, loop, ext)
			loop += 1
		self.log.info("Saving to archive = %s", fqFName)

		images = []
		for imageName, imgUrl, referrerUrl in imageUrls:
			dummy_imageName, imageContent = self.getImage(imgUrl, referrerUrl)
			images.append([imageName, imageContent])

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				self.updateDbEntry(sourceUrl, dlState=0)
				return

		self.log.info("Creating archive with %s images", len(images))

		if not images:
			self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterVol, tags="error-404")
			return

		# Write all downloaded files to the archive.
		arch = zipfile.ZipFile(fqFName, "w")
		for imageName, imageContent in images:
			arch.writestr(imageName, imageContent)
		arch.close()

		filePath, fileName = os.path.split(fqFName)
		self.updateDbEntry(sourceUrl, downloadPath=filePath, fileName=fileName)

		dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True)
		self.log.info("Done")

		self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterVol, tags=dedupState)
		return

	except Exception:
		self.log.critical("Failure retrieving content at %s", sourceUrl)
		self.log.critical("Traceback = %s", traceback.format_exc())
		self.updateDbEntry(sourceUrl, dlState=-1)
def getLink(self, link):
	seriesName = link["seriesName"]
	# Normalize square brackets to parens for directory matching.
	seriesName = seriesName.replace("[", "(").replace("]", ")")
	safeBaseName = nt.makeFilenameSafe(link["seriesName"])

	if seriesName in nt.dirNameProxy:
		self.log.info("Have target dir for '%s' Dir = '%s'", seriesName, nt.dirNameProxy[seriesName]["fqPath"])
		link["targetDir"] = nt.dirNameProxy[seriesName]["fqPath"]
	else:
		self.log.info("Don't have target dir for: %s Using default for: %s, full name = %s", seriesName, link["seriesName"], link["originName"])
		targetDir = os.path.join(settings.mkSettings["dirs"]["mDlDir"], safeBaseName)
		if not os.path.exists(targetDir):
			try:
				os.makedirs(targetDir)
				link["targetDir"] = targetDir
				self.updateDbEntry(link["sourceUrl"], flags=" ".join([link["flags"], "newdir"]))
			except OSError:
				self.log.critical("Directory creation failed?")
				self.log.critical(traceback.format_exc())
		else:
			self.log.warning("Directory not found in dir-dict, but it exists!")
			self.log.warning("Directory-Path: %s", targetDir)
			link["targetDir"] = targetDir
			self.updateDbEntry(link["sourceUrl"], flags=" ".join([link["flags"], "haddir"]))

	sourceUrl, originFileName = link["sourceUrl"], link["originName"]

	self.log.info("Should retrieve: %s, url - %s", originFileName, sourceUrl)
	self.updateDbEntry(sourceUrl, dlState=1)

	try:
		content, hName = self.getLinkFile(sourceUrl)
	except Exception:
		self.log.error("Unrecoverable error retrieving content %s", link)
		self.log.error("Traceback: %s", traceback.format_exc())
		self.updateDbEntry(sourceUrl, dlState=-1)
		return

	# And fix %xx escaping in the header-supplied name.
	hName = urllib.parse.unquote(hName)

	fName = "%s - %s" % (originFileName, hName)
	fName = nt.makeFilenameSafe(fName)

	fqFName = os.path.join(link["targetDir"], fName)
	self.log.info("SaveName = %s", fqFName)

	loop = 1
	while os.path.exists(fqFName):
		fName = "%s - (%d) - %s" % (originFileName, loop, hName)
		fqFName = os.path.join(link["targetDir"], fName)
		loop += 1
	self.log.info("Writing file")

	filePath, fileName = os.path.split(fqFName)

	try:
		# Shave characters off the filename stem (keeping the extension)
		# until the filesystem accepts the path.
		chop = len(fileName) - 4
		wholePath = "ERROR"
		while True:
			try:
				fileName = fileName[:chop] + fileName[-4:]
				wholePath = os.path.join(filePath, fileName)
				self.log.info("Complete filepath: %s", wholePath)

				with open(wholePath, "wb") as fp:
					fp.write(content)
				self.log.info("Successfully saved to path: %s", wholePath)
				break
			except IOError:
				chop = chop - 1
				if chop < 200:
					raise RuntimeError("Don't know what's going on, but a file truncated too far!")
				self.log.warning("Truncating file length to %s characters.", chop)
	except TypeError:
		self.log.error("Failure trying to retrieve content from source %s", sourceUrl)
		self.updateDbEntry(sourceUrl, dlState=-4, downloadPath=filePath, fileName=fileName)
		return

	# Only dedup archives; plain image files are stored as-is.
	ext = os.path.splitext(fileName)[-1]
	imageExts = ["jpg", "png", "bmp"]
	if not any(ext.endswith(ex) for ex in imageExts):
		# We don't want to upload the file we just downloaded, so specify doUpload as false.
		# Use wholePath here; fqFName may be stale if the name was truncated above.
		dedupState = processDownload.processDownload(False, wholePath, deleteDups=True, doUpload=False, rowId=link["dbId"])
	else:
		dedupState = ""

	self.log.info("Done")
	self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState)
	return
def getLink(self, link):
	sourceUrl = link["sourceUrl"]

	try:
		self.log.info("Should retrieve url - %s", sourceUrl)
		self.updateDbEntry(sourceUrl, dlState=1)

		seriesName, chapterVol, imageUrls = self.getContainerPages(sourceUrl)
		if not seriesName and not chapterVol and not imageUrls:
			self.log.critical("Failure retrieving content at %s", sourceUrl)
			self.log.critical("Page not found - 404")
			self.updateDbEntry(sourceUrl, dlState=-1)
			return

		self.log.info("Downloading = '%s', '%s'", seriesName, chapterVol)
		dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName)

		if link["flags"] is None:
			link["flags"] = ""

		if newDir:
			self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"]))
			self.conn.commit()

		chapterNameRaw = " - ".join((seriesName, chapterVol))
		chapterName = nt.makeFilenameSafe(chapterNameRaw)

		fqFName = os.path.join(dlPath, chapterName + " [batoto].zip")

		loop = 1
		while os.path.exists(fqFName):
			fName = "%s - (%d).zip" % (chapterName, loop)
			fqFName = os.path.join(dlPath, fName)
			loop += 1
		self.log.info("Saving to archive = %s", fqFName)

		images = []
		for imgUrl in imageUrls:
			self.log.info("Fetching content for item: %s", imgUrl)
			imageName, imageContent = self.getImage(imgUrl, "http://bato.to/reader")
			images.append([imageName, imageContent])

			if not runStatus.run:
				self.log.info("Breaking due to exit flag being set")
				self.updateDbEntry(sourceUrl, dlState=0)
				return

		self.log.info("Creating archive with %s images", len(images))

		if not images:
			self.updateDbEntry(sourceUrl, dlState=-1, seriesName=seriesName, originName=chapterNameRaw, tags="error-404")
			return

		# Write all downloaded files to the archive.
		arch = zipfile.ZipFile(fqFName, "w")
		for imageName, imageContent in images:
			arch.writestr(imageName, imageContent)
		arch.close()

		dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=False)
		self.log.info("Done")

		filePath, fileName = os.path.split(fqFName)
		self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, seriesName=seriesName, originName=chapterNameRaw, tags=dedupState)
		return

	except Exception:
		self.log.critical("Failure retrieving content at %s", sourceUrl)
		self.log.critical("Traceback = %s", traceback.format_exc())
		self.updateDbEntry(sourceUrl, dlState=-1)
def doDownload(self, linkDict):

	images = []
	title = None
	nextPage = linkDict["dlLink"]

	# Walk the reader page-by-page, collecting one image per page.
	while nextPage:
		gatewayPage = self.wg.getpage(nextPage, addlHeaders={'Referer': linkDict["sourceUrl"]})
		soup = bs4.BeautifulSoup(gatewayPage, "lxml")

		titleCont = soup.find("div", class_="image-menu")
		title = titleCont.h1.get_text()
		title = title.replace("Reading ", "")
		title, dummy = title.rsplit(" Page ", 1)
		title = title.strip()

		imageTag = soup.find("img", class_="b")
		imageUrl = urllib.parse.urljoin(self.urlBase, imageTag["src"])
		imagePath = urllib.parse.urlsplit(imageUrl)[2]
		imageFileName = imagePath.split("/")[-1]

		imageData = self.wg.getpage(imageUrl, addlHeaders={'Referer': nextPage})
		images.append((imageFileName, imageData))

		# Find next page
		nextPageLink = soup.find("a", class_="link-next")
		if not nextPageLink:
			nextPage = None
		elif nextPageLink["href"].startswith("/finish/"):
			# Break on the last image.
			nextPage = None
		else:
			nextPage = urllib.parse.urljoin(self.urlBase, nextPageLink["href"])

	if images and title:
		fileN = title + ".zip"
		fileN = nt.makeFilenameSafe(fileN)

		wholePath = os.path.join(linkDict["dirPath"], fileN)
		self.log.info("Complete filepath: %s", wholePath)

		# Write all downloaded files to the archive.
		try:
			arch = zipfile.ZipFile(wholePath, "w")
		except OSError:
			# Retry with the title stripped down to plain ASCII.
			title = title.encode('ascii', 'ignore').decode('ascii')
			fileN = nt.makeFilenameSafe(title + ".zip")
			wholePath = os.path.join(linkDict["dirPath"], fileN)
			arch = zipfile.ZipFile(wholePath, "w")

		for imageName, imageContent in images:
			arch.writestr(imageName, imageContent)
		arch.close()

		self.log.info("Successfully saved to path: %s", wholePath)

		self.updateDbEntry(linkDict["sourceUrl"], downloadPath=linkDict["dirPath"], fileName=fileN)

		# The deduper uses the path info for relinking, so dedup the item only
		# after the downloadPath and fileName have been committed to the DB.
		dedupState = processDownload.processDownload(None, wholePath, pron=True, deleteDups=True, includePHash=True)

		self.log.info("Done")
		if dedupState:
			self.addTags(sourceUrl=linkDict["sourceUrl"], tags=dedupState)

		self.updateDbEntry(linkDict["sourceUrl"], dlState=2)
		self.conn.commit()
		return wholePath
	else:
		self.updateDbEntry(linkDict["sourceUrl"], dlState=-1, downloadPath="ERROR", fileName="ERROR: FAILED")
		self.conn.commit()
		return False
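# doDownload() above (and the other zip-writing paths in this file) fall back
# to an ASCII-only archive name when zipfile.ZipFile() raises OSError, which
# typically means the filesystem rejected characters in the title. A minimal
# standalone sketch of that fallback (openArchive is a hypothetical name, not
# part of this codebase):

import os
import zipfile

def openArchive(dirPath, title):
	try:
		return zipfile.ZipFile(os.path.join(dirPath, title + ".zip"), "w")
	except OSError:
		# Retry with the title stripped down to plain ASCII.
		asciiTitle = title.encode('ascii', 'ignore').decode('ascii')
		return zipfile.ZipFile(os.path.join(dirPath, asciiTitle + ".zip"), "w")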