def consolicateSeriesToSingleDir(): print( "Looking for series directories that can be flattened to a single dir") idLut = nt.MtNamesMapWrapper("buId->fsName") db = DbInterface() for key, luDict in nt.dirNameProxy.iteritems(): # print("Key = ", key) mId = db.getIdFromDirName(key) # Skip cases where we have no match if not mId: continue dups = set() for name in idLut[mId]: cName = nt.prepFilenameForMatching(name) # Skip if it's one of the manga names that falls apart under the directory name cleaning mechanism if not cName: continue if cName in nt.dirNameProxy: dups.add(cName) db.getIdFromDirName(cName) if len(dups) > 1: row = db.getRowByValue(buId=mId) targetName = nt.prepFilenameForMatching(row["buName"]) dest = nt.dirNameProxy[targetName] if luDict["dirKey"] != targetName and dest["fqPath"]: print("baseName = ", row["buName"], ", id = ", mId, ", names = ", dups) print(" URL: https://www.mangaupdates.com/series.html?id=%s" % (mId, )) print(" Dir 1 ", luDict["fqPath"]) print(" Dir 2 ", dest["fqPath"]) dirName = os.path.split(luDict["fqPath"])[-1] dir2Name = os.path.split(dest["fqPath"])[-1] print(" 1: ", dirName, ' ->', nt.getCanonicalMangaUpdatesName(dirName)) print(" 2: ", dir2Name, ' ->', nt.getCanonicalMangaUpdatesName(dir2Name)) print(" 1: ({num} items)".format( num=len(os.listdir(luDict["fqPath"])))) print(" 2: ({num} items)".format( num=len(os.listdir(dest["fqPath"])))) doMove = query_response( "move files ('f' dir 1 -> dir 2. 'r' dir 2 -> dir 1. 'n' do not move)?" ) if doMove == "forward": moveFiles(luDict["fqPath"], dest["fqPath"]) os.rmdir(luDict["fqPath"]) elif doMove == "reverse": moveFiles(dest["fqPath"], luDict["fqPath"]) os.rmdir(dest["fqPath"])
def test_choice(self): print("Verifying directory linking mechanism") print(nt.dirNameProxy["Kurogane"]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy["Kurogane"]["fqPath"])) print(nt.dirNameProxy["Kyoumen no Silhouette"]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy["Kyoumen no Silhouette"]["fqPath"])) print(nt.dirNameProxy["Neko Ane "]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy["Neko Ane "]["fqPath"])) print(nt.dirNameProxy["Rescue Me"]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy["Rescue Me"]["fqPath"])) print(nt.dirNameProxy["Maken Ki!"]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy["Maken Ki!"]["fqPath"])) print(nt.dirNameProxy[":REverSAL"]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy[":REverSAL"]["fqPath"])) print(nt.dirNameProxy["Silva"]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy["Silva"]["fqPath"])) print(nt.dirNameProxy["Kouya ni Kemono Doukokusu"]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy["Kouya ni Kemono Doukokusu"]["fqPath"])) print(nt.dirNameProxy["Koukaku no Regios - Missing Mail"]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy["Koukaku no Regios - Missing Mail"]["fqPath"])) print(nt.dirNameProxy["Kuraudo (NOUJOU Junichi) "]["fqPath"], nt.getCanonicalMangaUpdatesName(nt.dirNameProxy["Kuraudo (NOUJOU Junichi) "]["fqPath"]))
def consolicateSeriesToSingleDir(): print("Looking for series directories that can be flattened to a single dir") idLut = nt.MtNamesMapWrapper("buId->fsName") db = DbInterface() for key, luDict in nt.dirNameProxy.iteritems(): # print("Key = ", key) mId = db.getIdFromDirName(key) # Skip cases where we have no match if not mId: continue dups = set() for name in idLut[mId]: cName = nt.prepFilenameForMatching(name) # Skip if it's one of the manga names that falls apart under the directory name cleaning mechanism if not cName: continue if cName in nt.dirNameProxy: dups.add(cName) db.getIdFromDirName(cName) if len(dups) > 1: row = db.getRowByValue(buId=mId) targetName = nt.prepFilenameForMatching(row["buName"]) dest = nt.dirNameProxy[targetName] if luDict["dirKey"] != targetName and dest["fqPath"]: print("baseName = ", row["buName"], ", id = ", mId, ", names = ", dups) print(" URL: https://www.mangaupdates.com/series.html?id=%s" % (mId, )) print(" Dir 1 ", luDict["fqPath"]) print(" Dir 2 ", dest["fqPath"]) dirName = os.path.split(luDict["fqPath"])[-1] dir2Name = os.path.split(dest["fqPath"])[-1] print(" 1: ", dirName, ' ->', nt.getCanonicalMangaUpdatesName(dirName)) print(" 2: ", dir2Name, ' ->', nt.getCanonicalMangaUpdatesName(dir2Name)) print(" 1: ({num} items)".format(num=len(os.listdir(luDict["fqPath"])))) print(" 2: ({num} items)".format(num=len(os.listdir(dest["fqPath"])))) doMove = query_response("move files ('f' dir 1 -> dir 2. 'r' dir 2 -> dir 1. 'n' do not move)?") if doMove == "forward": moveFiles(luDict["fqPath"], dest["fqPath"]) os.rmdir(luDict["fqPath"]) elif doMove == "reverse": moveFiles(dest["fqPath"], luDict["fqPath"]) os.rmdir(dest["fqPath"])
def getChaptersFromSeriesPage(self, inUrl):
    soup = self.wg.getSoup(inUrl)
    if 'The following content is intended for mature' in soup.get_text():
        self.log.info("Adult check page. Confirming...")
        soup = self.wg.getSoup(inUrl, postData={"adult": "true"})

    mainDiv = soup.find('div', id='series_right')
    seriesName = mainDiv.h1.get_text()
    seriesName = nt.getCanonicalMangaUpdatesName(seriesName)

    # No idea why chapters are class 'staff_link'. Huh.
    chapters = mainDiv.find_all('div', class_='staff_link')
    ret = []
    for chapter in chapters:
        item = {}
        item['originName'] = "{series} - {file}".format(series=seriesName, file=chapter.a.get_text())
        item['sourceUrl'] = chapter.a['href']
        item['seriesName'] = seriesName
        item['retreivalTime'] = time.time()  # Fukkit, just use the current date.
        ret.append(item)

    return ret
def process_tree_elements(self, elements, cum_path="/"):
    ret = []
    for element in elements:
        if element['type'] == "report":
            continue
        elif element['type'] == 'directory':
            item_path = os.path.join(cum_path, element['name'])
            ret.extend(self.process_tree_elements(element['contents'], item_path))
        elif element['type'] == 'file':
            item_path = os.path.join(cum_path, element['name'])
            if any([item_path.startswith(prefix) for prefix in MASK_PATHS]):
                continue

            # Parse out the series name if we're in a directory we understand,
            # otherwise just assume the dir name is the series.
            match = re.search(r'/Manga/[^/]/[^/]{2}/[^/]{4}/([^/]+)/', item_path)
            if match:
                sname = match.group(1)
            else:
                sname = os.path.split(cum_path)[-1]

            item = {
                'source_id'   : urllib.parse.urljoin(self.url_base, item_path),
                'origin_name' : element['name'],
                'series_name' : nt.getCanonicalMangaUpdatesName(sname),
            }
            ret.append(item)
        else:
            self.log.error("Unknown element type: '%s'", element)
    return ret
def _processLinksIntoDB(self, linksDicts): self.log.info("Inserting...", ) newItems = 0 for link in linksDicts: if link is None: print("linksDicts", linksDicts) print("WAT") continue row = self.getRowsByValue(sourceUrl=link["sourceUrl"], limitByKey=False) if not row: newItems += 1 if not "dlState" in link: link['dlState'] = 0 # Patch series name. if 'seriesName' in link and self.shouldCanonize: link["seriesName"] = nt.getCanonicalMangaUpdatesName( link["seriesName"]) self.insertIntoDb(**link) self.log.info("New item: %s", link) if self.mon_con: self.mon_con.incr('new_links', newItems) self.log.info("Done (%s new items)", newItems) return newItems
def updateDbEntryById(self, rowId=None, dbId=None, commit=True, cur=None, **kwargs):
    if dbId is None:
        assert rowId is not None
        dbId = rowId

    # Patch series name.
    if "seriesName" in kwargs and kwargs["seriesName"] and self.shouldCanonize:
        kwargs["seriesName"] = nt.getCanonicalMangaUpdatesName(kwargs["seriesName"])

    query, queryArguments = self.generateUpdateQuery(dbId=dbId, **kwargs)

    if self.QUERY_DEBUG:
        print("Query = ", query)
        print("Args = ", queryArguments)

    if cur:
        cur.execute(query, queryArguments)
    else:
        with self.transaction(commit=commit) as cur:
            cur.execute(query, queryArguments)
            print("ret =", cur.rowcount)
def loadRemoteDirectory(self, fullPath, aggregate=False):
    ret = {}
    for dirName, stats in self.ftp.mlsd(fullPath):
        # Skip items that aren't directories
        if stats["type"] != "dir":
            continue

        canonName = nt.getCanonicalMangaUpdatesName(dirName)
        matchingName = nt.prepFilenameForMatching(canonName)
        fqPath = os.path.join(fullPath, dirName)

        # matchName = os.path.split(ret[matchingName])[-1]
        if matchingName in ret:
            # if aggregate:
            #     fqPath = self.aggregateDirs(fullPath, dirName, matchName)
            # else:
            if COMPLAIN_ABOUT_DUPS:
                self.log.warning("Duplicate directories for series '%s'!", canonName)
                self.log.warning("    '%s'", dirName)
                self.log.warning("    '%s'", matchingName)
            ret[matchingName] = fqPath
        else:
            ret[matchingName] = fqPath
    return ret
def getUploadDirectory(self, seriesName):
    ulDir = self.getExistingDir(seriesName)

    if not ulDir:
        seriesName = nt.getCanonicalMangaUpdatesName(seriesName)
        safeFilename = nt.makeFilenameSafe(seriesName)
        matchName = nt.prepFilenameForMatching(seriesName)
        matchName = matchName.encode('utf-8', 'ignore').decode('utf-8')

        self.checkInitDirs()
        if matchName in self.mainDirs:
            ulDir = self.mainDirs[matchName][0]
        elif seriesName in self.mainDirs:
            ulDir = self.mainDirs[seriesName][0]
        else:
            self.log.info("Need to create container directory for %s", seriesName)
            ulDir = os.path.join(settings.mkSettings["uploadContainerDir"], settings.mkSettings["uploadDir"], safeFilename)
            try:
                self.sftp.mkdir(ulDir)
            except OSError as e:
                # If the error is just a "directory exists" warning, ignore it silently.
                # str(e) does not include the exception class name, so match on the
                # message text rather than the full "OSError: ..." string.
                if 'File already exists' in str(e):
                    pass
                else:
                    self.log.warn("Error creating directory?")
                    self.log.warn(traceback.format_exc())

    return ulDir
def getUploadDirectory(self, seriesName):
    ulDir = self.getExistingDir(seriesName)

    if not ulDir:
        seriesName = nt.getCanonicalMangaUpdatesName(seriesName)
        safeFilename = nt.makeFilenameSafe(seriesName)
        matchName = nt.prepFilenameForMatching(seriesName)
        matchName = matchName.encode('latin-1', 'ignore').decode('latin-1')

        self.checkInitDirs()
        if matchName in self.unsortedDirs:
            ulDir = self.unsortedDirs[matchName]
        elif safeFilename in self.unsortedDirs:
            ulDir = self.unsortedDirs[safeFilename]
        else:
            self.log.info("Need to create container directory for %s", seriesName)
            ulDir = os.path.join(settings.mkSettings["uploadContainerDir"], settings.mkSettings["uploadDir"], safeFilename)
            try:
                self.ftp.mkd(ulDir)
            except ftplib.error_perm as e:
                # If the error is just a "directory exists" warning, ignore it silently
                if str(e).startswith("550") and str(e).endswith('File exists'):
                    pass
                else:
                    self.log.warn("Error creating directory?")
                    self.log.warn(traceback.format_exc())

    return ulDir
def getDoujinshiUploadDirectory(self, seriesName):
    ulDir = self.getExistingDir(seriesName)

    if not ulDir:
        seriesName = nt.getCanonicalMangaUpdatesName(seriesName)
        safeFilename = nt.makeFilenameSafe(seriesName)
        matchName = nt.prepFilenameForMatching(seriesName)
        matchName = matchName.encode('latin-1', 'ignore').decode('latin-1')

        self.checkInitDirs()
        if matchName in self.unsortedDirs:
            ulDir = self.unsortedDirs[matchName]
        elif safeFilename in self.unsortedDirs:
            ulDir = self.unsortedDirs[safeFilename]
        else:
            self.log.info("Need to create container directory for %s", seriesName)
            ulDir = os.path.join(settings.mkSettings["uploadContainerDir"], settings.mkSettings["uploadDir"], safeFilename)
            try:
                self.sftp.mkdir(ulDir)
            except ftplib.error_perm:
                self.log.warn("Directory exists?")
                self.log.warn(traceback.format_exc())

    return ulDir
def locateOrCreateDirectoryForSeries(self, seriesName):
    if self.shouldCanonize and self.is_manga:
        canonSeriesName = nt.getCanonicalMangaUpdatesName(seriesName)
    else:
        canonSeriesName = seriesName

    safeBaseName = nt.makeFilenameSafe(canonSeriesName)
    targetDir = os.path.join(settings.mkSettings["dirs"]['bookDir'], safeBaseName)
    if not os.path.exists(targetDir):
        self.log.info("Don't have target dir for: %s, full name = %s", canonSeriesName, seriesName)
        try:
            os.makedirs(targetDir)
            return targetDir, True
        except FileExistsError:
            # Probably means the directory was concurrently created by another thread in the background?
            self.log.critical("Directory doesn't exist, and yet it does?")
            self.log.critical(traceback.format_exc())
        except OSError:
            self.log.critical("Directory creation failed?")
            self.log.critical(traceback.format_exc())
    else:
        self.log.info("Directory exists.")
        self.log.info("Directory not found in dir-dict, but it exists!")
        self.log.info("Directory-Path: %s", targetDir)
        self.log.info("Base series name: %s", seriesName)
        self.log.info("Canonized series name: %s", canonSeriesName)
        self.log.info("Safe canonized name: %s", safeBaseName)
    return targetDir, False
def updateDbEntry(self, sourceUrl, commit=True, **kwargs):
    cur = kwargs.pop('cur', None)

    # Patch series name.
    if "seriesName" in kwargs and kwargs["seriesName"] and self.shouldCanonize:
        kwargs["seriesName"] = nt.getCanonicalMangaUpdatesName(kwargs["seriesName"])

    # Clamp the retreivalTime to now, so parsing issues that result in invalid, future
    # time-stamps don't cause posts to stick to the top of the post list.
    if 'retreivalTime' in kwargs:
        if kwargs['retreivalTime'] > time.time():
            kwargs['retreivalTime'] = time.time()

    query, queryArguments = self.generateUpdateQuery(sourceUrl=sourceUrl, **kwargs)

    if self.QUERY_DEBUG:
        print("Query = ", query)
        print("Args = ", queryArguments)

    if cur is not None:
        cur.execute(query, queryArguments)
    else:
        with self.transaction(commit=commit) as cur:
            cur.execute(query, queryArguments)
def updateSeriesDbEntryById(self, rowId, commit=True, **kwargs): # Patch series name. if "seriesName" in kwargs and kwargs["seriesName"]: kwargs["seriesName"] = nt.getCanonicalMangaUpdatesName(kwargs["seriesName"]) queries = [] qArgs = [] for key in kwargs.keys(): if key not in self.validSeriesKwargs: raise ValueError("Invalid keyword argument: %s" % key) else: queries.append("{k}=%s".format(k=key)) qArgs.append(kwargs[key]) qArgs.append(rowId) column = ", ".join(queries) query = '''UPDATE {tableName} SET {v} WHERE dbId=%s;'''.format(tableName=self.seriesTableName, v=column) if QUERY_DEBUG: print("Query = ", query) print("Args = ", qArgs) with self.conn.cursor() as cur: if commit: cur.execute("BEGIN;") cur.execute(query, qArgs) if commit: cur.execute("COMMIT;")
def aggregateDirs(self, pathBase_1, pathBase_2, dir1, dir2):
    canonName = nt.getCanonicalMangaUpdatesName(dir1)
    canonNameAlt = nt.getCanonicalMangaUpdatesName(dir2)
    cname1 = nt.prepFilenameForMatching(canonName)
    cname2 = nt.prepFilenameForMatching(canonNameAlt)

    if canonName.lower() != canonNameAlt.lower():
        self.log.critical("Error in uploading file. Name lookup via MangaUpdates table not commutative!")
        self.log.critical("First returned value '%s'", canonName)
        self.log.critical("For directory with path '%s'", dir1)
        self.log.critical("Second returned value '%s'", canonNameAlt)
        self.log.critical("For directory with path '%s'", dir2)
        self.log.critical("After cleaning: '%s', '%s', equal: '%s'", cname1, cname2, cname1 == cname2)
        raise CanonMismatch("Identical and yet not? '%s' - '%s'" % (canonName, canonNameAlt))

    self.log.info("Aggregating directories for canon name '%s':", canonName)
    n1 = lv.distance(dir1, canonName)
    n2 = lv.distance(dir2, canonName)
    self.log.info("    %s - '%s'", n1, dir1)
    self.log.info("    %s - '%s'", n2, dir2)

    # I'm using less than or equal, so situations where
    # both names are equidistant get aggregated anyways.
    if n1 <= n2:
        src = os.path.join(pathBase_2, dir2)
        dst = os.path.join(pathBase_1, dir1)
    else:
        src = os.path.join(pathBase_1, dir1)
        dst = os.path.join(pathBase_2, dir2)

    self.moveItemsInDir(src, dst)
    self.log.info("Removing directory '%s'", src)

    try:
        self.sftp.mkdir("/Admin cleanup/autoclean dirs")
    except Exception:
        # The cleanup directory probably already exists.
        pass

    self.sftp.rename(src, "/Admin cleanup/autoclean dirs/garbage dir %s" % src.replace("/", ";").replace(" ", "_"))
    return dst
def getItemsFromContainer(self, dirName, dirUrl): # Skip the needs sorting directory. if dirName == 'Needs sorting': return [], [] if dirName == 'Admin Cleanup': return [], [] if dirName == 'Raws': return [], [] if dirName == 'Requests': return [], [] if dirName == '_Autouploads': return [], [] self.log.info("Original name - %s", dirName) bracketStripRe = re.compile(r"(\[.*?\])") dirName = bracketStripRe.sub(" ", dirName) while dirName.find(" ")+1: dirName = dirName.replace(" ", " ") dirName = dirName.strip() if not dirName: self.log.critical("Empty dirname = '%s', baseURL = '%s'", dirName, dirUrl) raise ValueError("No dir name for directory!") dirName = nt.getCanonicalMangaUpdatesName(dirName) self.log.info("Canonical name - %s", dirName) self.log.info("Fetching items for directory '%s'", dirName) self.log.info("Using URL '%s'", dirUrl) try: itemPage = self.wg.getpage(dirUrl) except urllib.error.URLError: self.log.error("Could not fetch page '%s'", dirUrl) return [], [] soup = bs4.BeautifulSoup(itemPage) itemRet = [] dirRet = [] for row in soup.find_all("tr"): dirDat, itemDat = self.parseRow(row, dirUrl, dirName) if dirDat: dirRet.append(dirDat) if itemDat: itemRet.append(itemDat) return dirRet, itemRet
def extractFilename(self, inString):
    title, dummy_blurb = inString.rsplit("|", 1)
    # title, chapter = title.rsplit("-", 1)

    # Unescape html-escaped items in the name/chapter.
    # html.unescape() replaces the deprecated HTMLParser().unescape() call.
    title = html.unescape(title)

    vol = None
    chap = None
    volChap = None

    try:
        if " vol " in title.lower():
            title, volChap = title.rsplit(" vol ", 1)
            vol, dummy = volChap.strip().split(" ", 1)
    except ValueError:
        self.log.error("Could not parse volume number from title %s", title)
        traceback.print_exc()

    try:
        if volChap and " ch " in volChap:
            dummy, chap = volChap.rsplit(" ch ", 1)
        elif " ch " in title:
            title, chap = title.rsplit(" ch ", 1)
    except ValueError:
        self.log.error("Could not parse chapter number from title %s", title)
        traceback.print_exc()

    if chap:
        if "Page" in chap:
            chap, dummy = chap.split("Page", 1)
        elif title and "Page" in title:
            title, dummy = title.split("Page", 1)

    title = title.rstrip(" -")

    # haveLookup = nt.haveCanonicalMangaUpdatesName(title)
    # if not haveLookup:
    #     self.log.warning("Did not find title '%s' in MangaUpdates database!", title)

    title = nt.getCanonicalMangaUpdatesName(title).strip()

    volChap = []
    if vol:
        volChap.append("v{}".format(vol))
    if chap:
        volChap.append("c{}".format(chap))

    chapter = " ".join(volChap)
    return title, chapter.strip()
def two_arg_lookup(val):
    print("Passed name = '%s'" % val)
    import nameTools as nt
    haveLookup = nt.haveCanonicalMangaUpdatesName(val)
    if not haveLookup:
        print("Item not found in MangaUpdates name synonym table")
        print("Processed item as searched = '%s'" % nt.prepFilenameForMatching(val))
    else:
        print("Item found in lookup table!")
        print("Canonical name = '%s'" % nt.getCanonicalMangaUpdatesName(val))
def test_choice(self): print("Verifying directory linking mechanism") print( nt.dirNameProxy["Kurogane"]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy["Kurogane"]["fqPath"])) print( nt.dirNameProxy["Kyoumen no Silhouette"]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy["Kyoumen no Silhouette"]["fqPath"])) print( nt.dirNameProxy["Neko Ane "]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy["Neko Ane "]["fqPath"])) print( nt.dirNameProxy["Rescue Me"]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy["Rescue Me"]["fqPath"])) print( nt.dirNameProxy["Maken Ki!"]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy["Maken Ki!"]["fqPath"])) print( nt.dirNameProxy[":REverSAL"]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy[":REverSAL"]["fqPath"])) print( nt.dirNameProxy["Silva"]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy["Silva"]["fqPath"])) print( nt.dirNameProxy["Kouya ni Kemono Doukokusu"]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy["Kouya ni Kemono Doukokusu"]["fqPath"])) print( nt.dirNameProxy["Koukaku no Regios - Missing Mail"]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy["Koukaku no Regios - Missing Mail"]["fqPath"])) print( nt.dirNameProxy["Kuraudo (NOUJOU Junichi) "]["fqPath"], nt.getCanonicalMangaUpdatesName( nt.dirNameProxy["Kuraudo (NOUJOU Junichi) "]["fqPath"]))
def aggregateDirs(self, pathBase, dir1, dir2):
    canonName = nt.getCanonicalMangaUpdatesName(dir1)
    canonNameAlt = nt.getCanonicalMangaUpdatesName(dir2)

    if canonName.lower() != canonNameAlt.lower():
        self.log.critical("Error in uploading file. Name lookup via MangaUpdates table not commutative!")
        self.log.critical("First returned value '%s'", canonName)
        self.log.critical("For directory with path '%s'", dir1)
        self.log.critical("Second returned value '%s'", canonNameAlt)
        self.log.critical("For directory with path '%s'", dir2)
        raise ValueError("Identical and yet not? '%s' - '%s'" % (canonName, canonNameAlt))

    self.log.info("Aggregating directories for canon name '%s':", canonName)
    n1 = lv.distance(dir1, canonName)
    n2 = lv.distance(dir2, canonName)
    self.log.info("    %s - '%s'", n1, dir1)
    self.log.info("    %s - '%s'", n2, dir2)

    # I'm using less than or equal, so situations where
    # both names are equidistant get aggregated anyways.
    if n1 <= n2:
        src = dir2
        dst = dir1
    else:
        src = dir1
        dst = dir2

    src = os.path.join(pathBase, src)
    dst = os.path.join(pathBase, dst)

    self.moveItemsInDir(src, dst)
    self.log.info("Removing directory '%s'", src)
    # self.ftp.rmd(src)
    # self.ftp.rename(src, "/Admin Cleanup/garbage dir %s" % id(src))
    return dst
def get_link(self, link_row_id):
    with self.row_context(dbid=link_row_id) as row:
        series_name = row.series_name
        chapter_name = row.origin_name
        source_url = row.source_id
        row.state = 'fetching'

    try:
        self.log.info("Downloading = '%s', '%s'", series_name, chapter_name)
        file_contents, name_from_source = self.wg.getFileAndName(source_url, addlHeaders={'Referer': 'https://mangazuki.co/'})

        series_name = nt.getCanonicalMangaUpdatesName(series_name)
        dlPath, newDir = self.locateOrCreateDirectoryForSeries(series_name)

        if name_from_source.endswith(".zip"):
            name_from_source = name_from_source[:-4]

        fname = "{} - {} [MangaZuki].zip".format(chapter_name, name_from_source)
        fqFName = os.path.join(dlPath, fname)

        # This call also inserts the file parameters into the row
        with self.row_sess_context(dbid=link_row_id) as row_tup:
            row, sess = row_tup
            row.dirstate = "had_dir" if newDir is False else 'created_dir'
            fqFName = self.save_archive(row, sess, fqFName, file_contents)

        with self.row_context(dbid=link_row_id) as row:
            row.state = 'processing'

        # We don't want to upload the file we just downloaded, so specify doUpload as false.
        # As a result of this, the seriesName parameter also no longer matters.
        self.processDownload(seriesName=False, archivePath=fqFName, doUpload=False)

        self.log.info("Done")
        with self.row_context(dbid=link_row_id) as row:
            row.state = 'complete'
            row.downloaded_at = datetime.datetime.now()
            row.last_checked = datetime.datetime.now()

    except Exception:
        self.log.critical("Failure on retrieving content at %s", source_url)
        self.log.critical("Traceback = %s", traceback.format_exc())
        with self.row_context(dbid=link_row_id) as row:
            row.state = 'error'
            row.err_str = traceback.format_exc()
        raise
def getSeries(self, markup):
    soup = bs4.BeautifulSoup(markup, "lxml")
    title = soup.find("h3", id='chapter-title')
    if title.b.find('a'):
        title = title.b.a.get_text()
    else:
        title = title.b.get_text()
    title = nt.getCanonicalMangaUpdatesName(title)
    print("Title '%s'" % title)
    return title
def updateDbEntryById(self, rowId, commit=True, **kwargs): # Patch series name. if "seriesName" in kwargs and kwargs["seriesName"] and self.shouldCanonize: kwargs["seriesName"] = nt.getCanonicalMangaUpdatesName(kwargs["seriesName"]) query, queryArguments = self.generateUpdateQuery(dbId=rowId, **kwargs) if self.QUERY_DEBUG: print("Query = ", query) print("Args = ", queryArguments) with self.transaction(commit=commit) as cur: cur.execute(query, queryArguments)
def get_link(self, link_row_id):
    with self.row_context(dbid=link_row_id) as row:
        series_name = row.series_name
        chapter_name = row.origin_name
        source_url = row.source_id
        row.state = 'fetching'

    series_name = nt.getCanonicalMangaUpdatesName(series_name)

    self.log.info("Should retrieve url - %s", source_url)
    images = self.proceduralGetImages(source_url)

    if not images:
        self.log.critical("Failure on retrieving content at %s", source_url)
        self.log.critical("Page not found - 404")
        with self.row_context(dbid=link_row_id) as row:
            row.state = 'error'
            row.err_str = "error-404"
        return

    # Prefix each image name with its sequence number. Build a new list here,
    # rather than appending to the list being iterated over (which would never
    # terminate).
    numbered = []
    imgCnt = 1
    for imageName, imageContent in images:
        imageName = "{num:03.0f} - {srcName}".format(num=imgCnt, srcName=imageName)
        imgCnt += 1
        numbered.append([imageName, imageContent])

        if not runStatus.run:
            self.log.info("Breaking due to exit flag being set")
            with self.row_context(dbid=link_row_id) as row:
                row.state = 'new'
            return
    images = numbered

    if not images:
        self.log.error("No images! Download failed?")
        with self.row_context(dbid=link_row_id) as row:
            row.state = 'error'
            row.err_str = "error-404"
        return

    self.save_manga_image_set(link_row_id, series_name, chapter_name, images, source_name='MangaHere')
def updateDbEntry(self, sourceUrl, commit=True, **kwargs): # Patch series name. if "seriesName" in kwargs and kwargs["seriesName"] and self.shouldCanonize: kwargs["seriesName"] = nt.getCanonicalMangaUpdatesName(kwargs["seriesName"]) query, queryArguments = self.generateUpdateQuery(sourceUrl=sourceUrl, **kwargs) if self.QUERY_DEBUG: print("Query = ", query) print("Args = ", queryArguments) with self.conn.cursor() as cur: with transaction(cur, commit=commit): cur.execute(query, queryArguments)
def updateDbEntryById(self, rowId, commit=True, **kwargs): # Patch series name. if "seriesName" in kwargs and kwargs["seriesName"] and self.shouldCanonize: kwargs["seriesName"] = nt.getCanonicalMangaUpdatesName(kwargs["seriesName"]) query, queryArguments = self.generateUpdateQuery(dbId=rowId, **kwargs) if self.QUERY_DEBUG: print("Query = ", query) print("Args = ", queryArguments) with self.conn.cursor() as cur: with transaction(cur, commit=commit): cur.execute(query, queryArguments)
def aggregateDirs(self, pathBase, dir1, dir2):
    canonName = nt.getCanonicalMangaUpdatesName(dir1)
    canonNameAlt = nt.getCanonicalMangaUpdatesName(dir2)

    if canonName != canonNameAlt:
        self.log.critical("Error in uploading file. Name lookup via MangaUpdates table not commutative!")
        self.log.critical("First returned value '%s'", canonName)
        self.log.critical("For directory with path '%s'", dir1)
        self.log.critical("Second returned value '%s'", canonNameAlt)
        self.log.critical("For directory with path '%s'", dir2)
        raise ValueError("Identical and yet not?")

    self.log.info("Aggregating directories for canon name '%s':", canonName)
    n1 = lv.distance(dir1, canonName)
    n2 = lv.distance(dir2, canonName)
    self.log.info("    %s - '%s'", n1, dir1)
    self.log.info("    %s - '%s'", n2, dir2)

    # I'm using less than or equal, so situations where
    # both names are equidistant get aggregated anyways.
    if n1 <= n2:
        src = dir2
        dst = dir1
    else:
        src = dir1
        dst = dir2

    src = os.path.join(pathBase, src)
    dst = os.path.join(pathBase, dst)

    self.moveItemsInDir(src, dst)
    self.log.info("Removing directory '%s'", src)
    self.ftp.rmd(src)
    return dst
def _process_links_into_db(self, linksDicts): self.log.info( "Inserting...") newItems = 0 with self.db.session_context() as sess: for link in linksDicts: self._check_keys(link) tags = link.pop("tags", []) assert isinstance(tags, (list, tuple)), "tags must be a list or tuple!" if 'series_name' in link and self.shouldCanonize: link["series_name"] = nt.getCanonicalMangaUpdatesName(link["series_name"]) have = sess.query(self.target_table) \ .filter(self.target_table.source_site == self.plugin_key) \ .filter(self.target_table.source_id == link["source_id"]) \ .scalar() if not have: newItems += 1 have = self.target_table( state = 'new', # Should be set automatically. source_site = self.plugin_key, first_seen = datetime.datetime.now(), **link ) sess.add(have) if newItems % 10000 == 0: self.log.info("Added %s rows, doing incremental commit!", newItems) sess.commit() try: self.update_tags(tags=tags, row=have) except ScrapeExceptions.UnwantedContentError: self.log.info("How does something have masked tags on insertion?") sess.delete(have) if self.mon_con: self.mon_con.incr('new_links', newItems) self.log.info( "Done (%s new items, %s total)", newItems, len(linksDicts)) return newItems
def getFeed(self):
    treedata = self.wg.getJson(self.tree_api)
    assert 'contents' in treedata
    assert treedata['name'] == 'mango'
    assert treedata['type'] == 'directory'
    data_unfiltered = self.process_tree_elements(treedata['contents'])

    data = []
    for sName, filen in data_unfiltered:
        if not any([filen.startswith(prefix) for prefix in MASK_PATHS]):
            assert filen.startswith(STRIP_PREFIX)
            filen = filen[len(STRIP_PREFIX):]
            sName = nt.getCanonicalMangaUpdatesName(sName)
            data.append((sName, filen))
    return data
def getFeed(self):
    treedata = self.wg.getJson(self.tree_api)
    assert 'contents' in treedata
    assert treedata['name'] == 'mango'
    assert treedata['type'] == 'directory'
    data_unfiltered = self.process_tree_elements(treedata['contents'])

    data = []
    for sName, filen in data_unfiltered:
        assert filen.startswith(STRIP_PREFIX)
        filen = filen[len(STRIP_PREFIX):]
        if not any([filen.startswith(prefix) for prefix in MASK_PATHS]):
            sName = nt.getCanonicalMangaUpdatesName(sName)
            data.append((sName, filen))
    return data
def getDownloadPath(self, item, fName): if not item['seriesName']: self.log.info("No series set for item. Guessing from filename:") self.log.info("Filename = '%s'", fName) bareName = nt.guessSeriesFromFilename(fName) # if not matchName or not matchName in nt.dirNameProxy: if not nt.haveCanonicalMangaUpdatesName(bareName): item["seriesName"] = settings.ircBot["unknown-series"] else: item["seriesName"] = nt.getCanonicalMangaUpdatesName(bareName) self.log.info("Guessed = '%s'. Updating series information", item['seriesName']) self.updateDbEntry(item["sourceUrl"], seriesName=item["seriesName"]) dlPath, newDir = self.locateOrCreateDirectoryForSeries( item["seriesName"]) if item["flags"] == None: item["flags"] = "" if newDir: self.updateDbEntry(item["sourceUrl"], flags=" ".join([item["flags"], "haddir"])) self.conn.commit() fqFName = os.path.join(dlPath, fName) loop = 1 fName, ext = os.path.splitext(fName) while os.path.exists(fqFName): fName = "%s - (%d).%s" % (fName, loop, ext) fqFName = os.path.join(dlPath, fName) loop += 1 self.log.info("Saving to archive = %s", fqFName) self.updateDbEntry(item["sourceUrl"], downloadPath=dlPath, fileName=fName, originName=fName) return fqFName
def processLinksIntoDB(self, linksDicts): self.log.info( "Inserting...",) newItems = 0 for link in linksDicts: if link is None: print("linksDicts", linksDicts) print("WAT") row = self.getRowsByValue(sourceUrl=link["sourceUrl"], limitByKey=False) if not row: newItems += 1 if not "dlState" in link: link['dlState'] = 0 # Patch series name. if 'seriesName' in link and self.shouldCanonize: link["seriesName"] = nt.getCanonicalMangaUpdatesName(link["seriesName"]) # Using fancy dict hijinks now. Old call below for reference. # self.insertIntoDb(retreivalTime = link["date"], # sourceUrl = link["dlLink"], # originName = link["dlName"], # dlState = 0, # seriesName = link["baseName"], # flags = flagStr) self.insertIntoDb(**link) self.log.info("New item: %s", link) self.log.info( "Done") self.log.info( "Committing...",) self.conn.commit() self.log.info( "Committed") return newItems
def findIfMigrated(self, filePath):
    dirPath, fileName = os.path.split(filePath)
    series = dirPath.split("/")[-1]
    series = nt.getCanonicalMangaUpdatesName(series)
    otherDir = nt.dirNameProxy[series]
    if not otherDir["fqPath"]:
        return False
    if otherDir["fqPath"] == dirPath:
        return False
    newPath = os.path.join(otherDir["fqPath"], fileName)
    if os.path.exists(newPath):
        print("File moved!")
        return otherDir["fqPath"]
    return False
def locateOrCreateDirectoryForSeries(self, seriesName): if self.shouldCanonize: canonSeriesName = nt.getCanonicalMangaUpdatesName(seriesName) else: canonSeriesName = seriesName safeBaseName = nt.makeFilenameSafe(canonSeriesName) if canonSeriesName in nt.dirNameProxy: self.log.info("Have target dir for '%s' Dir = '%s'", canonSeriesName, nt.dirNameProxy[canonSeriesName]['fqPath']) return nt.dirNameProxy[canonSeriesName]["fqPath"], False else: self.log.info("Don't have target dir for: %s, full name = %s", canonSeriesName, seriesName) targetDir = os.path.join(settings.baseDir, safeBaseName) if not os.path.exists(targetDir): try: os.makedirs(targetDir) return targetDir, True except FileExistsError: # Probably means the directory was concurrently created by another thread in the background? self.log.critical( "Directory doesn't exist, and yet it does?") self.log.critical(traceback.format_exc()) pass except OSError: self.log.critical("Directory creation failed?") self.log.critical(traceback.format_exc()) else: self.log.warning( "Directory not found in dir-dict, but it exists!") self.log.warning("Directory-Path: %s", targetDir) self.log.warning("Base series name: %s", seriesName) self.log.warning("Canonized series name: %s", canonSeriesName) self.log.warning("Safe canonized name: %s", safeBaseName) return targetDir, False
def updateDbEntry(self, sourceUrl, commit=True, **kwargs): # Patch series name. if "seriesName" in kwargs and kwargs["seriesName"] and self.shouldCanonize: kwargs["seriesName"] = nt.getCanonicalMangaUpdatesName(kwargs["seriesName"]) # Clamp the retreivaltime to now, so parsing issues that result in invalid, future # time-stamps don't cause posts to stick to the top of the post list. if "retreivalTime" in kwargs: if kwargs["retreivalTime"] > time.time(): kwargs["retreivalTime"] = time.time() query, queryArguments = self.generateUpdateQuery(sourceUrl=sourceUrl, **kwargs) if self.QUERY_DEBUG: print("Query = ", query) print("Args = ", queryArguments) with self.conn.cursor() as cur: with transaction(cur, commit=commit): cur.execute(query, queryArguments)
def loadRemoteDirectory(self, fullPath, aggregate=False):
    ret = {}
    for dirName, stats in self.ftp.mlsd(fullPath):
        dirName = ftfy.fix_text(dirName)

        # Skip items that aren't directories
        if stats["type"] != "dir":
            continue

        canonName = nt.getCanonicalMangaUpdatesName(dirName)
        matchingName = nt.prepFilenameForMatching(canonName)
        fqPath = os.path.join(fullPath, dirName)

        if matchingName in ret:
            if aggregate:
                matchName = os.path.split(ret[matchingName])[-1]
                try:
                    fqPath = self.aggregateDirs(fullPath, dirName, matchName)
                except ValueError:
                    traceback.print_exc()
                except ftplib.error_perm:
                    traceback.print_exc()
            else:
                if COMPLAIN_ABOUT_DUPS:
                    self.log.warning("Duplicate directories for series '%s'!", canonName)
                    self.log.warning("    '%s/%s'", fullPath, dirName)
                    self.log.warning("    '%s/%s'", fullPath, matchingName)
                ret[matchingName] = fqPath
        else:
            ret[matchingName] = fqPath
    return ret
def _processLinksIntoDB(self, linksDicts): self.log.info( "Inserting...",) newItems = 0 for link in linksDicts: if link is None: print("linksDicts", linksDicts) print("WAT") continue row = self.getRowsByValue(sourceUrl=link["sourceUrl"], limitByKey=False) if not row: newItems += 1 if not "dlState" in link: link['dlState'] = 0 # Patch series name. if 'seriesName' in link and self.shouldCanonize: link["seriesName"] = nt.getCanonicalMangaUpdatesName(link["seriesName"]) self.insertIntoDb(**link) self.log.info("New item: %s", link) if self.mon_con: self.mon_con.incr('new_links', newItems) self.log.info( "Done (%s new items)", newItems) return newItems
def updateSeriesDbEntryById(self, rowId, commit=True, **kwargs): # Patch series name. if "seriesName" in kwargs and kwargs["seriesName"]: kwargs["seriesName"] = nt.getCanonicalMangaUpdatesName( kwargs["seriesName"]) queries = [] qArgs = [] for key in kwargs.keys(): if key not in self.validSeriesKwargs: raise ValueError("Invalid keyword argument: %s" % key) else: queries.append("{k}=%s".format(k=key)) qArgs.append(kwargs[key]) qArgs.append(rowId) column = ", ".join(queries) query = '''UPDATE {tableName} SET {v} WHERE dbId=%s;'''.format( tableName=self.seriesTableName, v=column) if QUERY_DEBUG: print("Query = ", query) print("Args = ", qArgs) with self.context_cursor() as cur: if commit: cur.execute("BEGIN;") cur.execute(query, qArgs) if commit: cur.execute("COMMIT;")
def consolidateMangaFolders(dirPath, smartMode=True): idLut = nt.MtNamesMapWrapper("fsName->buId") pc = PathCleaner() count = 0 print("Dir", dirPath) items = os.listdir(dirPath) items.sort() for item in items: item = os.path.join(dirPath, item) if os.path.isdir(item): fPath, dirName = os.path.split(item) lookup = nt.dirNameProxy[dirName] if lookup["fqPath"] != item: print() print() print("------------------------------------------------------") canonName = nt.getCanonicalMangaUpdatesName(dirName) print("Duplicate Directory '%s' - Canon = '%s'" % (dirName, canonName)) count += 1 mtId = idLut[nt.prepFilenameForMatching(dirName)] for num in mtId: print(" URL: https://www.mangaupdates.com/series.html?id=%s" % (num, )) fPath, dir2Name = os.path.split(lookup["fqPath"]) if not os.path.exists(item): print("'%s' has been removed. Skipping" % item) continue if not os.path.exists(lookup["fqPath"]): print("'%s' has been removed. Skipping" % lookup["fqPath"]) continue n1 = lv.distance(dirName, canonName) n2 = lv.distance(dir2Name, canonName) r1 = abs(nt.extractRatingToFloat(dirName)) r2 = abs(nt.extractRatingToFloat(dir2Name)) if "[complete]" in dirName.lower(): r1 += 0.1 if "[complete]" in dir2Name.lower(): r2 += 0.1 if "[wtf]" in dirName.lower(): r1 += 0.2 if "[wtf]" in dir2Name.lower(): r2 += 0.2 print(" 1: ", item) print(" 2: ", lookup["fqPath"]) print(" 1: ", dirName, ' ->', nt.getCanonicalMangaUpdatesName(dirName)) print(" 2: ", dir2Name, ' ->', nt.getCanonicalMangaUpdatesName(dir2Name)) print(" 1: ({num} items)(distance {dist})(rating {rat})".format(num=len(os.listdir(item)), dist=n1, rat=r1)) print(" 2: ({num} items)(distance {dist})(rating {rat})".format(num=len(os.listdir(lookup["fqPath"])), dist=n2, rat=r2)) mtId2 = idLut[nt.prepFilenameForMatching(dir2Name)] if mtId != mtId2: print("DISCORDANT ID NUMBERS - {num1}, {num2}!".format(num1=mtId, num2=mtId2)) for num in mtId2: print(" URL: https://www.mangaupdates.com/series.html?id=%s" % (num, )) continue if r1 > r2: doMove = "reverse" elif r2 > r1: doMove = "forward" else: doMove = '' if not doMove or not smartMode: doMove = query_response("move files ('f' dir 1 -> dir 2. 'r' dir 1 <- dir 2. 'l' use levenshtein distance. 'n' do not move)?") if doMove == "forward": print("Forward move") fromDir = item toDir = lookup["fqPath"] elif doMove == "reverse": print("Reverse move") fromDir = lookup["fqPath"] toDir = item elif doMove == "levenshtein": print("Levenshtein distance chooser") # I'm using less then or equal, so situations where # both names are equadistant get aggregated anyways. if n1 <= n2: fromDir = lookup["fqPath"] toDir = item else: fromDir = item toDir = lookup["fqPath"] else: print("Skipping") continue print("moving from: '%s' " % fromDir) print(" to: '%s' " % toDir) items = os.listdir(fromDir) for item in items: fromPath = os.path.join(fromDir, item) toPath = os.path.join(toDir, item) loop = 2 while os.path.exists(toPath): pathBase, ext = os.path.splitext(toPath) print(" Duplicate file!") toPath = "{start} ({loop}){ext}".format(start=pathBase, loop=loop, ext=ext) print(" Moving: ", item) print(" From: ", fromPath) print(" To: ", toPath) pc.moveFile(fromPath, toPath) try: pc.moveFile(fromPath, toPath) except psycopg2.IntegrityError: print("Error moving item in dedup database") # pc.deletePath(toPath) shutil.move(fromPath, toPath) print("Deleting directory") os.rmdir(fromDir) print("total items", count)
def getLink(self, link): sourceUrl = link["sourceUrl"] print("Link", link) seriesName = link['seriesName'] try: self.log.info( "Should retreive url - %s", sourceUrl) self.updateDbEntry(sourceUrl, dlState=1) seriesName = nt.getCanonicalMangaUpdatesName(seriesName) self.log.info("Downloading = '%s', '%s'", seriesName, link["originName"]) dlPath, newDir = self.locateOrCreateDirectoryForSeries(seriesName) if link["flags"] == None: link["flags"] = "" if newDir: self.updateDbEntry(sourceUrl, flags=" ".join([link["flags"], "haddir"])) self.conn.commit() chapterName = nt.makeFilenameSafe(link["originName"]) fqFName = os.path.join(dlPath, chapterName+" [MangaHere].zip") loop = 1 prefix, ext = os.path.splitext(fqFName) while os.path.exists(fqFName): fqFName = "%s (%d)%s" % (prefix, loop, ext) loop += 1 self.log.info("Saving to archive = %s", fqFName) images = self.proceduralGetImages(sourceUrl) self.log.info("Creating archive with %s images", len(images)) if not images: self.updateDbEntry(sourceUrl, dlState=-1, tags="error-404") return #Write all downloaded files to the archive. arch = zipfile.ZipFile(fqFName, "w") for imageName, imageContent in images: arch.writestr(imageName, imageContent) arch.close() dedupState = processDownload.processDownload(seriesName, fqFName, deleteDups=True, includePHash=True) self.log.info( "Done") filePath, fileName = os.path.split(fqFName) self.updateDbEntry(sourceUrl, dlState=2, downloadPath=filePath, fileName=fileName, tags=dedupState) return except Exception: self.log.critical("Failure on retreiving content at %s", sourceUrl) self.log.critical("Traceback = %s", traceback.format_exc()) self.updateDbEntry(sourceUrl, dlState=-1) raise
def loadRemoteDirectory(self, fullPath, aggregate=False):
    ret = {}

    dirs = self.wg.getpage("https://manga.madokami.al/stupidapi/fakedirs")

    requirePrefix = splitall(fullPath)

    badwords = [
        'Non-English',
        'Oneshots',
        'Raws',
        'Novels',
        '_Doujinshi',
        'AutoUploaded from Assorted Sources',
    ]

    # Normalize the raw listing: strip any leading "./", split each path into
    # its components, and keep only entries that fall under the required prefix
    # and don't contain any of the excluded path segments.
    lines = [item[1:] if item.startswith("./") else item for item in dirs.split("\n")]
    rows = [
        tmp for tmp in [splitall(item) for item in lines]
        if (
            len(tmp) >= len(requirePrefix)
            and all([tmp[x] == requirePrefix[x] for x in range(len(requirePrefix))])
            and not any([badword in tmp for badword in badwords])
        )
    ]

    print(len(rows))

    for line in rows:
        if len(line) == 6:
            dirName = line[-1]
            if not dirName:
                continue

            canonName = nt.getCanonicalMangaUpdatesName(dirName)
            matchingName = nt.prepFilenameForMatching(canonName)

            # prepFilenameForMatching can result in empty directory names in some cases.
            # Detect that, and don't bother with it if that happened.
            if not matchingName:
                continue

            fqPath = os.path.join(*line)
            fullPath = os.path.join(*line[:-1])

            if matchingName in ret:
                tmp = ret[matchingName]
                matchpath, matchName = os.path.split(tmp[-1])
                if isinstance(tmp, list):
                    tmp = tmp.pop()
                if aggregate:
                    try:
                        fqPath = self.aggregateDirs(fullPath, matchpath, dirName, matchName)
                    except CanonMismatch:
                        pass
                    except ValueError:
                        traceback.print_exc()
                    except ftplib.error_perm:
                        traceback.print_exc()
                    except PermissionError:
                        traceback.print_exc()
                else:
                    if COMPLAIN_ABOUT_DUPS:
                        self.log.warning("Duplicate directories for series '%s'!", canonName)
                        self.log.warning("    '%s/%s'", fullPath, dirName)
                        self.log.warning("    '%s/%s'", matchpath, matchName)
                ret[matchingName].append(fqPath)
            else:
                ret[matchingName] = [fqPath]
    return ret
def consolidateMangaFolders(dirPath, smartMode=True): idLut = nt.MtNamesMapWrapper("fsName->buId") pc = PathCleaner() pc.openDB() count = 0 print("Dir", dirPath) items = os.listdir(dirPath) items.sort() for item in items: item = os.path.join(dirPath, item) if os.path.isdir(item): fPath, dirName = os.path.split(item) lookup = nt.dirNameProxy[dirName] if lookup["fqPath"] != item: print() print() print("------------------------------------------------------") canonName = nt.getCanonicalMangaUpdatesName(dirName) print("Duplicate Directory '%s' - Canon = '%s'" % (dirName, canonName)) count += 1 mtId = idLut[nt.prepFilenameForMatching(dirName)] for num in mtId: print( " URL: https://www.mangaupdates.com/series.html?id=%s" % (num, )) fPath, dir2Name = os.path.split(lookup["fqPath"]) if not os.path.exists(item): print("'%s' has been removed. Skipping" % item) continue if not os.path.exists(lookup["fqPath"]): print("'%s' has been removed. Skipping" % lookup["fqPath"]) continue n1 = lv.distance(dirName, canonName) n2 = lv.distance(dir2Name, canonName) r1 = abs(nt.extractRatingToFloat(dirName)) r2 = abs(nt.extractRatingToFloat(dir2Name)) if "[complete]" in dirName.lower(): r1 += 0.1 if "[complete]" in dir2Name.lower(): r2 += 0.1 if "[wtf]" in dirName.lower(): r1 += 0.2 if "[wtf]" in dir2Name.lower(): r2 += 0.2 print(" 1: ", item) print(" 2: ", lookup["fqPath"]) print(" 1: ", dirName, ' ->', nt.getCanonicalMangaUpdatesName(dirName)) print(" 2: ", dir2Name, ' ->', nt.getCanonicalMangaUpdatesName(dir2Name)) print( " 1: ({num} items)(distance {dist})(rating {rat})".format( num=len(os.listdir(item)), dist=n1, rat=r1)) print( " 2: ({num} items)(distance {dist})(rating {rat})".format( num=len(os.listdir(lookup["fqPath"])), dist=n2, rat=r2)) mtId2 = idLut[nt.prepFilenameForMatching(dir2Name)] if mtId != mtId2: print("DISCORDANT ID NUMBERS - {num1}, {num2}!".format( num1=mtId, num2=mtId2)) for num in mtId2: print( " URL: https://www.mangaupdates.com/series.html?id=%s" % (num, )) continue if r1 > r2: doMove = "reverse" elif r2 > r1: doMove = "forward" else: doMove = '' if not doMove or not smartMode: doMove = query_response( "move files ('f' dir 1 -> dir 2. 'r' dir 1 <- dir 2. 'l' use levenshtein distance. 'n' do not move)?" ) if doMove == "forward": print("Forward move") fromDir = item toDir = lookup["fqPath"] elif doMove == "reverse": print("Reverse move") fromDir = lookup["fqPath"] toDir = item elif doMove == "levenshtein": print("Levenshtein distance chooser") # I'm using less then or equal, so situations where # both names are equadistant get aggregated anyways. if n1 <= n2: fromDir = lookup["fqPath"] toDir = item else: fromDir = item toDir = lookup["fqPath"] else: print("Skipping") continue print("moving from: '%s' " % fromDir) print(" to: '%s' " % toDir) items = os.listdir(fromDir) for item in items: fromPath = os.path.join(fromDir, item) toPath = os.path.join(toDir, item) loop = 2 while os.path.exists(toPath): pathBase, ext = os.path.splitext(toPath) print(" Duplicate file!") toPath = "{start} ({loop}){ext}".format(start=pathBase, loop=loop, ext=ext) print(" Moving: ", item) print(" From: ", fromPath) print(" To: ", toPath) pc.moveFile(fromPath, toPath) try: pc.moveFile(fromPath, toPath) except psycopg2.IntegrityError: print("Error moving item in dedup database") # pc.deletePath(toPath) shutil.move(fromPath, toPath) print("Deleting directory") os.rmdir(fromDir) print("total items", count)
def processLinksIntoDB(self, linksDicts, isPicked=False):
    self.log.info("Inserting...")
    newItems = 0
    oldItems = 0
    for link in linksDicts:
        if link is None:
            print("linksDicts", linksDicts)
            print("WAT")
            continue

        # We only look at filenames to determine uniqueness.
        rows = self.getRowsByValue(originName=link["dlName"])
        if not rows:
            # Check against URLs as well, so we don't break the UNIQUE constraint.
            rows = self.getRowsByValue(sourceUrl=link["dlLink"])

        if not rows:
            newItems += 1

            # Patch series name.
            seriesName = nt.getCanonicalMangaUpdatesName(link["baseName"])

            # Flags has to be an empty string, because the DB is annoying.
            # TL;DR, comparing with LIKE in a column that has NULLs in it is somewhat broken.
            self.insertIntoDb(retreivalTime=link["date"],
                              sourceUrl=link["dlLink"],
                              originName=link["dlName"],
                              dlState=0,
                              seriesName=seriesName,
                              flags='',
                              commit=False)  # Defer committing changes to speed things up

            self.log.info("New item: %s", (link["date"], link["dlLink"], link["baseName"], link["dlName"]))

        elif len(rows) > 1:
            self.log.warning("Have more than one item for filename! Wat?")
            self.log.warning("Info dict for file:")
            self.log.warning("'%s'", link)
            self.log.warning("Found rows:")
            self.log.warning("'%s'", rows)

        else:
            row = rows.pop()
            if row["sourceUrl"] != link["dlLink"]:
                self.log.info("File has been moved!")
                self.log.info("File: '%s'", link)
                self.updateDbEntryById(row["dbId"], sourceUrl=link["dlLink"])
            else:
                oldItems += 1
                # self.log.info("Existing item: %s", (link["date"], link["dlName"]))

    self.log.info("Done")
    if newItems:
        self.log.info("Committing...")
        self.conn.commit()
        self.log.info("Committed")
    else:
        self.log.info("No new items, %s old items.", oldItems)

    return newItems
def consolidateSeriesNaming(self):
    cur = self.conn.cursor()

    # cur.execute("BEGIN;")
    # print("Querying")
    # cur.execute("SELECT DISTINCT(seriesName) FROM {tableName};".format(tableName=self.tableName))
    # print("Queried. Fetching results")
    # ret = cur.fetchall()
    # cur.execute("COMMIT;")
    # print("Have results. Processing")
    # for item in ret:
    #     item = item[0]
    #     if not item:
    #         continue
    #     mId = nt.getMangaUpdatesId(item)
    #     if not mId:
    #         print("Item '{old}', '{new}', mid:{mid}".format(old=item, new=nt.getCanonicalMangaUpdatesName(item), mid=mId))
    # print("Total: ", len(ret))

    # The second and third entries deliberately exercise the plain-ASCII and
    # mis-encoded variants of the name.
    items = ["Murciélago", "Murcielago", "MurciÃ©lago"]
    for item in items:
        print("------", item, nt.getCanonicalMangaUpdatesName(item), nt.haveCanonicalMangaUpdatesName(item))

    # cur.execute("BEGIN;")
    # print("Querying")
    # cur.execute("SELECT DISTINCT ON (buname) buname, buId FROM mangaseries ORDER BY buname, buid;")
    # print("Queried. Fetching results")
    # ret = cur.fetchall()
    # cur.execute("COMMIT;")
    # print("Have results. Processing")
    # cur.execute("BEGIN;")
    # missing = 0
    # for item in ret:
    #     buName, buId = item
    #     if not buName:
    #         continue
    #     cur.execute("SELECT * FROM munamelist WHERE name=%s;", (buName, ))
    #     ret = cur.fetchall()
    #     # mId = nt.getMangaUpdatesId(buName)
    #     if not ret:
    #         print("Item missing '{item}', mid:{mid}".format(item=item, mid=ret))
    #         self.insertNames(buId, [buName])
    #         missing += 1
    #     if not runStatus.run:
    #         break
    #     # print("Item '{old}', '{new}', mid:{mid}".format(old=item, new=nt.getCanonicalMangaUpdatesName(item), mid=mId))
    # print("Total: ", len(ret))
    # print("Missing: ", missing)

    # for dbId, sourceUrl in ret:
    #     if "batoto" in sourceUrl.lower():
    #         sourceUrl = sourceUrl.replace("http://www.batoto.net/", "http://bato.to/")
    #         print("Link", sourceUrl)
    #         cur.execute("SELECT dbId FROM {tableName} WHERE sourceUrl=%s;".format(tableName=self.tableName), (sourceUrl, ))
    #         ret = cur.fetchall()
    #         if not ret:
    #             print("Updating")
    #             cur.execute("UPDATE {tableName} SET sourceUrl=%s WHERE dbId=%s;".format(tableName=self.tableName), (sourceUrl, dbId))
    #         else:
    #             print("Replacing")
    #             cur.execute("DELETE FROM {tableName} WHERE sourceUrl=%s;".format(tableName=self.tableName), (sourceUrl, ))
    #             cur.execute("UPDATE {tableName} SET sourceUrl=%s WHERE dbId=%s;".format(tableName=self.tableName), (sourceUrl, dbId))

    cur.execute("COMMIT;")
def parseTwoArgCall(cmd, val):
    if cmd == "import":
        if not os.path.exists(val):
            print("Passed path '%s' does not exist!" % val)
            return
        autoImporter.importDirectories(val)

    elif cmd == "organize":
        if not os.path.exists(val):
            print("Passed path '%s' does not exist!" % val)
            return
        autOrg.organizeFolder(val)

    elif cmd == "run":
        utilities.runPlugin.runPlugin(val)

    elif cmd == "rename":
        if not os.path.exists(val):
            print("Passed path '%s' does not exist!" % val)
            return
        autOrg.renameSeriesToMatchMangaUpdates(val)

    elif cmd == "lookup":
        print("Passed name = '%s'" % val)
        import nameTools as nt
        haveLookup = nt.haveCanonicalMangaUpdatesName(val)
        if not haveLookup:
            print("Item not found in MangaUpdates name synonym table")
            print("Processed item as searched = '%s'" % nt.prepFilenameForMatching(val))
        else:
            print("Item found in lookup table!")
            print("Canonical name = '%s'" % nt.getCanonicalMangaUpdatesName(val))

    elif cmd == "purge-dir":
        if not os.path.exists(val):
            print("Passed path '%s' does not exist!" % val)
            return
        utilities.dedupDir.purgeDedupTemps(val)

    elif cmd == "purge-dir-phash":
        if not os.path.exists(val):
            print("Passed path '%s' does not exist!" % val)
            return
        utilities.dedupDir.purgeDedupTempsPhash(val)

    elif cmd == "dirs-restore":
        if not os.path.exists(val):
            print("Passed path '%s' does not exist!" % val)
            return
        utilities.dedupDir.runRestoreDeduper(val)

    elif cmd == "sort-dir-contents":
        if not os.path.exists(val):
            print("Passed path '%s' does not exist!" % val)
            return
        utilities.approxFileSorter.scanDirectories(val)

    elif cmd == "clean-archives":
        if not os.path.exists(val):
            print("Passed path '%s' does not exist!" % val)
            return
        utilities.cleanFiles.cleanArchives(val)

    else:
        print("Did not understand command!")
        print("Sys.argv = ", sys.argv)