def start_download(self) -> None:

    if not self.gallery or not self.gallery.link:
        return

    logger.info("Downloading an archive from a generic HTTP server: {}".format(self.gallery.link))

    request_dict = construct_request_dict(self.settings, self.own_settings)

    request_file = requests.get(
        self.gallery.link,
        stream=True,
        **request_dict
    )

    filename = get_filename_from_cd(request_file.headers.get('content-disposition'))

    if not filename:
        if '/' in self.gallery.link:
            filename = self.gallery.link.rsplit('/', 1)[1]

    if not filename:
        logger.error("Could not find a filename for link: {}".format(self.gallery.link))
        self.return_code = 0
        return

    self.gallery.title = filename.replace(".zip", "")
    self.gallery.filename = replace_illegal_name(
        available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder, filename)
        )
    )

    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    with open(filepath, 'wb') as fo:
        for chunk in request_file.iter_content(4096):
            fo.write(chunk)

    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        logger.error("Could not download archive")
        self.return_code = 0
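# A minimal sketch of what a Content-Disposition parser like the
# get_filename_from_cd() helper used above could look like. This is an
# assumption for illustration; the real helper in this codebase may handle
# more cases (e.g. RFC 5987 encoded filenames).
import re

def _example_get_filename_from_cd(content_disposition):
    # Typical header value: 'attachment; filename="archive.zip"'
    if not content_disposition:
        return None
    found = re.findall(r'filename=([^;]+)', content_disposition)
    if not found:
        return None
    return found[0].strip().strip('"')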
def start_download(self) -> None:

    if not self.gallery or not self.gallery.temp_archive:
        return

    logger.info(
        "Downloading an archive: {} from a Panda Backup-like source: {}".format(
            self.gallery.title, self.gallery.temp_archive['link']
        )
    )

    to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)
    to_use_filename = replace_illegal_name(to_use_filename)

    # TODO: File could be cbz.
    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder, to_use_filename + '.zip')
    )

    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['stream'] = True

    request_file = request_with_retries(
        self.gallery.temp_archive['link'],
        request_dict,
    )

    if not request_file:
        logger.error("Could not download archive")
        self.return_code = 0
        return

    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    with open(filepath, 'wb') as fo:
        for chunk in request_file.iter_content(4096):
            fo.write(chunk)

    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        logger.error("Could not download archive")
        self.return_code = 0
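# A hedged sketch of a retrying GET helper in the spirit of the
# request_with_retries() call above. The helper name and the backoff policy
# here are assumptions; the real helper in this codebase may differ.
import time
import requests

def _example_request_with_retries(url, request_dict, retries=3):
    for attempt in range(retries):
        try:
            response = requests.get(url, **request_dict)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass
        time.sleep(2 ** attempt)  # simple exponential backoff between attempts
    return None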
def start_download(self) -> None:

    if not self.gallery or not self.gallery.link or not self.gallery.archiver_key:
        return

    to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)
    to_use_filename = replace_illegal_name(to_use_filename)

    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder, to_use_filename + '.zip')
    )

    request_dict = construct_request_dict(self.settings, self.own_settings)

    request_file = requests.get(
        self.gallery.archiver_key,
        stream=True,
        **request_dict
    )

    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    with open(filepath, 'wb') as fo:
        for chunk in request_file.iter_content(4096):
            fo.write(chunk)

    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        logger.error("Could not download archive")
        os.remove(filepath)
        self.return_code = 0
def start_download(self) -> None:

    if not self.gallery:
        return

    self.logger.info(
        "Downloading an archive: {} from a Panda Backup-like source: {}".format(
            self.gallery.title, self.gallery.archiver_key['link']
        )
    )

    self.gallery.title = replace_illegal_name(self.gallery.title)
    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder, self.gallery.title + '.zip')
    )

    request_file = requests.get(
        self.gallery.archiver_key['link'],
        stream=True,
        headers=self.settings.requests_headers,
        timeout=self.settings.timeout_timer,
        cookies=self.own_settings.cookies
    )

    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    with open(filepath, 'wb') as fo:
        for chunk in request_file.iter_content(4096):
            fo.write(chunk)

    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        self.logger.error("Could not download archive")
        self.return_code = 0
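# A minimal sketch of a streaming CRC32 helper like the calc_crc32() used by
# all of these downloaders; reading in chunks keeps memory flat for large
# archives. Illustrative only, the real implementation may format the digest
# differently.
import zlib

def _example_calc_crc32(filepath):
    crc = 0
    with open(filepath, 'rb') as fp:
        for chunk in iter(lambda: fp.read(65536), b''):
            crc = zlib.crc32(chunk, crc)
    return '{:08X}'.format(crc & 0xFFFFFFFF)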
def start_download(self) -> None:

    if not self.gallery:
        return

    to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)
    to_use_filename = replace_illegal_name(to_use_filename)

    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder, to_use_filename + '.zip')
    )

    if not (self.gallery.root and self.gallery.gid and self.gallery.token and self.gallery.archiver_key):
        logger.error(
            'Missing required data -> root: {}, gid: {}, token: {}, archiver_key: {}.'.format(
                self.gallery.root,
                self.gallery.gid,
                self.gallery.token,
                self.gallery.archiver_key,
            )
        )
        self.return_code = 0
        return

    r = self.request_archive_download(
        self.gallery.root,
        self.gallery.gid,
        self.gallery.token,
        self.gallery.archiver_key
    )

    if not r:
        logger.error('Could not get download link.')
        self.return_code = 0
        return

    r.encoding = 'utf-8'

    if 'Invalid archiver key' in r.text:
        logger.error("Invalid archiver key received.")
        self.return_code = 0
    else:

        archive_link = get_archive_link_from_html_page(r.text)

        if archive_link == '':
            logger.error('Could not find archive link, page text: {}'.format(r.text))
            self.return_code = 0
        else:
            # Strip the query string, if any, from the archive link.
            m = re.match(r"(.*?)(\?.*?)", archive_link)
            if m:
                archive_link = m.group(1)

            logger.info('Got link: {}, from url: {}'.format(archive_link, r.url))

            request_dict = construct_request_dict(self.settings, self.own_settings)

            request_file = requests.get(
                archive_link + '?start=1',
                stream=True,
                **request_dict
            )

            # Check the file response here, not the page response stored in r.
            if request_file and request_file.status_code == 200:
                logger.info('Downloading gallery: {}.zip'.format(to_use_filename))
                filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
                with open(filepath, 'wb') as fo:
                    for chunk in request_file.iter_content(4096):
                        fo.write(chunk)

                self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
                if self.gallery.filesize > 0:
                    self.crc32 = calc_crc32(filepath)
                    self.fileDownloaded = 1
                    self.return_code = 1
            else:
                logger.error("Could not download archive")
                self.return_code = 0
def start_download(self) -> None:

    if not self.gallery or not self.gallery.link:
        return

    if self.settings.gallery_dl.executable_path:
        exe_path_to_use = shutil.which(self.settings.gallery_dl.executable_path)
    else:
        exe_path_to_use = shutil.which(self.settings.gallery_dl.executable_name)

    if not exe_path_to_use:
        self.return_code = 0
        logger.error("The gallery-dl executable was not found")
        return

    directory_path = mkdtemp()

    arguments = ["--zip", "--dest", "{}".format(directory_path)]

    if self.own_settings.proxy:
        arguments.append("--proxy")
        arguments.append("{}".format(self.own_settings.proxy))

    if self.settings.gallery_dl.config_file:
        arguments.append("--config")
        arguments.append("{}".format(self.settings.gallery_dl.config_file))

    if self.settings.gallery_dl.extra_arguments:
        arguments.append("{}".format(self.settings.gallery_dl.extra_arguments))

    arguments.append("{}".format(self.gallery.link))

    logger.info("Calling gallery-dl: {}.".format(" ".join([exe_path_to_use, *arguments])))

    process_result = subprocess.run(
        [exe_path_to_use, *arguments],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True
    )

    if process_result.stderr:
        self.return_code = 0
        logger.error("An error was captured when running gallery-dl: {}".format(process_result.stderr))
        return

    if process_result.returncode != 0:
        self.return_code = 0
        logger.error("Return code was not 0: {}".format(process_result.returncode))
        return

    # If we downloaded more than one file, get the latest one
    output_path = ''
    file_name = ''
    for (dir_path, dir_names, filenames) in os.walk(directory_path):
        for current_file in filenames:
            file_name = current_file
            output_path = os.path.join(dir_path, current_file)

    if not output_path:
        self.return_code = 0
        logger.error("The resulting download file was not found")
        return

    if not output_path or not os.path.isfile(output_path):
        self.return_code = 0
        logger.error("The resulting download file was not found: {}".format(file_name))
        return

    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder, replace_illegal_name(file_name))
    )

    self.gallery.title = os.path.splitext(file_name)[0]

    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)

    shutil.move(output_path, filepath)
    shutil.rmtree(directory_path, ignore_errors=True)

    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        logger.error("Could not download archive")
        self.return_code = 0
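# For reference, a typical command line built by the code above would look
# like this (temp directory, proxy, config path and URL are illustrative
# values only):
#   gallery-dl --zip --dest /tmp/tmpa1b2c3 --proxy http://localhost:8118 \
#       --config /etc/gallery-dl.conf https://example.com/gallery/12345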
def process_downloaded_archive(self, archive: Archive) -> None:
    if os.path.isfile(archive.zipped.path):
        except_at_open = False
        return_error = None
        try:
            my_zip = ZipFile(archive.zipped.path, 'r')
            return_error = my_zip.testzip()
            my_zip.close()
        except (BadZipFile, NotImplementedError):
            except_at_open = True
        if except_at_open or return_error:
            if 'panda' in archive.source_type:
                self.logger.error(
                    "For archive: {}, file check on downloaded zipfile failed on file: {}, "
                    "forcing download as panda_archive to fix it.".format(archive, archive.zipped.path)
                )
                crc32 = calc_crc32(archive.zipped.path)
                Archive.objects.add_or_update_from_values({'crc32': crc32}, pk=archive.pk)
                if self.web_queue and archive.gallery:
                    temp_settings = Settings(load_from_config=self.settings.config)
                    temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                    self.web_queue.enqueue_args_list((archive.gallery.get_link(),), override_options=temp_settings)
                return
            else:
                self.logger.warning(
                    "For archive: {}, File check on downloaded zipfile: {}. "
                    "Check the file manually.".format(archive, archive.zipped.path)
                )
        crc32 = calc_crc32(archive.zipped.path)
        filesize = get_zip_filesize(archive.zipped.path)
        filecount = filecount_in_zip(archive.zipped.path)
        values = {
            'crc32': crc32,
            'filesize': filesize,
            'filecount': filecount,
        }
        updated_archive = Archive.objects.add_or_update_from_values(values, pk=archive.pk)
        if archive.gallery and updated_archive.filesize != updated_archive.gallery.filesize:
            if Archive.objects.filter(gallery=updated_archive.gallery, filesize=updated_archive.gallery.filesize):
                self.logger.info(
                    "For archive: {} size does not match gallery, "
                    "but there's already another archive that matches.".format(updated_archive)
                )
                return
            if 'panda' in archive.source_type:
                self.logger.info(
                    "For archive: {} size does not match gallery, "
                    "downloading again from panda_archive.".format(updated_archive)
                )
                if self.web_queue:
                    temp_settings = Settings(load_from_config=self.settings.config)
                    temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                    self.web_queue.enqueue_args_list(
                        (updated_archive.gallery.get_link(),),
                        override_options=temp_settings
                    )
            else:
                self.logger.warning(
                    "For archive: {} size does not match gallery. Check the file manually.".format(archive)
                )
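# The ZipFile.testzip() pattern above also appears in the folder crawler
# further down; a hedged sketch of how it could be factored into a shared
# helper (zip_is_corrupt is a hypothetical name, not part of this codebase):
from zipfile import BadZipFile, ZipFile

def zip_is_corrupt(zip_path):
    # True if the zip cannot be opened or testzip() reports a bad member.
    # NotImplementedError covers members using unsupported compression.
    try:
        with ZipFile(zip_path, 'r') as my_zip:
            return my_zip.testzip() is not None
    except (BadZipFile, NotImplementedError):
        return True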
def start_download(self) -> None:

    if not self.gallery or not self.gallery.link:
        return

    if self.own_settings.megadl_executable_path:
        exe_path_to_use = shutil.which(self.own_settings.megadl_executable_path)
    else:
        exe_path_to_use = shutil.which(self.own_settings.megadl_executable_name)

    if not exe_path_to_use:
        self.return_code = 0
        self.logger.error("The megadl tool was not found")
        return

    directory_path = mkdtemp()

    arguments = [
        "--no-progress", "--print-names", "--path", "{}".format(directory_path)
    ]

    if self.own_settings.proxy:
        arguments.append("--proxy")
        arguments.append("{}".format(self.own_settings.proxy))

    if self.own_settings.extra_megadl_arguments:
        arguments.append("{}".format(self.own_settings.extra_megadl_arguments))

    arguments.append("{}".format(self.gallery.link))

    self.logger.info("Calling megadl: {}.".format(" ".join([exe_path_to_use, *arguments])))

    process_result = subprocess.run(
        [exe_path_to_use, *arguments],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True
    )

    message_text = process_result.stdout

    if not message_text:
        self.return_code = 0
        self.logger.error("The link could not be downloaded, no output was generated after running megadl")
        return

    if process_result.stderr:
        self.return_code = 0
        self.logger.error("An error was captured when running megadl: {}".format(process_result.stderr))
        return

    if "WARNING: Skipping invalid" in message_text:
        self.return_code = 0
        self.logger.error("The link could not be downloaded: {}".format(message_text))
        return

    # If we downloaded a folder, just take the first result
    file_names = message_text.splitlines()
    file_name = file_names[0]
    output_path = os.path.join(directory_path, file_name)

    if not os.path.isfile(output_path):
        self.return_code = 0
        self.logger.error("The resulting download file was not found: {}".format(file_name))
        return

    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder, replace_illegal_name(file_name))
    )

    self.gallery.title = os.path.splitext(file_name)[0]

    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)

    shutil.move(output_path, filepath)
    shutil.rmtree(directory_path, ignore_errors=True)

    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        self.logger.error("Could not download archive")
        self.return_code = 0
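# For reference, a typical command line built by the code above would look
# like this (temp directory and link are illustrative values only):
#   megadl --no-progress --print-names --path /tmp/tmpXYZ https://mega.nz/...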
def start_download(self) -> None:

    if not self.gallery or not self.gallery.link:
        return

    to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)
    to_use_filename = replace_illegal_name(to_use_filename)

    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder, to_use_filename + '.zip')
    )

    if self.gallery.content:
        soup_1 = BeautifulSoup(self.gallery.content, 'html.parser')
    else:
        request_dict = construct_request_dict(self.settings, self.own_settings)
        gallery_page = requests.get(self.gallery.link, **request_dict)
        soup_1 = BeautifulSoup(gallery_page.content, 'html.parser')

    gallery_read = soup_1.find("a", {"class": "x-btn-rounded"})['href']

    # Some URLs are really badly formatted
    gallery_read = re.sub(
        r'.*(' + re.escape(constants.main_page) + r'/manga/read/.+/0/1/).*',
        r'\1',
        gallery_read,
        flags=re.DOTALL
    )

    if not gallery_read or gallery_read in constants.bad_urls or not gallery_read.startswith(constants.main_page):
        logger.warning("Reading gallery page not available, trying to guess the name.")
        gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery)

    if not gallery_read.endswith('page/1'):
        gallery_read += 'page/1'

    page_regex = re.compile(r"(.*?page/)(\d+)/*$", re.IGNORECASE)

    last_image = ''

    directory_path = mkdtemp()

    logger.info('Downloading gallery: {}'.format(self.gallery.title))

    second_pass = False
    while True:
        try:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_read_page = requests.get(gallery_read, **request_dict)
        except requests.exceptions.MissingSchema:
            logger.error("Malformed URL: {}, skipping".format(gallery_read))
            self.return_code = 0
            shutil.rmtree(directory_path, ignore_errors=True)
            return

        if gallery_read_page.status_code == 404:
            if gallery_read.endswith('page/1'):
                if not second_pass:
                    gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery, False)
                    second_pass = True
                    continue
                logger.error("Last page was the first one: {}, stopping".format(gallery_read))
                self.return_code = 0
                shutil.rmtree(directory_path, ignore_errors=True)
                return
            # yield("Got to last gallery page, stopping")
            break

        soup_2 = BeautifulSoup(gallery_read_page.content, 'html.parser')
        img_find = soup_2.find("img", {"class": "open"})

        if not img_find:
            logger.error("Gallery not available, skipping")
            self.return_code = 0
            shutil.rmtree(directory_path, ignore_errors=True)
            return

        img = img_find['src']

        if last_image != '' and last_image == img:
            # yield('Current image is the same as previous, skipping')
            break
        last_image = img
        img_name = os.path.basename(img)

        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_file = requests.get(img, **request_dict)
        if request_file.status_code == 404:
            # yield("Got to last image, stopping")
            break
        with open(os.path.join(directory_path, img_name), "wb") as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)

        page_match = page_regex.search(gallery_read)
        if page_match:
            gallery_read = page_match.group(1) + str(int(page_match.group(2)) + 1)
        else:
            # yield("Could not match to change page, stopping")
            break

    file_path = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)

    with ZipFile(file_path, 'w') as archive:
        for (root_path, _, file_names) in os.walk(directory_path):
            for current_file in file_names:
                archive.write(
                    os.path.join(root_path, current_file),
                    arcname=os.path.basename(current_file)
                )

    shutil.rmtree(directory_path, ignore_errors=True)

    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(file_path)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(file_path)
        self.fileDownloaded = 1
        self.return_code = 1
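# A small demonstration of how the page_regex above advances the reader URL.
# The URL here is an illustrative example, not a real endpoint:
import re

_demo_page_regex = re.compile(r"(.*?page/)(\d+)/*$", re.IGNORECASE)
_demo_match = _demo_page_regex.search('https://site.example/manga/read/title/0/1/page/3')
assert _demo_match is not None
assert _demo_match.group(1) + str(int(_demo_match.group(2)) + 1) == 'https://site.example/manga/read/title/0/1/page/4'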
def start_download(self) -> None:

    if not self.gallery or not self.gallery.link:
        return

    to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)
    to_use_filename = replace_illegal_name(to_use_filename)

    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder, to_use_filename + '.zip')
    )

    if self.gallery.content:
        soup_1 = BeautifulSoup(self.gallery.content, 'html.parser')
    else:
        request_dict = construct_request_dict(self.settings, self.own_settings)
        gallery_page = requests.get(self.gallery.link, **request_dict)
        soup_1 = BeautifulSoup(gallery_page.content, 'html.parser')

    gallery_read = soup_1.find("a", {"class": "x-btn-rounded"})['href']

    # Some URLs are really badly formatted
    gallery_read = re.sub(
        r'.*(' + re.escape(constants.main_page) + r'/manga/read/.+/0/1/).*',
        r'\1',
        gallery_read,
        flags=re.DOTALL
    )

    if not gallery_read or gallery_read in constants.bad_urls or not gallery_read.startswith(constants.main_page):
        logger.warning("Reading gallery page not available, trying to guess the name.")
        gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery)

    if not gallery_read.endswith('page/1'):
        gallery_read += 'page/1'

    logger.info('Downloading gallery: {}'.format(self.gallery.title))

    try:
        request_dict = construct_request_dict(self.settings, self.own_settings)
        gallery_read_page = requests.get(gallery_read, **request_dict)
    except requests.exceptions.MissingSchema:
        logger.error("Malformed URL: {}, skipping".format(gallery_read))
        self.return_code = 0
        return

    if gallery_read_page.status_code != 200:
        gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery, False)
        try:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_read_page = requests.get(gallery_read, **request_dict)
        except requests.exceptions.MissingSchema:
            logger.error("Malformed URL: {}, skipping".format(gallery_read))
            self.return_code = 0
            return

    if gallery_read_page.status_code == 200:
        image_urls = self.get_img_urls_from_gallery_read_page(gallery_read_page.text)

        if not image_urls:
            logger.error("Could not find image links, archive not downloaded")
            self.return_code = 0
            return

        directory_path = mkdtemp()

        for image_url in image_urls:
            img_name = os.path.basename(image_url)
            request_dict = construct_request_dict(self.settings, self.own_settings)
            request_file = requests.get(image_url, **request_dict)
            if request_file.status_code == 404:
                logger.warning("Image link reported 404 error, stopping")
                break
            with open(os.path.join(directory_path, img_name), "wb") as fo:
                for chunk in request_file.iter_content(4096):
                    fo.write(chunk)

        file_path = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)

        with ZipFile(file_path, 'w') as archive:
            for (root_path, _, file_names) in os.walk(directory_path):
                for current_file in file_names:
                    archive.write(
                        os.path.join(root_path, current_file),
                        arcname=os.path.basename(current_file)
                    )

        shutil.rmtree(directory_path, ignore_errors=True)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(file_path)
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(file_path)
            self.fileDownloaded = 1
            self.return_code = 1
    else:
        logger.error("Wrong HTML code returned, could not download, link: {}".format(gallery_read))
        self.return_code = 0
def start_crawling(self, arg_line: List[str]) -> None:

    args = self.get_args(arg_line)

    if isinstance(args, ArgumentParserError):
        self.logger.info(str(args))
        return

    files = []
    do_not_replace = False
    values: DataDict = {}

    if args.remove_missing_files:
        found_archives = Archive.objects.all()

        if found_archives:
            self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
            for archive in found_archives:
                if not os.path.isfile(archive.zipped.path):
                    Archive.objects.delete_by_filter(pk=archive.pk)
        return
    elif args.display_missing_files:
        found_archives = Archive.objects.all()

        if found_archives:
            self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
            for archive in found_archives:
                if not os.path.isfile(archive.zipped.path):
                    self.logger.info("Filename: {} doesn't exist".format(archive.zipped.path))
        return
    elif args.rematch_non_matches:
        self.settings.rematch_file_list = ['non-match']
        self.settings.rematch_file = True
        found_archives = Archive.objects.filter(match_type='non-match')
        if found_archives:
            self.logger.info("Scanning {} archives with non-matches".format(found_archives.count()))
            for archive in found_archives:
                if os.path.isfile(archive.zipped.path):
                    files.append(archive.zipped.path)
    elif args.rematch_by_match_type:
        self.settings.rematch_file_list = [args.rematch_by_match_type]
        self.settings.rematch_file = True
        self.settings.replace_metadata = True
        found_archives = Archive.objects.filter(match_type=args.rematch_by_match_type)
        if found_archives:
            self.logger.info("Scanning {} archives matched by {}".format(
                found_archives.count(), args.rematch_by_match_type
            ))
            for archive in found_archives:
                if os.path.isfile(archive.zipped.path):
                    files.append(archive.zipped.path)
    elif args.rematch_wrong_filesize:
        self.settings.rematch_file = True
        self.settings.replace_metadata = True
        do_not_replace = True
        found_archives = Archive.objects.exclude(match_type='non-match', gallery_id__isnull=True)
        if found_archives:
            for archive in found_archives:
                if not os.path.isfile(archive.zipped.path):
                    continue
                if archive.filesize == archive.gallery.filesize:
                    continue
                files.append(archive.zipped.path)
            self.logger.info("Scanning {} archives matched with wrong filesize".format(len(files)))
    elif args.recalc_missing_crc32:
        found_archives = Archive.objects.filter(crc32='')

        if found_archives:
            self.logger.info("Calculating {} archives with missing CRC32".format(found_archives.count()))
            for cnt, archive in enumerate(found_archives):
                if os.path.isfile(archive.zipped.path):
                    crc32 = calc_crc32(archive.zipped.path)
                    self.logger.info("Working on archive {} of {}, CRC32: {}".format((cnt + 1), found_archives.count(), crc32))
                    values = {'crc32': crc32}
                    Archive.objects.add_or_update_from_values(values, pk=archive.pk)
                else:
                    self.logger.info("Archive {} of {}, path: {} does not exist".format(
                        (cnt + 1), found_archives.count(), archive.zipped.path
                    ))
        return
    elif args.all_filenames_to_title:
        archives_title_gid = Archive.objects.exclude(title='')

        if archives_title_gid:
            self.logger.info("Checking {} galleries".format(archives_title_gid.count()))
            for cnt, archive in enumerate(archives_title_gid):
                current_path = os.path.join(
                    os.path.dirname(archive.zipped.path),
                    replace_illegal_name(archive.title) + '.zip'
                )

                if archive.zipped.path != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                    self.logger.info("Filename should be {} but it's {}".format(current_path, archive.zipped.path))
                    if args.filename_to_title == 'rename':
                        os.rename(
                            archive.zipped.path,
                            os.path.join(self.settings.MEDIA_ROOT, current_path)
                        )
                        values = {'zipped': current_path}
                        Archive.objects.add_or_update_from_values(values, pk=archive.pk)
        return
    elif args.rematch_from_internal_gallery_titles:
        non_matched_archives = Archive.objects.filter(match_type='non-match')

        if non_matched_archives:
            archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

            self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
            for archive in non_matched_archives:
                adjusted_title = replace_illegal_name(os.path.basename(archive.zipped.path)).replace(".zip", "")

                galleries_id_token = get_closer_gallery_title_from_list(
                    adjusted_title, galleries_title_gid, args.rematch_from_internal_gallery_titles)
                if galleries_id_token is not None:
                    self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))

                    values = {
                        'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                        'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                        'zipped': archive.zipped.path,
                        'crc32': archive.crc32,
                        'match_type': 'gallery_database',
                        'filesize': archive.filesize,
                        'filecount': archive.filecount,
                        'gallery_id': galleries_id_token[1]
                    }
                    Archive.objects.add_or_update_from_values(values, pk=archive.pk)

                    Gallery.objects.update_by_dl_type(
                        {"dl_type": "folder:filename"}, galleries_id_token[1], "failed")
                else:
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, archives_title_gid, args.rematch_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))

                        values = {
                            'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                            'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                            'zipped': archive.zipped.path,
                            'crc32': archive.crc32,
                            'match_type': archive.match_type,
                            'filesize': archive.filesize,
                            'filecount': archive.filecount,
                            'gallery_id': galleries_id_token[1]
                        }
                        Archive.objects.add_or_update_from_values(values, pk=archive.pk)
        return
    elif args.display_match_from_internal_gallery_titles:
        non_matched_archives = Archive.objects.filter(match_type='non-match')

        if non_matched_archives:
            archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

            self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))
            for archive in non_matched_archives:
                adjusted_title = replace_illegal_name(os.path.basename(archive.zipped.path)).replace(".zip", "")

                galleries_id_token = get_closer_gallery_title_from_list(
                    adjusted_title, galleries_title_gid, args.display_match_from_internal_gallery_titles)
                if galleries_id_token is not None:
                    self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                else:
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, archives_title_gid, args.display_match_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))
        return
    else:
        for folder in args.folder:
            p = os.path.normpath(os.path.join(self.settings.MEDIA_ROOT, folder))
            if not p.startswith(self.settings.MEDIA_ROOT):
                continue
            folder = os.path.relpath(p, self.settings.MEDIA_ROOT).replace("\\", "/")

            if os.path.isdir(os.path.join(self.settings.MEDIA_ROOT, folder)):
                for root, _, filenames in os.walk(os.path.join(self.settings.MEDIA_ROOT, str(folder))):
                    for filename in fnmatch.filter(filenames, self.settings.filename_filter):
                        files.append(
                            os.path.relpath(os.path.join(root, filename), self.settings.MEDIA_ROOT))
            elif os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, folder)):
                files.append(folder)

    if args.rename_to_title:
        self.logger.info("Checking {} galleries".format(len(files)))
        for cnt, filepath in enumerate(files):
            archive = Archive.objects.filter(zipped=filepath).first()

            if archive:
                current_path = os.path.join(
                    os.path.dirname(filepath), replace_illegal_name(archive.title) + '.zip')

                if filepath != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                    self.logger.info("Filename should be {} but it's {}".format(current_path, filepath))
                    if args.rename_to_title == 'rename':
                        os.rename(
                            os.path.join(self.settings.MEDIA_ROOT, filepath),
                            os.path.join(self.settings.MEDIA_ROOT, current_path)
                        )
                        values = {'zipped': current_path}
                        Archive.objects.add_or_update_from_values(values, zipped=filepath)
        return

    if args.set_reason:
        self.settings.archive_reason = args.set_reason

    if args.set_source:
        self.settings.archive_source = args.set_source

    # The creation of the files list ends here. From here onwards, it's processing them.
    if len(files) == 0:
        self.logger.info("No file matching needed, skipping matchers")
    else:
        self.logger.info("Starting checks for {} archives".format(len(files)))

        matchers_list = self.settings.provider_context.get_matchers(self.settings, logger=self.logger)
        for matcher in matchers_list:
            self.logger.info("Using matcher {} with a priority of {}".format(matcher[0].name, matcher[1]))

        for cnt, filepath in enumerate(files):

            self.logger.info("Checking file: {} of {}, path: {}".format((cnt + 1), len(files), filepath))

            title = re.sub('[_]', ' ', os.path.splitext(os.path.basename(filepath))[0])

            archive = Archive.objects.filter(zipped=filepath).first()

            if not self.settings.rehash_files and archive:
                crc32 = archive.crc32
            else:
                crc32 = calc_crc32(os.path.join(self.settings.MEDIA_ROOT, filepath))

            if archive:
                if args.force_rematch:
                    self.logger.info("Doing a forced rematch")
                elif archive.match_type in self.settings.rematch_file_list or args.rematch_wrong_filesize:
                    if self.settings.rematch_file:
                        self.logger.info("File was already matched before, but rematch is ordered")
                    else:
                        self.logger.info("File was already matched before, not rematching")
                        continue
                else:
                    self.logger.info("Match already saved, skipping")
                    continue
            else:
                # Test for corrupt files
                except_at_open = False
                return_error = None
                try:
                    my_zip = ZipFile(os.path.join(self.settings.MEDIA_ROOT, filepath), 'r')
                    return_error = my_zip.testzip()
                    my_zip.close()
                except (BadZipFile, NotImplementedError):
                    except_at_open = True
                if except_at_open or return_error:
                    self.logger.warning("File check on zipfile failed on file: {}, marking as corrupt.".format(filepath))
                    values = {
                        'title': title,
                        'title_jpn': '',
                        'zipped': filepath,
                        'crc32': crc32,
                        'match_type': 'corrupt',
                        'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'source_type': 'folder'
                    }
                    if self.settings.archive_reason:
                        values.update({'reason': self.settings.archive_reason})
                    if self.settings.archive_details:
                        values.update({'details': self.settings.archive_details})
                    if self.settings.archive_source:
                        values.update({'source_type': self.settings.archive_source})
                    Archive.objects.update_or_create_by_values_and_gid(values, None, zipped=filepath)
                    continue

                # Look for previous matches
                archive = Archive.objects.filter(crc32=crc32).first()
                if archive:
                    if self.settings.copy_match_file:
                        self.logger.info("Found previous match by CRC32, copying its values")
                        values = {
                            'title': archive.title,
                            'title_jpn': archive.title_jpn,
                            'zipped': filepath,
                            'crc32': crc32,
                            'match_type': archive.match_type,
                            'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'gallery_id': archive.gallery_id,
                            'source_type': archive.source_type
                        }
                        if self.settings.archive_reason:
                            values.update({'reason': self.settings.archive_reason})
                        if self.settings.archive_details:
                            values.update({'details': self.settings.archive_details})
                        if self.settings.archive_source:
                            values.update({'source_type': self.settings.archive_source})
                        Archive.objects.add_or_update_from_values(values, zipped=filepath)
                        continue
                    else:
                        self.logger.info("Matching independently and ignoring previous match")

            match_result = False

            start_time = time.perf_counter()

            match_type = ''
            match_title = ''
            match_link = ''
            match_count = 0

            for i, matcher in enumerate(matchers_list):
                if i > 0:
                    time.sleep(self.settings.wait_timer)
                self.logger.info("Matching with: {}".format(matcher[0]))
                if matcher[0].start_match(filepath, crc32):
                    match_type = matcher[0].found_by
                    match_title = matcher[0].match_title or ''
                    match_link = matcher[0].match_link or ''
                    match_count = matcher[0].match_count
                    match_result = True
                    break

            end_time = time.perf_counter()

            self.logger.info("Time taken to match file {}: {:.2f} seconds.".format(filepath, (end_time - start_time)))

            if not match_result and not do_not_replace:
                self.logger.info('Could not match with any matcher, adding as non-match.')
                values = {
                    'title': title,
                    'title_jpn': '',
                    'zipped': filepath,
                    'crc32': crc32,
                    'match_type': 'non-match',
                    'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                    'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                    'source_type': 'folder'
                }
                if self.settings.archive_reason:
                    values.update({'reason': self.settings.archive_reason})
                if self.settings.archive_details:
                    values.update({'details': self.settings.archive_details})
                if self.settings.archive_source:
                    values.update({'source_type': self.settings.archive_source})
                archive = Archive.objects.update_or_create_by_values_and_gid(values, None, zipped=filepath)
                if self.settings.internal_matches_for_non_matches:
                    self.logger.info('Generating possible internal matches.')
                    archive.generate_possible_matches(cutoff=0.4, clear_title=True)
                    self.logger.info('Generated matches for {}, found {}'.format(
                        archive.zipped.path, archive.possible_matches.count()
                    ))
            elif match_result:
                result_message = (
                    "Matched title: {}\n"
                    "Matched link: {}\n"
                    "Matched type: {}\n"
                    "Match count: {}\n".format(match_title, match_link, match_type, match_count)
                )
                self.logger.info(result_message)

    self.logger.info('Folder crawler done.')
def compare_by_hash(self, zip_path: str) -> bool:
    if not os.path.isfile(zip_path):
        return False

    crc32 = calc_crc32(zip_path)

    api_url = urljoin(self.own_settings.url, constants.api_path)
    logger.info("Querying URL: {}".format(api_url))

    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['params'] = {'match': True, 'crc32': crc32}

    response = request_with_retries(
        api_url,
        request_dict,
        post=False,
        retries=3
    )

    if not response:
        logger.info("Got no response from server")
        return False

    response_data = response.json()

    matches_links = set()

    if 'error' in response_data:
        logger.info("Got error from server: {}".format(response_data['error']))
        return False

    for gallery in response_data:
        if 'link' in gallery:
            matches_links.add(gallery['link'])
        if 'gallery_container' in gallery and gallery['gallery_container']:
            if self.settings.gallery_model:
                gallery_container = self.settings.gallery_model.objects.filter(
                    gid=gallery['gallery_container'], provider=gallery['provider']
                )
                first_gallery = gallery_container.first()
                if first_gallery:
                    gallery['gallery_container_gid'] = first_gallery.gid
        if 'magazine' in gallery and gallery['magazine']:
            if self.settings.gallery_model:
                magazine = self.settings.gallery_model.objects.filter(
                    gid=gallery['magazine'], provider=gallery['provider']
                )
                first_magazine = magazine.first()
                if first_magazine:
                    gallery['magazine_gid'] = first_magazine.gid
        if 'posted' in gallery:
            if gallery['posted'] != 0:
                gallery['posted'] = datetime.fromtimestamp(int(gallery['posted']), timezone.utc)
            else:
                gallery['posted'] = None
        self.values_array.append(GalleryData(**gallery))

    self.gallery_links = list(matches_links)
    if len(self.gallery_links) > 0:
        self.found_by = self.name
        return True
    else:
        return False
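# Hedged usage sketch: a caller would feed compare_by_hash() a local zip and,
# on a True result, read gallery_links / values_array for the candidates.
# The matcher variable and the path here are illustrative only:
#   if matcher.compare_by_hash('/data/media/galleries/sample.zip'):
#       print("Matched by:", matcher.found_by)
#       for link in matcher.gallery_links:
#           print("Candidate gallery:", link)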