def get_archive_and_gallery_titles() -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    """Collect (sanitized title, gallery id) pairs for matched archives and usable galleries.

    Returns a two-tuple: pairs from archives that have a real match type, and
    pairs from eligible galleries that are not tagged as 'replaced'.
    """
    eligible_galleries = Gallery.objects.eligible_for_use()
    matched_archives = Archive.objects.exclude(match_type__in=('', 'non-match'))

    archive_pairs = [
        (replace_illegal_name(archive.title), archive.gallery_id)
        for archive in matched_archives
    ]
    gallery_pairs = [
        (replace_illegal_name(gallery.title), gallery.id)
        for gallery in eligible_galleries
        if 'replaced' not in gallery.tag_list()
    ]
    return archive_pairs, gallery_pairs
def start_download(self) -> None:
    """Queue a Hath original-resolution download and precompute the local filename.

    Requires root/gid/token/archiver_key on the gallery; parses the remote HTML
    response to confirm the download was queued. Sets ``return_code`` to 1 on
    success, 0 on any failure.
    """
    if not self.gallery:
        return
    # All four fields are needed to build the archiver request.
    if not (self.gallery.root and self.gallery.gid and self.gallery.token and self.gallery.archiver_key):
        logger.error(
            'Missing required data -> root: {}, gid: {}, token: {}, archiver_key: {}.'
            .format(
                self.gallery.root,
                self.gallery.gid,
                self.gallery.token,
                self.gallery.archiver_key,
            ))
        self.return_code = 0
        return
    r = self.request_hath_download(self.gallery.root, self.gallery.gid,
                                   self.gallery.token, self.gallery.archiver_key)
    if r and r.status_code == 200:
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.content, 'html.parser')
        # The confirmation page contains this sentence when queueing succeeded.
        container = soup.find(text=re.compile(
            'An original resolution download has been queued for client'))
        if not container:
            logger.error("Could not find expected text in response.")
            self.return_code = 0
            return
        # The client name is in a sibling <strong> tag; log it if present.
        client_id = container.parent.find('strong')
        if client_id:
            logger.info("Queued download to client: {}".format(
                client_id.get_text()))
        to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)
        # Local target: "<title> [<gid>].zip" inside the hath download folder.
        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(
                self.own_settings.hath_dl_folder,
                replace_illegal_name(
                    to_use_filename + " [" + str(self.gallery.gid) + "]") + '.zip'))
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        if r:
            logger.error('Did not get a 200 response, text: {}'.format(r.text))
        else:
            logger.error('Did not get a response')
        self.return_code = 0
def connect_and_download(self, client: TorrentClient, torrent_link: str) -> None:
    """Connect to the torrent client, submit the torrent and record expected names.

    On success sets ``expected_torrent_name``/``expected_torrent_extension``,
    marks the file as downloaded and fills in the gallery's local filename.
    """
    if not self.gallery:
        return None
    client.connect()
    download_dir = self.settings.torrent['download_dir']
    if client.send_url:
        result = client.add_url(torrent_link, download_dir=download_dir)
    else:
        torrent_payload = self.general_utils.get_torrent(
            torrent_link,
            self.own_settings.cookies,
            convert_to_base64=client.convert_to_base64)
        result = client.add_torrent(torrent_payload, download_dir=download_dir)
    if not result:
        self.return_code = 0
        logger.error("There was an error adding the torrent to the client")
        return None
    # Prefer the name reported by the client; otherwise derive one from gallery data.
    if client.expected_torrent_name:
        base_name = client.expected_torrent_name
    else:
        base_name = replace_illegal_name(
            get_base_filename_string_from_gallery_data(self.gallery))
    self.expected_torrent_name = "{} [{}]".format(base_name, self.gallery.gid)
    self.expected_torrent_extension = client.expected_torrent_extension or ".zip"
    self.fileDownloaded = 1
    self.return_code = 1
    if client.total_size > 0:
        self.gallery.filesize = client.total_size
    self.gallery.filename = os.path.join(
        self.own_settings.torrent_dl_folder,
        replace_illegal_name(self.expected_torrent_name) + self.expected_torrent_extension)
    logger.info(
        "Torrent added, expecting downloaded name: {}, local name: {}".
        format(self.expected_torrent_name, self.gallery.filename))
def start_download(self) -> None:
    """Download an archive from a generic HTTP server into the archive folder.

    Derives the filename from the Content-Disposition header, falling back to
    the last URL path segment. Sets ``return_code`` to 1 on success, 0 otherwise.
    """
    if not self.gallery or not self.gallery.link:
        return
    logger.info(
        "Downloading an archive from a generic HTTP server: {}".format(
            self.gallery.link))
    request_dict = construct_request_dict(self.settings, self.own_settings)
    # BUGFIX: stream must be the boolean True, not the string 'True'
    # (requests only checks truthiness, so the string worked by accident).
    request_file = requests.get(self.gallery.link,
                                stream=True,
                                **request_dict)
    filename = get_filename_from_cd(
        request_file.headers.get('content-disposition'))
    if not filename:
        # BUGFIX: str.find returns -1 (truthy) when absent and 0 (falsy) at
        # position 0; the explicit comparison is the correct containment test.
        if self.gallery.link.find('/') != -1:
            filename = self.gallery.link.rsplit('/', 1)[1]
    if not filename:
        logger.error("Could not find a filename for link: {}".format(
            self.gallery.link))
        self.return_code = 0
        # BUGFIX: previously fell through and crashed calling .replace on None.
        return
    self.gallery.title = filename.replace(".zip", "")
    self.gallery.filename = replace_illegal_name(
        available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(self.own_settings.archive_dl_folder, filename)))
    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    # Stream the body to disk in chunks to avoid holding the archive in memory.
    with open(filepath, 'wb') as fo:
        for chunk in request_file.iter_content(4096):
            fo.write(chunk)
    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        logger.error("Could not download archive")
        self.return_code = 0
def start_download(self) -> None:
    """Fetch a zip archive from a Panda Backup-like source and store it locally."""
    if not self.gallery or not self.gallery.temp_archive:
        return
    archive_link = self.gallery.temp_archive['link']
    logger.info(
        "Downloading an archive: {} from a Panda Backup-like source: {}".
        format(self.gallery.title, archive_link))
    sanitized_name = replace_illegal_name(
        get_base_filename_string_from_gallery_data(self.gallery))
    # TODO: File could be cbz.
    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder,
                     sanitized_name + '.zip'))
    request_dict = construct_request_dict(self.settings, self.own_settings)
    request_dict['stream'] = True
    request_file = request_with_retries(archive_link, request_dict)
    if not request_file:
        logger.error("Could not download archive")
        self.return_code = 0
        return
    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    with open(filepath, 'wb') as download_target:
        for chunk in request_file.iter_content(4096):
            download_target.write(chunk)
    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize <= 0:
        logger.error("Could not download archive")
        self.return_code = 0
        return
    self.crc32 = calc_crc32(filepath)
    self.fileDownloaded = 1
    self.return_code = 1
def start_download(self) -> None:
    """Download the gallery zip via its archiver key URL; delete the file on failure."""
    if not self.gallery or not self.gallery.link or not self.gallery.archiver_key:
        return
    to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)
    to_use_filename = replace_illegal_name(to_use_filename)
    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder,
                     to_use_filename + '.zip'))
    request_dict = construct_request_dict(self.settings, self.own_settings)
    # BUGFIX: stream must be the boolean True, not the string 'True'
    # (requests only checks truthiness, so the string worked by accident).
    request_file = requests.get(self.gallery.archiver_key,
                                stream=True,
                                **request_dict)
    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    # Stream the response body to disk in chunks.
    with open(filepath, 'wb') as fo:
        for chunk in request_file.iter_content(4096):
            fo.write(chunk)
    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        logger.error("Could not download archive")
        # Remove the empty/invalid file so it doesn't block future retries.
        os.remove(filepath)
        self.return_code = 0
def start_download(self) -> None:
    """Download the archive at ``gallery.archiver_key['link']`` into the archive folder."""
    if not self.gallery:
        return
    self.logger.info(
        "Downloading an archive: {} from a Panda Backup-like source: {}".
        format(self.gallery.title, self.gallery.archiver_key['link']))
    self.gallery.title = replace_illegal_name(self.gallery.title)
    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder,
                     self.gallery.title + '.zip'))
    # BUGFIX: stream must be the boolean True, not the string 'True'
    # (requests only checks truthiness, so the string worked by accident).
    request_file = requests.get(self.gallery.archiver_key['link'],
                                stream=True,
                                headers=self.settings.requests_headers,
                                timeout=self.settings.timeout_timer,
                                cookies=self.own_settings.cookies)
    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    # Stream the response body to disk in chunks.
    with open(filepath, 'wb') as fo:
        for chunk in request_file.iter_content(4096):
            fo.write(chunk)
    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        self.logger.error("Could not download archive")
        self.return_code = 0
def start_download(self) -> None:
    """Download a gallery with the external gallery-dl tool and store it as a zip.

    Runs gallery-dl with ``--zip`` into a temp directory, then moves the newest
    produced file into the archive download folder. Sets ``return_code`` to 1
    on success, 0 on any failure.
    """
    if not self.gallery or not self.gallery.link:
        return
    # Resolve the executable: an explicit path wins over the bare name.
    if self.settings.gallery_dl.executable_path:
        exe_path_to_use = shutil.which(self.settings.gallery_dl.executable_path)
    else:
        exe_path_to_use = shutil.which(self.settings.gallery_dl.executable_name)
    if not exe_path_to_use:
        self.return_code = 0
        logger.error("The gallery-dl executable was not found")
        return
    directory_path = mkdtemp()
    arguments = ["--zip", "--dest", "{}".format(directory_path)]
    if self.own_settings.proxy:
        arguments.append("--proxy")
        arguments.append("{}".format(self.own_settings.proxy))
    if self.settings.gallery_dl.config_file:
        arguments.append("--config")
        arguments.append("{}".format(self.settings.gallery_dl.config_file))
    if self.settings.gallery_dl.extra_arguments:
        arguments.append("{}".format(self.settings.gallery_dl.extra_arguments))
    arguments.append("{}".format(self.gallery.link))
    logger.info("Calling gallery-dl: {}.".format(" ".join(
        [exe_path_to_use, *arguments])))
    # shell=False (list form) keeps the link from being shell-interpreted.
    process_result = subprocess.run([exe_path_to_use, *arguments],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    universal_newlines=True)
    if process_result.stderr:
        self.return_code = 0
        logger.error(
            "An error was captured when running gallery-dl: {}".format(
                process_result.stderr))
        # BUGFIX: clean up the temp directory on the error paths too.
        shutil.rmtree(directory_path, ignore_errors=True)
        return
    if process_result.returncode != 0:
        self.return_code = 0
        logger.error("Return code was not 0: {}".format(
            process_result.returncode))
        shutil.rmtree(directory_path, ignore_errors=True)
        return
    # If we downloaded more than one file, get the latest one
    output_path = ''
    file_name = ''
    for (dir_path, dir_names, filenames) in os.walk(directory_path):
        for current_file in filenames:
            file_name = current_file
            output_path = os.path.join(dir_path, current_file)
    # CLEANUP: the original performed two overlapping existence checks; one
    # combined check covers both the empty-path and missing-file cases.
    if not output_path or not os.path.isfile(output_path):
        self.return_code = 0
        logger.error(
            "The resulting download file was not found: {}".format(file_name))
        shutil.rmtree(directory_path, ignore_errors=True)
        return
    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder,
                     replace_illegal_name(file_name)))
    self.gallery.title = os.path.splitext(file_name)[0]
    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    shutil.move(output_path, filepath)
    shutil.rmtree(directory_path, ignore_errors=True)
    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        logger.error("Could not download archive")
        self.return_code = 0
def match_internal(archives: ArchiveQuerySet, providers: Iterable[str], logger: OptionalLogger, cutoff: float = 0.4, max_matches: int = 20, match_by_filesize: bool = True) -> None:
    """Match archives against the local gallery database by title and (optionally) filesize.

    For each provider (or 'all' when none given), builds a list of sanitized
    gallery titles, then records close title matches and exact filesize matches
    as ``ArchiveMatches`` rows.
    """
    galleries_per_provider: Dict[str, GalleryQuerySet] = {}
    galleries_title_id_per_provider: Dict[str, List[Tuple[str, str]]] = {}
    # One queryset per provider filter; 'all' means no provider restriction.
    if providers:
        for provider in providers:
            galleries_per_provider[
                provider] = Gallery.objects.eligible_for_use(
                    provider__contains=provider)
    else:
        galleries_per_provider['all'] = Gallery.objects.eligible_for_use()
    # Collect (sanitized title, pk) pairs; both main and Japanese titles count.
    for provider, galleries in galleries_per_provider.items():
        galleries_title_id_per_provider[provider] = list()
        for gallery in galleries:
            if gallery.title:
                galleries_title_id_per_provider[provider].append(
                    (replace_illegal_name(gallery.title), gallery.pk))
            if gallery.title_jpn:
                galleries_title_id_per_provider[provider].append(
                    (replace_illegal_name(gallery.title_jpn), gallery.pk))
    for i, archive in enumerate(archives, start=1):  # type: ignore
        for provider, galleries_title_id in galleries_title_id_per_provider.items():
            # Provider-specific matchers may normalize the archive filename
            # before comparison; fall back to the plain path-derived title.
            if provider != 'all':
                matchers = crawler_settings.provider_context.get_matchers(
                    crawler_settings,
                    logger,
                    filter_name="{}_title".format(provider),
                    force=True)
                if matchers:
                    adj_title = matchers[0][0].format_to_compare_title(
                        archive.zipped.name)
                else:
                    adj_title = get_title_from_path(archive.zipped.name)
            else:
                adj_title = get_title_from_path(archive.zipped.name)
            similar_list_provider = get_list_closer_gallery_titles_from_list(
                adj_title, galleries_title_id, cutoff, max_matches)
            if similar_list_provider is not None:
                for similar in similar_list_provider:
                    gallery = Gallery.objects.get(pk=similar[1])
                    # update_or_create keeps previously recorded matches fresh.
                    ArchiveMatches.objects.update_or_create(
                        archive=archive,
                        gallery=gallery,
                        match_type='title',
                        match_accuracy=similar[2])
                if logger:
                    logger.info(
                        "{} of {}: Found {} matches (internal search) from title for archive: {}, using provider filter: {}"
                        .format(i, archives.count(),
                                len(similar_list_provider), archive.title,
                                provider))
        # Filesize matching is optional and only meaningful for a known size.
        if not match_by_filesize or archive.filesize <= 0:
            continue
        galleries_same_size = Gallery.objects.filter(filesize=archive.filesize)
        if galleries_same_size.exists():
            if logger:
                logger.info(
                    "{} of {}: Found {} matches (internal search) from filesize for archive: {}"
                    .format(i, str(archives.count()),
                            str(galleries_same_size.count()), archive.title))
            for similar_gallery in galleries_same_size:
                gallery = Gallery.objects.get(pk=similar_gallery.pk)
                # Exact filesize match is recorded with full accuracy.
                ArchiveMatches.objects.update_or_create(archive=archive,
                                                        gallery=gallery,
                                                        match_type='size',
                                                        match_accuracy=1)
def match_archives_from_gallery_titles(archives: ArchiveQuerySet, logger: OptionalLogger = None, cutoff: float = 0.4, max_matches: int = 20, provider: str = '') -> None:
    """Populate possible gallery matches for archives, by title and by filesize.

    When ``archives`` is empty, processes all archives marked 'non-match'.
    Intended to run in a background thread: any exception is logged to the
    'viewer.threads' logger instead of propagating.
    """
    try:
        if not archives:
            non_match_archives = Archive.objects.filter(match_type='non-match')
        else:
            non_match_archives = archives
        if non_match_archives:
            galleries_title_id = []
            if provider:
                galleries = Gallery.objects.eligible_for_use(
                    provider__contains=provider)
            else:
                galleries = Gallery.objects.eligible_for_use()
            # Both main and Japanese titles are candidates for matching.
            for gallery in galleries:
                if gallery.title:
                    galleries_title_id.append(
                        (replace_illegal_name(gallery.title), gallery.pk))
                if gallery.title_jpn:
                    galleries_title_id.append(
                        (replace_illegal_name(gallery.title_jpn), gallery.pk))
            if logger:
                logger.info("Trying to match against gallery database, "
                            "{} archives with no match, matching against: {}, "
                            "number of galleries: {}, cutoff: {}".format(
                                non_match_archives.count(), provider,
                                galleries.count(), cutoff))
            for i, archive in enumerate(non_match_archives, start=1):
                # Provider matchers may normalize the filename before comparing.
                matchers = crawler_settings.provider_context.get_matchers(
                    crawler_settings,
                    logger,
                    filter_name="{}_title".format(provider),
                    force=True)
                if matchers:
                    adj_title = matchers[0][0].format_to_compare_title(
                        archive.zipped.name)
                else:
                    adj_title = get_title_from_path(archive.zipped.name)
                similar_list = get_list_closer_gallery_titles_from_list(
                    adj_title, galleries_title_id, cutoff, max_matches)
                if similar_list is not None:
                    # Replace any previously stored candidates with fresh ones.
                    archive.possible_matches.clear()
                    if logger:
                        logger.info(
                            "{} of {}: Found {} matches from title for {}".
                            format(i, non_match_archives.count(),
                                   len(similar_list), archive.zipped.name))
                    for similar in similar_list:
                        gallery = Gallery.objects.get(pk=similar[1])
                        ArchiveMatches.objects.create(
                            archive=archive,
                            gallery=gallery,
                            match_type='title',
                            match_accuracy=similar[2])
                # Filesize matching needs a known, positive size.
                if archive.filesize <= 0:
                    continue
                galleries_same_size = Gallery.objects.filter(
                    filesize=archive.filesize)
                if galleries_same_size.exists():
                    if logger:
                        logger.info(
                            "{} of {}: Found {} matches from filesize for {}".
                            format(i, str(non_match_archives.count()),
                                   str(galleries_same_size.count()),
                                   archive.zipped.name))
                    for similar_gallery in galleries_same_size:
                        gallery = Gallery.objects.get(pk=similar_gallery.pk)
                        ArchiveMatches.objects.create(archive=archive,
                                                      gallery=gallery,
                                                      match_type='size',
                                                      match_accuracy=1)
        if logger:
            logger.info("Matching ended")
        return
    except BaseException:
        # Thread boundary: swallow and log everything so the worker thread
        # never dies silently with an unreported traceback.
        thread_logger = logging.getLogger('viewer.threads')
        thread_logger.error(traceback.format_exc())
def start_download(self) -> None:
    """Add the gallery's torrent to the configured client and record expected names.

    When the client cannot report the torrent name itself, the raw .torrent
    payload is bencode-decoded to recover the name and extension.
    """
    if not self.gallery or not self.gallery.link:
        return
    client = get_torrent_client(self.settings.torrent)
    if not client:
        self.return_code = 0
        logger.error("No torrent client was found")
        return
    torrent_link = self.get_download_link(self.gallery.link)
    logger.info("Adding torrent to client.")
    client.connect()
    # Magnet links and URL-capable clients skip fetching the .torrent file.
    if client.send_url or torrent_link.startswith('magnet:'):
        result = client.add_url(
            torrent_link,
            download_dir=self.settings.torrent['download_dir'])
    else:
        torrent_data = self.general_utils.get_torrent(
            torrent_link,
            self.own_settings.cookies,
            convert_to_base64=client.convert_to_base64)
        result = client.add_torrent(
            torrent_data,
            download_dir=self.settings.torrent['download_dir'])
        if client.expected_torrent_name == '':
            # Fallback: decode the torrent metadata ourselves to learn the
            # payload name/extension the client will produce.
            from core.libs.bencoding import Decoder
            try:
                if client.convert_to_base64 and type(torrent_data) is str:
                    torrent_data = cast(str, torrent_data)
                    torrent_metadata = Decoder(
                        base64.decodebytes(
                            torrent_data.encode('utf-8'))).decode()
                else:
                    torrent_data = cast(bytes, torrent_data)
                    torrent_metadata = Decoder(torrent_data).decode()
                client.expected_torrent_name = os.path.splitext(
                    torrent_metadata[b'info'][b'name'])[0]
                client.expected_torrent_extension = os.path.splitext(
                    torrent_metadata[b'info'][b'name'])[1]
            except (RuntimeError, EOFError):
                self.return_code = 0
                logger.error("Error decoding torrent data: {!r}".format(
                    torrent_data))
                return
    if result:
        if client.expected_torrent_name:
            self.expected_torrent_name = client.expected_torrent_name
        else:
            # Last resort: use the sanitized gallery link as the name.
            self.expected_torrent_name = "{}".format(
                replace_illegal_name(self.gallery.link))
        if client.expected_torrent_extension:
            self.expected_torrent_extension = client.expected_torrent_extension
        else:
            self.expected_torrent_extension = ".zip"
        self.fileDownloaded = 1
        self.return_code = 1
        if client.total_size > 0:
            self.gallery.filesize = client.total_size
        else:
            self.gallery.filesize = 0
        self.gallery.filename = available_filename(
            self.settings.MEDIA_ROOT,
            os.path.join(
                self.own_settings.torrent_dl_folder,
                replace_illegal_name(self.expected_torrent_name) +
                self.expected_torrent_extension))
    else:
        self.return_code = 0
        logger.error("There was an error adding the torrent to the client")
def copy_all_missing(self, mode, archives: Iterable[Archive] = None):
    """Copy or move locally downloaded (hath folders / torrent files) into the media root.

    ``mode == 'local_move'`` moves files; any other value copies them. Hath
    folders are matched by the '[gid]' suffix and zipped; torrent entries are
    matched by cleaned filename, zipped if they are directories, and optionally
    converted from rar to zip. Each finished archive is passed to
    ``process_downloaded_archive``.
    NOTE(review): ``archives`` should be typed Optional; left as-is to avoid
    changing imports.
    """
    files_torrent = []
    files_hath = []
    if not archives:
        found_archives: Iterable[Archive] = list(Archive.objects.filter_by_dl_remote())
    else:
        found_archives = archives
    if not found_archives:
        return
    # Only archives whose zip is missing locally need to be fetched.
    for archive in found_archives:
        if not os.path.isfile(archive.zipped.path):
            if 'torrent' in archive.match_type:
                files_torrent.append(archive)
            elif 'hath' in archive.match_type:
                files_hath.append(archive)
    if len(files_torrent) + len(files_hath) == 0:
        return
    # Hath downloads
    if len(files_hath) > 0:
        files_matched_hath = []
        # Hath galleries arrive as directories named '... [gid]'.
        for matched_file in os.listdir(self.settings.providers['panda'].local_hath_folder):
            if os.path.isfile(os.path.join(self.settings.providers['panda'].local_hath_folder, matched_file)):
                continue
            m = re.search(r'.*?\[(\d+)\]$', matched_file)
            if m:
                for archive in files_hath:
                    if m.group(1) == archive.gallery.gid:
                        files_matched_hath.append(
                            [matched_file, archive.zipped.path, int(archive.filesize), archive])
        # img_dir: [folder name, target zip path, expected size, archive].
        for img_dir in files_matched_hath:
            total_remote_size = 0
            remote_files = []
            directory = os.path.join(self.settings.providers['panda'].local_hath_folder, img_dir[0])
            for img_file in os.listdir(directory):
                # galleryinfo.txt is hath metadata, not gallery content.
                if not os.path.isfile(os.path.join(directory, img_file)) or img_file == 'galleryinfo.txt':
                    continue
                total_remote_size += os.stat(
                    os.path.join(directory, img_file)).st_size
                remote_files.append(
                    os.path.join(directory, img_file))
            # Size mismatch means the hath client hasn't finished downloading.
            if total_remote_size != img_dir[2]:
                self.logger.info(
                    "For archive: {archive}, folder: {folder} "
                    "has not completed the download ({current}/{total}), skipping".format(
                        archive=img_dir[3],
                        folder=img_dir[0],
                        current=filesizeformat(total_remote_size),
                        total=filesizeformat(img_dir[2])
                    )
                )
                continue
            self.logger.info(
                "For archive: {archive}, creating zip "
                "for folder (unknown), {image_count} images".format(
                    archive=img_dir[3],
                    filename=img_dir[1],
                    image_count=len(remote_files)
                ))
            # Stage files in a temp dir, zip it flat, then clean up.
            dir_path = mkdtemp()
            for img_file_original in remote_files:
                img_file = os.path.split(img_file_original)[1]
                if mode == 'local_move':
                    shutil.move(img_file_original, os.path.join(dir_path, img_file))
                else:
                    shutil.copy(img_file_original, os.path.join(dir_path, img_file))
            with ZipFile(os.path.join(self.settings.MEDIA_ROOT, img_dir[1]), 'w') as archive_file:
                for (root_path, _, file_names) in os.walk(dir_path):
                    for current_file in file_names:
                        archive_file.write(
                            os.path.join(root_path, current_file),
                            arcname=os.path.basename(current_file))
            shutil.rmtree(dir_path, ignore_errors=True)
            self.process_downloaded_archive(img_dir[3])
    # Torrent downloads
    if len(files_torrent) > 0:
        files_matched_torrent = []
        for filename in os.listdir(self.settings.torrent['download_dir']):
            for archive in files_torrent:
                # Strip the ' [gid]' suffix before comparing names.
                if archive.gallery:
                    cleaned_torrent_name = os.path.splitext(
                        os.path.basename(archive.zipped.path))[0].replace(' [' + archive.gallery.gid + ']', '')
                else:
                    cleaned_torrent_name = os.path.splitext(os.path.basename(archive.zipped.path))[0]
                if replace_illegal_name(os.path.splitext(filename)[0]) in cleaned_torrent_name:
                    # Second element: True when the entry is a directory.
                    files_matched_torrent.append([filename, not os.path.isfile(
                        os.path.join(self.settings.torrent['download_dir'], filename)), archive])
        # matched_file: [name, is_directory, archive].
        for matched_file in files_matched_torrent:
            target = os.path.join(self.settings.torrent['download_dir'], matched_file[0])
            if matched_file[1]:
                # Directory download: stage contents and zip them flat.
                self.logger.info(
                    "For archive: {archive}, creating zip for folder: (unknown)".format(
                        archive=matched_file[2],
                        filename=matched_file[0],
                    ))
                dir_path = mkdtemp()
                for img_file in os.listdir(target):
                    if not os.path.isfile(os.path.join(target, img_file)):
                        continue
                    if mode == 'local_move':
                        shutil.move(os.path.join(target, img_file), os.path.join(dir_path, img_file))
                    else:
                        shutil.copy(os.path.join(target, img_file), os.path.join(dir_path, img_file))
                with ZipFile(matched_file[2].zipped.path, 'w') as archive_file:
                    for (root_path, _, file_names) in os.walk(dir_path):
                        for current_file in file_names:
                            archive_file.write(
                                os.path.join(root_path, current_file),
                                arcname=os.path.basename(current_file))
                shutil.rmtree(dir_path, ignore_errors=True)
            else:
                # Single-file download: copy/move it straight to the target path.
                self.logger.info(
                    "For archive: {archive}, downloading file: (unknown)".format(
                        archive=matched_file[2],
                        filename=matched_file[0],
                    ))
                if mode == 'local_move':
                    shutil.move(target, matched_file[2].zipped.path)
                else:
                    shutil.copy(target, matched_file[2].zipped.path)
            if self.settings.convert_rar_to_zip and os.path.splitext(matched_file[0])[1].lower() == ".rar":
                self.logger.info(
                    "For archive: {}, converting rar: {} to zip".format(
                        matched_file[2], matched_file[2].zipped.path
                    )
                )
                convert_rar_to_zip(matched_file[2].zipped.path)
            self.process_downloaded_archive(matched_file[2])
def download_all_missing(self, archives: Iterable[Archive] = None) -> None:
    """Download missing archives over FTPS: hath folders are zipped, torrents fetched.

    Splits the pending archives into hath and torrent groups, then downloads
    each from its remote directory with up to 10 retries per file, rebuilding
    the FTP connection on timeouts/resets. Finished archives are handed to
    ``process_downloaded_archive``.
    NOTE(review): ``archives`` should be typed Optional; left as-is to avoid
    changing imports.
    """
    files_torrent = []
    files_hath = []
    if not archives:
        found_archives: Iterable[Archive] = list(Archive.objects.filter_by_dl_remote())
    else:
        found_archives = archives
    if not found_archives:
        return
    for archive in found_archives:
        if 'torrent' in archive.match_type:
            files_torrent.append(archive)
        elif 'hath' in archive.match_type:
            files_hath.append(archive)
    if len(files_torrent) + len(files_hath) == 0:
        return
    self.start_connection()
    if not self.ftps:
        self.logger.error(
            "Cannot download the archives, the FTP connection is not initialized."
        )
        return None
    # Hath downloads
    if len(files_hath) > 0:
        self.set_current_dir(self.settings.providers['panda'].remote_hath_dir)
        # self.ftps.encoding = 'utf8'
        files_matched_hath = []
        # Remote hath galleries are directories named '... [gid]'.
        for line in self.ftps.mlsd(facts=["type"]):
            if line[1]["type"] != 'dir':
                continue
            m = re.search(r'.*?\[(\d+)\]$', line[0])
            if m:
                for archive in files_hath:
                    if m.group(1) == archive.gallery.gid:
                        files_matched_hath.append(
                            (line[0], archive.zipped.path, int(archive.filesize), archive))
        # matched_file_hath: (folder, zip path, expected size, archive).
        for matched_file_hath in files_matched_hath:
            total_remote_size = 0
            remote_ftp_tuples = []
            for img_file_tuple in self.ftps.mlsd(path=matched_file_hath[0], facts=["type", "size"]):
                # galleryinfo.txt is hath metadata, not gallery content.
                if img_file_tuple[1]["type"] != 'file' or img_file_tuple[0] == 'galleryinfo.txt':
                    continue
                total_remote_size += int(img_file_tuple[1]["size"])
                remote_ftp_tuples.append((img_file_tuple[0], img_file_tuple[1]["size"]))
            # Size mismatch means the remote hath client hasn't finished.
            if total_remote_size != matched_file_hath[2]:
                self.logger.info(
                    "For archive: {archive}, remote folder: {folder} "
                    "has not completed the download ({current}/{total}), skipping".format(
                        archive=matched_file_hath[3],
                        folder=matched_file_hath[0],
                        current=filesizeformat(total_remote_size),
                        total=filesizeformat(matched_file_hath[2])
                    )
                )
                continue
            self.logger.info(
                "For archive: {archive}, downloading and creating zip "
                "for folder (unknown), {image_count} images".format(
                    archive=matched_file_hath[3],
                    filename=matched_file_hath[1],
                    image_count=len(remote_ftp_tuples)
                ))
            dir_path = mkdtemp()
            self.current_download['total'] = len(remote_ftp_tuples)
            for count, remote_file in enumerate(sorted(remote_ftp_tuples), start=1):
                # Retry each file up to 10 times, rebuilding the FTP session
                # after connection resets or timeouts.
                for retry_count in range(10):
                    try:
                        with open(os.path.join(dir_path, remote_file[0]), "wb") as file:
                            self.current_download['index'] = count
                            self.write_file_update_progress(
                                'RETR %s' % (str(matched_file_hath[0]) + "/" + remote_file[0]),
                                file.write, int(remote_file[1])
                            )
                    except (ConnectionResetError, socket.timeout, TimeoutError):
                        self.logger.error("Hath download failed for file {} of {}, restarting connection...".format(
                            count, len(remote_ftp_tuples))
                        )
                        self.ftps.close()
                        self.start_connection()
                        self.set_current_dir(self.settings.providers['panda'].remote_hath_dir)
                    else:
                        break
            # Zip the staged files flat, then discard the temp directory.
            with ZipFile(os.path.join(self.settings.MEDIA_ROOT, matched_file_hath[1]), 'w') as archive_file:
                for (root_path, _, file_names) in os.walk(dir_path):
                    for current_file in file_names:
                        archive_file.write(
                            os.path.join(root_path, current_file),
                            arcname=os.path.basename(current_file))
            shutil.rmtree(dir_path, ignore_errors=True)
            self.process_downloaded_archive(matched_file_hath[3])
    # Torrent downloads
    if len(files_torrent) > 0:
        self.set_current_dir(self.settings.ftps['remote_torrent_dir'])
        self.ftps.encoding = 'utf8'
        files_matched_torrent = []
        for line in self.ftps.mlsd(facts=["type", "size"]):
            if not line[0]:
                continue
            if 'type' not in line[1]:
                continue
            if line[1]["type"] != 'dir' and line[1]["type"] != 'file':
                continue
            for archive in files_torrent:
                # Strip the ' [gid]' suffix before comparing names.
                if archive.gallery:
                    cleaned_torrent_name = os.path.splitext(
                        os.path.basename(archive.zipped.path))[0].replace(' [' + archive.gallery.gid + ']', '')
                else:
                    cleaned_torrent_name = os.path.splitext(os.path.basename(archive.zipped.path))[0]
                if replace_illegal_name(os.path.splitext(line[0])[0]) in cleaned_torrent_name:
                    # (name, type, size, archive); directories have no size yet.
                    if line[1]["type"] == 'dir':
                        files_matched_torrent.append((line[0], line[1]["type"], 0, archive))
                    else:
                        files_matched_torrent.append((line[0], line[1]["type"], int(line[1]["size"]), archive))
        for matched_file_torrent in files_matched_torrent:
            if matched_file_torrent[1] == 'dir':
                # Directory torrent: download every file, then zip them flat.
                dir_path = mkdtemp()
                remote_ftp_files = list(self.ftps.mlsd(path=matched_file_torrent[0], facts=["type", "size"]))
                self.current_download['total'] = len(remote_ftp_files)
                self.logger.info(
                    "For archive: {archive}, downloading and creating zip "
                    "for folder (unknown), {image_count} images".format(
                        archive=matched_file_torrent[3],
                        filename=matched_file_torrent[0],
                        image_count=len(remote_ftp_files)
                    ))
                for count, img_file_tuple in enumerate(remote_ftp_files):
                    if img_file_tuple[1]["type"] != 'file':
                        continue
                    # Same 10-retry / reconnect policy as the hath branch.
                    for retry_count in range(10):
                        try:
                            with open(os.path.join(dir_path, img_file_tuple[0]), "wb") as file:
                                self.current_download['index'] = count
                                self.write_file_update_progress(
                                    'RETR %s' % (str(matched_file_torrent[0]) + "/" + img_file_tuple[0]),
                                    file.write, int(img_file_tuple[1]["size"])
                                )
                        except (ConnectionResetError, socket.timeout, TimeoutError):
                            self.logger.error("Torrent download failed for folder, restarting connection...")
                            self.ftps.close()
                            self.start_connection()
                            self.set_current_dir(self.settings.ftps['remote_torrent_dir'])
                        else:
                            break
                with ZipFile(matched_file_torrent[3].zipped.path, 'w') as archive_file:
                    for (root_path, _, file_names) in os.walk(dir_path):
                        for current_file in file_names:
                            archive_file.write(
                                os.path.join(root_path, current_file),
                                arcname=os.path.basename(current_file))
                shutil.rmtree(dir_path, ignore_errors=True)
            else:
                # Single-file torrent: retrieve it straight to the target path.
                self.logger.info(
                    "For archive: {archive} downloading remote file: {remote} to local file: {local}".format(
                        archive=matched_file_torrent[3],
                        remote=matched_file_torrent[0],
                        local=matched_file_torrent[3].zipped.path
                    )
                )
                self.current_download['total'] = 1
                for retry_count in range(10):
                    try:
                        with open(matched_file_torrent[3].zipped.path, "wb") as file:
                            self.current_download['index'] = 1
                            self.write_file_update_progress(
                                'RETR %s' % matched_file_torrent[0],
                                file.write, matched_file_torrent[2])
                    except (ConnectionResetError, socket.timeout, TimeoutError):
                        self.logger.error("Torrent download failed for archive, restarting connection...")
                        self.ftps.close()
                        self.start_connection()
                        self.set_current_dir(self.settings.ftps['remote_torrent_dir'])
                    else:
                        break
            if self.settings.convert_rar_to_zip and os.path.splitext(matched_file_torrent[0])[1].lower() == ".rar":
                self.logger.info(
                    "For archive: {}, converting rar: {} to zip".format(
                        matched_file_torrent[3], matched_file_torrent[3].zipped.path
                    )
                )
                convert_rar_to_zip(matched_file_torrent[3].zipped.path)
            self.process_downloaded_archive(matched_file_torrent[3])
    self.ftps.close()
def start_download(self) -> None:
    """Download a Mega link with the external megadl tool and register the result.

    Runs megadl into a temp directory, reads the produced filenames from its
    stdout (``--print-names``), and moves the first file into the archive
    folder. Sets ``return_code`` to 1 on success, 0 on any failure.
    """
    if not self.gallery or not self.gallery.link:
        return
    # Resolve the executable: an explicit path wins over the bare name.
    if self.own_settings.megadl_executable_path:
        exe_path_to_use = shutil.which(
            self.own_settings.megadl_executable_path)
    else:
        exe_path_to_use = shutil.which(
            self.own_settings.megadl_executable_name)
    if not exe_path_to_use:
        self.return_code = 0
        self.logger.error("The megadl tools was not found")
        return
    directory_path = mkdtemp()
    arguments = [
        "--no-progress", "--print-names", "--path",
        "{}".format(directory_path)
    ]
    if self.own_settings.proxy:
        arguments.append("--proxy")
        arguments.append("{}".format(self.own_settings.proxy))
    if self.own_settings.extra_megadl_arguments:
        arguments.append("{}".format(
            self.own_settings.extra_megadl_arguments))
    arguments.append("{}".format(self.gallery.link))
    self.logger.info("Calling megadl: {}.".format(" ".join(
        [exe_path_to_use, *arguments])))
    process_result = subprocess.run([exe_path_to_use, *arguments],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    universal_newlines=True)
    message_text = process_result.stdout
    # --print-names should emit one line per downloaded file.
    if not message_text:
        self.return_code = 0
        self.logger.error(
            "The link could not be downloaded, no output was generated after running megadl"
        )
        return
    if process_result.stderr:
        self.return_code = 0
        self.logger.error(
            "An error was captured when running megadl: {}".format(
                process_result.stderr))
        return
    # megadl reports unusable links on stdout with this warning.
    if "WARNING: Skipping invalid" in message_text:
        self.return_code = 0
        self.logger.error(
            "The link could not be downloaded: {}".format(message_text))
        return
    # If we downloaded a folder, just take the first result
    file_names = message_text.splitlines()
    file_name = file_names[0]
    output_path = os.path.join(directory_path, file_name)
    if not os.path.isfile(output_path):
        self.return_code = 0
        self.logger.error(
            "The resulting download file was not found: {}".format(
                file_name))
        return
    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(self.own_settings.archive_dl_folder,
                     replace_illegal_name(file_name)))
    self.gallery.title = os.path.splitext(file_name)[0]
    filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename)
    shutil.move(output_path, filepath)
    shutil.rmtree(directory_path, ignore_errors=True)
    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(
        filepath)
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(filepath)
        self.fileDownloaded = 1
        self.return_code = 1
    else:
        self.logger.error("Could not download archive")
        self.return_code = 0
def start_download(self) -> None:
    """Download a gallery by walking its reader pages one image at a time.

    Scrapes the gallery page for the "read" URL, then fetches sequential
    reader pages (``.../page/N``), saving each page's image into a temp
    directory, and finally zips the images into the archive folder.
    Sets ``self.return_code`` to 0 on failure, 1 on success.
    """
    if not self.gallery or not self.gallery.link:
        return

    to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)

    to_use_filename = replace_illegal_name(to_use_filename)

    # Reserve a non-conflicting zip filename under the archive dl folder.
    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(
            self.own_settings.archive_dl_folder,
            to_use_filename + '.zip'))

    # Use cached page content when available; otherwise fetch the gallery page.
    if self.gallery.content:
        soup_1 = BeautifulSoup(self.gallery.content, 'html.parser')
    else:
        request_dict = construct_request_dict(self.settings, self.own_settings)
        gallery_page = requests.get(
            self.gallery.link,
            **request_dict
        )
        soup_1 = BeautifulSoup(gallery_page.content, 'html.parser')

    # NOTE(review): if the anchor is missing, find() returns None and the
    # ['href'] subscript raises TypeError — confirm upstream guarantees.
    gallery_read = soup_1.find("a", {"class": "x-btn-rounded"})['href']

    # Some URLs are really bad formatted
    gallery_read = re.sub(
        r'.*(' + re.escape(constants.main_page) + r'/manga/read/.+/0/1/).*',
        r'\1',
        gallery_read,
        flags=re.DOTALL
    )

    # Fall back to guessing the reader URL from the gallery link when the
    # scraped one is empty, blacklisted, or points off-site.
    if not gallery_read or gallery_read in constants.bad_urls or not gallery_read.startswith(constants.main_page):
        logger.warning("Reading gallery page not available, trying to guess the name.")
        gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery)

    if not gallery_read.endswith('page/1'):
        gallery_read += 'page/1'

    # Captures the page prefix and the page number so we can increment it.
    page_regex = re.compile(r"(.*?page/)(\d+)/*$", re.IGNORECASE)

    last_image = ''

    directory_path = mkdtemp()

    logger.info('Downloading gallery: {}'.format(self.gallery.title))

    second_pass = False

    # Fetch reader pages until a terminal condition: 404 past page 1,
    # repeated image (site serves last image again past the end), image 404,
    # or an unparsable page URL.
    while True:
        try:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_read_page = requests.get(
                gallery_read,
                **request_dict
            )
        except requests.exceptions.MissingSchema:
            logger.error("Malformed URL: {}, skipping".format(gallery_read))
            self.return_code = 0
            shutil.rmtree(directory_path, ignore_errors=True)
            return
        if gallery_read_page.status_code == 404:
            if gallery_read.endswith('page/1'):
                # First page 404'd: retry once with an alternative guessed
                # URL before giving up entirely.
                if not second_pass:
                    gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery, False)
                    second_pass = True
                    continue
                logger.error("Last page was the first one: {}, stopping".format(gallery_read))
                self.return_code = 0
                shutil.rmtree(directory_path, ignore_errors=True)
                return
            # A 404 past page 1 means we walked off the end of the gallery.
            # yield("Got to last gallery page, stopping")
            break
        soup_2 = BeautifulSoup(gallery_read_page.content, 'html.parser')
        img_find = soup_2.find("img", {"class": "open"})
        if not img_find:
            logger.error("Gallery not available, skipping")
            self.return_code = 0
            shutil.rmtree(directory_path, ignore_errors=True)
            return
        img = img_find['src']
        # Same image twice in a row: treated as the end-of-gallery marker.
        if last_image != '' and last_image == img:
            # yield('Current image is the same as previous, skipping')
            break
        last_image = img
        img_name = os.path.basename(img)
        request_dict = construct_request_dict(self.settings, self.own_settings)
        request_file = requests.get(
            img,
            **request_dict
        )
        if request_file.status_code == 404:
            # yield("Got to last image, stopping")
            break
        with open(os.path.join(directory_path, img_name), "wb") as fo:
            for chunk in request_file.iter_content(4096):
                fo.write(chunk)
        # Advance to the next page by incrementing the trailing page number.
        page_match = page_regex.search(gallery_read)
        if page_match:
            gallery_read = page_match.group(1) + str(int(page_match.group(2)) + 1)
        else:
            # yield("Could not match to change page, stopping")
            break

    file_path = os.path.join(
        self.settings.MEDIA_ROOT,
        self.gallery.filename
    )

    # Zip every downloaded image (flattened: arcname drops directories).
    with ZipFile(file_path, 'w') as archive:
        for (root_path, _, file_names) in os.walk(directory_path):
            for current_file in file_names:
                archive.write(
                    os.path.join(root_path, current_file),
                    arcname=os.path.basename(current_file))
    shutil.rmtree(directory_path, ignore_errors=True)

    self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(file_path)
    # NOTE(review): no else-branch here — on a zero-size zip, return_code is
    # left at its prior value rather than being set to 0; confirm the
    # attribute's default elsewhere in the class.
    if self.gallery.filesize > 0:
        self.crc32 = calc_crc32(file_path)
        self.fileDownloaded = 1
        self.return_code = 1
def start_download(self) -> None:
    """Download a gallery by scraping the full image-URL list from its
    reader page, fetching each image, and zipping the result.

    Unlike the page-walking variant, this extracts all image URLs in one
    request via ``get_img_urls_from_gallery_read_page``. Sets
    ``self.return_code`` to 0 on failure, 1 on success.
    """
    if not self.gallery or not self.gallery.link:
        return

    to_use_filename = get_base_filename_string_from_gallery_data(self.gallery)

    to_use_filename = replace_illegal_name(to_use_filename)

    # Reserve a non-conflicting zip filename under the archive dl folder.
    self.gallery.filename = available_filename(
        self.settings.MEDIA_ROOT,
        os.path.join(
            self.own_settings.archive_dl_folder,
            to_use_filename + '.zip'))

    # Use cached page content when available; otherwise fetch the gallery page.
    if self.gallery.content:
        soup_1 = BeautifulSoup(self.gallery.content, 'html.parser')
    else:
        request_dict = construct_request_dict(self.settings, self.own_settings)
        gallery_page = requests.get(
            self.gallery.link,
            **request_dict
        )
        soup_1 = BeautifulSoup(gallery_page.content, 'html.parser')

    # NOTE(review): if the anchor is missing, find() returns None and the
    # ['href'] subscript raises TypeError — confirm upstream guarantees.
    gallery_read = soup_1.find("a", {"class": "x-btn-rounded"})['href']

    # Some URLs are really bad formatted
    gallery_read = re.sub(
        r'.*(' + re.escape(constants.main_page) + r'/manga/read/.+/0/1/).*',
        r'\1',
        gallery_read,
        flags=re.DOTALL
    )

    # Fall back to guessing the reader URL from the gallery link when the
    # scraped one is empty, blacklisted, or points off-site.
    if not gallery_read or gallery_read in constants.bad_urls or not gallery_read.startswith(constants.main_page):
        logger.warning("Reading gallery page not available, trying to guess the name.")
        gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery)

    if not gallery_read.endswith('page/1'):
        gallery_read += 'page/1'

    logger.info('Downloading gallery: {}'.format(self.gallery.title))

    try:
        request_dict = construct_request_dict(self.settings, self.own_settings)
        gallery_read_page = requests.get(
            gallery_read,
            **request_dict
        )
    except requests.exceptions.MissingSchema:
        logger.error("Malformed URL: {}, skipping".format(gallery_read))
        self.return_code = 0
        return

    # First attempt failed: retry once with an alternative guessed URL.
    if gallery_read_page.status_code != 200:
        gallery_read = guess_gallery_read_url(self.gallery.link, self.gallery, False)
        try:
            request_dict = construct_request_dict(self.settings, self.own_settings)
            gallery_read_page = requests.get(
                gallery_read,
                **request_dict
            )
        except requests.exceptions.MissingSchema:
            logger.error("Malformed URL: {}, skipping".format(gallery_read))
            self.return_code = 0
            return

    if gallery_read_page.status_code == 200:
        image_urls = self.get_img_urls_from_gallery_read_page(gallery_read_page.text)

        if not image_urls:
            logger.error("Could not find image links, archive not downloaded")
            self.return_code = 0
            return

        directory_path = mkdtemp()

        for image_url in image_urls:
            img_name = os.path.basename(image_url)
            request_dict = construct_request_dict(self.settings, self.own_settings)
            request_file = requests.get(
                image_url,
                **request_dict
            )
            if request_file.status_code == 404:
                # NOTE(review): a mid-list 404 breaks out and zips whatever
                # was downloaded so far — partial archives are possible.
                logger.warning("Image link reported 404 error, stopping")
                break
            with open(os.path.join(directory_path, img_name), "wb") as fo:
                for chunk in request_file.iter_content(4096):
                    fo.write(chunk)

        file_path = os.path.join(
            self.settings.MEDIA_ROOT,
            self.gallery.filename
        )

        # Zip every downloaded image (flattened: arcname drops directories).
        with ZipFile(file_path, 'w') as archive:
            for (root_path, _, file_names) in os.walk(directory_path):
                for current_file in file_names:
                    archive.write(
                        os.path.join(root_path, current_file),
                        arcname=os.path.basename(current_file))
        shutil.rmtree(directory_path, ignore_errors=True)

        self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo(file_path)
        # NOTE(review): no else-branch on a zero-size zip; return_code keeps
        # its prior value — confirm the attribute's default elsewhere.
        if self.gallery.filesize > 0:
            self.crc32 = calc_crc32(file_path)
            self.fileDownloaded = 1
            self.return_code = 1
    else:
        logger.error("Wrong HTML code returned, could not download, link: {}".format(gallery_read))
        self.return_code = 0
def start_download(self) -> None: if not self.gallery: return to_use_filename = get_base_filename_string_from_gallery_data( self.gallery) to_use_filename = replace_illegal_name(to_use_filename) self.gallery.filename = available_filename( self.settings.MEDIA_ROOT, os.path.join(self.own_settings.archive_dl_folder, to_use_filename + '.zip')) if not (self.gallery.root and self.gallery.gid and self.gallery.token and self.gallery.archiver_key): logger.error( 'Missing required data -> root: {}, gid: {}, token: {}, archiver_key: {}.' .format( self.gallery.root, self.gallery.gid, self.gallery.token, self.gallery.archiver_key, )) self.return_code = 0 return r = self.request_archive_download(self.gallery.root, self.gallery.gid, self.gallery.token, self.gallery.archiver_key) if not r: logger.error('Could not get download link.') self.return_code = 0 return r.encoding = 'utf-8' if 'Invalid archiver key' in r.text: logger.error("Invalid archiver key received.") self.return_code = 0 else: archive_link = get_archive_link_from_html_page(r.text) if archive_link == '': logger.error( 'Could not find archive link, page text: {}'.format( r.text)) self.return_code = 0 else: m = re.match(r"(.*?)(\?.*?)", archive_link) if m: archive_link = m.group(1) logger.info('Got link: {}, from url: {}'.format( archive_link, r.url)) request_dict = construct_request_dict(self.settings, self.own_settings) request_file = requests.get(archive_link + '?start=1', stream='True', **request_dict) if r and r.status_code == 200: logger.info( 'Downloading gallery: {}.zip'.format(to_use_filename)) filepath = os.path.join(self.settings.MEDIA_ROOT, self.gallery.filename) with open(filepath, 'wb') as fo: for chunk in request_file.iter_content(4096): fo.write(chunk) self.gallery.filesize, self.gallery.filecount = get_zip_fileinfo( filepath) if self.gallery.filesize > 0: self.crc32 = calc_crc32(filepath) self.fileDownloaded = 1 self.return_code = 1 else: logger.error("Could not download archive") self.return_code = 0
def start_crawling(self, arg_line: List[str]) -> None:
    """Entry point of the folder crawler command.

    Parses ``arg_line`` and dispatches on mutually-exclusive flags. The
    maintenance flags (remove/display missing files, recalc CRC32, rename
    to title, internal-title matching) act directly on the database and
    return early. The rematch flags and the default folder-scan branch
    instead build a ``files`` list, which is then run through the
    configured matchers in priority order at the bottom of the method.
    """
    args = self.get_args(arg_line)

    if isinstance(args, ArgumentParserError):
        self.logger.info(str(args))
        return

    files = []
    do_not_replace = False
    values: DataDict = {}

    if args.remove_missing_files:
        # Delete DB rows whose backing zip no longer exists on disk.
        found_archives = Archive.objects.all()

        if found_archives:
            self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
            for archive in found_archives:
                if not os.path.isfile(archive.zipped.path):
                    Archive.objects.delete_by_filter(
                        pk=archive.pk)
        return
    elif args.display_missing_files:
        # Same scan as above, but report-only.
        found_archives = Archive.objects.all()

        if found_archives:
            self.logger.info("Checking {} archives for existence in filesystem".format(found_archives.count()))
            for archive in found_archives:
                if not os.path.isfile(archive.zipped.path):
                    self.logger.info("Filename: {} doesn't exist".format(archive.zipped.path))
        return
    elif args.rematch_non_matches:
        self.settings.rematch_file_list = ['non-match']
        self.settings.rematch_file = True
        # Queue every still-existing non-matched archive for re-matching.
        found_archives = Archive.objects.filter(
            match_type='non-match')

        if found_archives:
            self.logger.info("Scanning {} archives with non-matches".format(found_archives.count()))
            for archive in found_archives:
                if os.path.isfile(archive.zipped.path):
                    files.append(archive.zipped.path)
    elif args.rematch_by_match_type:
        self.settings.rematch_file_list = [args.rematch_by_match_type]
        self.settings.rematch_file = True
        self.settings.replace_metadata = True
        found_archives = Archive.objects.filter(
            match_type=args.rematch_by_match_type)

        if found_archives:
            self.logger.info("Scanning {} archives matched by {}".format(
                found_archives.count(), args.rematch_by_match_type
            ))
            for archive in found_archives:
                if os.path.isfile(archive.zipped.path):
                    files.append(archive.zipped.path)
    elif args.rematch_wrong_filesize:
        self.settings.rematch_file = True
        self.settings.replace_metadata = True
        # do_not_replace: failed rematches later keep the old record instead
        # of overwriting it with a non-match.
        do_not_replace = True
        # NOTE(review): exclude() with two kwargs excludes only rows matching
        # BOTH conditions (non-match AND null gallery) — confirm that is the
        # intent rather than excluding each independently.
        found_archives = Archive.objects.exclude(
            match_type='non-match', gallery_id__isnull=True)

        if found_archives:
            for archive in found_archives:
                if not os.path.isfile(archive.zipped.path):
                    continue
                # NOTE(review): raises AttributeError if archive.gallery is
                # None — relies on the queryset above guaranteeing a gallery.
                if archive.filesize == archive.gallery.filesize:
                    continue
                files.append(archive.zipped.path)
            self.logger.info("Scanning {} archives matched with wrong filesize".format(len(files)))
    elif args.recalc_missing_crc32:
        found_archives = Archive.objects.filter(crc32='')

        if found_archives:
            self.logger.info("Calculating {} archives with missing CRC32".format(found_archives.count()))
            for cnt, archive in enumerate(found_archives):
                if os.path.isfile(archive.zipped.path):
                    crc32 = calc_crc32(
                        archive.zipped.path)
                    self.logger.info("Working on archive {} of {}, CRC32: {}".format((cnt + 1), found_archives.count(), crc32))
                    values = {'crc32': crc32}
                    Archive.objects.add_or_update_from_values(
                        values, pk=archive.pk)
                else:
                    self.logger.info("Archive {} of {}, path: {} does not exist".format(
                        (cnt + 1), found_archives.count(), archive.zipped.path
                    ))
        return
    elif args.all_filenames_to_title:
        # Rename on-disk zips so the filename matches the archive title.
        archives_title_gid = Archive.objects.exclude(
            title='')

        if archives_title_gid:
            self.logger.info("Checking {} galleries".format(archives_title_gid.count()))
            for cnt, archive in enumerate(archives_title_gid):
                current_path = os.path.join(os.path.dirname(
                    archive.zipped.path), replace_illegal_name(archive.title) + '.zip')

                if archive.zipped.path != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                    self.logger.info("Filename should be {} but it's {}".format(current_path, archive.zipped.path))
                    # NOTE(review): this branch is gated on
                    # args.all_filenames_to_title but tests
                    # args.filename_to_title here — confirm this is the
                    # intended flag and not a typo.
                    if args.filename_to_title == 'rename':
                        os.rename(archive.zipped.path, os.path.join(
                            self.settings.MEDIA_ROOT, current_path))
                        values = {'zipped': current_path, }
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
        return
    elif args.rematch_from_internal_gallery_titles:
        # Fuzzy-match non-matched archives against titles already in the DB
        # (galleries first, then other archives) and copy metadata over.
        non_matched_archives = Archive.objects.filter(
            match_type='non-match')

        if non_matched_archives:

            archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

            self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))

            for archive in non_matched_archives:
                adjusted_title = replace_illegal_name(
                    os.path.basename(archive.zipped.path)).replace(".zip", "")

                galleries_id_token = get_closer_gallery_title_from_list(
                    adjusted_title, galleries_title_gid, args.rematch_from_internal_gallery_titles)
                if galleries_id_token is not None:
                    self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                    values = {
                        'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                        'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                        'zipped': archive.zipped.path,
                        'crc32': archive.crc32,
                        'match_type': 'gallery_database',
                        'filesize': archive.filesize,
                        'filecount': archive.filecount,
                        'gallery_id': galleries_id_token[1]
                    }
                    Archive.objects.add_or_update_from_values(
                        values, pk=archive.pk)
                    Gallery.objects.update_by_dl_type(
                        {"dl_type": "folder:filename"}, galleries_id_token[1], "failed")
                else:
                    # No gallery-title match: fall back to other archives'
                    # titles, keeping the original match_type.
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, archives_title_gid, args.rematch_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))
                        values = {
                            'title': Gallery.objects.filter(id=galleries_id_token[1])[0].title,
                            'title_jpn': Gallery.objects.filter(id=galleries_id_token[1])[0].title_jpn,
                            'zipped': archive.zipped.path,
                            'crc32': archive.crc32,
                            'match_type': archive.match_type,
                            'filesize': archive.filesize,
                            'filecount': archive.filecount,
                            'gallery_id': galleries_id_token[1]
                        }
                        Archive.objects.add_or_update_from_values(
                            values, pk=archive.pk)
        return
    elif args.display_match_from_internal_gallery_titles:
        # Report-only version of the branch above: log candidate matches
        # without writing anything.
        non_matched_archives = Archive.objects.filter(
            match_type='non-match')

        if non_matched_archives:

            archives_title_gid, galleries_title_gid = self.get_archive_and_gallery_titles()

            self.logger.info("Matching against archive and gallery database, {} archives with no match".format(non_matched_archives.count()))

            for archive in non_matched_archives:
                adjusted_title = replace_illegal_name(
                    os.path.basename(archive.zipped.path)).replace(".zip", "")

                galleries_id_token = get_closer_gallery_title_from_list(
                    adjusted_title, galleries_title_gid, args.display_match_from_internal_gallery_titles)
                if galleries_id_token is not None:
                    self.logger.info("Path: {}\nGal title: {}".format(adjusted_title, galleries_id_token[0]))
                else:
                    galleries_id_token = get_closer_gallery_title_from_list(
                        adjusted_title, archives_title_gid, args.display_match_from_internal_gallery_titles)
                    if galleries_id_token is not None:
                        self.logger.info("Path: {}\nMatch title: {}".format(adjusted_title, galleries_id_token[0]))
        return
    else:
        # Default branch: collect files from the given folders (or single
        # files), constrained to MEDIA_ROOT to prevent path escape.
        for folder in args.folder:
            p = os.path.normpath(os.path.join(self.settings.MEDIA_ROOT, folder))
            if not p.startswith(self.settings.MEDIA_ROOT):
                continue
            folder = os.path.relpath(p, self.settings.MEDIA_ROOT).replace("\\", "/")

            if os.path.isdir(os.path.join(self.settings.MEDIA_ROOT, folder)):
                for root, _, filenames in os.walk(os.path.join(self.settings.MEDIA_ROOT, str(folder))):
                    for filename in fnmatch.filter(filenames, self.settings.filename_filter):
                        files.append(
                            os.path.relpath(os.path.join(root, filename), self.settings.MEDIA_ROOT))
            elif os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, folder)):
                files.append(folder)

        if args.rename_to_title:
            # Rename collected files to their archive title, then stop.
            self.logger.info("Checking {} galleries".format(len(files)))
            for cnt, filepath in enumerate(files):
                archive = Archive.objects.filter(zipped=filepath).first()
                if archive:
                    current_path = os.path.join(
                        os.path.dirname(filepath), replace_illegal_name(archive.title) + '.zip')

                    if filepath != current_path and not os.path.isfile(os.path.join(self.settings.MEDIA_ROOT, current_path)):
                        self.logger.info("Filename should be {} but it's {}".format(current_path, filepath))
                        if args.rename_to_title == 'rename':
                            os.rename(os.path.join(self.settings.MEDIA_ROOT, filepath), os.path.join(
                                self.settings.MEDIA_ROOT, current_path))
                            values = {'zipped': current_path, }
                            Archive.objects.add_or_update_from_values(
                                values, zipped=filepath)
            return

    if args.set_reason:
        self.settings.archive_reason = args.set_reason

    if args.set_source:
        self.settings.archive_source = args.set_source

    # The creation of the files list ends here. From here onwards, it's processing them.
    if len(files) == 0:
        self.logger.info("No file matching needed, skipping matchers")
    else:
        self.logger.info("Starting checks for {} archives".format(len(files)))

        matchers_list = self.settings.provider_context.get_matchers(self.settings, logger=self.logger)
        for matcher in matchers_list:
            self.logger.info("Using matcher {} with a priority of {}".format(matcher[0].name, matcher[1]))

        for cnt, filepath in enumerate(files):
            self.logger.info("Checking file: {} of {}, path: {}".format((cnt + 1), len(files), filepath))

            # Derive a human title from the filename (underscores -> spaces).
            title = re.sub(
                '[_]', ' ',
                os.path.splitext(os.path.basename(filepath))[0])

            archive = Archive.objects.filter(zipped=filepath).first()

            # Reuse the stored CRC32 unless a rehash is forced or the file
            # is unknown.
            if not self.settings.rehash_files and archive:
                crc32 = archive.crc32
            else:
                crc32 = calc_crc32(
                    os.path.join(self.settings.MEDIA_ROOT, filepath))

            if archive:
                # Known file: decide whether a rematch is allowed.
                if args.force_rematch:
                    self.logger.info("Doing a forced rematch")
                elif archive.match_type in self.settings.rematch_file_list or args.rematch_wrong_filesize:
                    if self.settings.rematch_file:
                        self.logger.info("File was already matched before, but rematch is ordered")
                    else:
                        self.logger.info("File was already matched before, not rematching")
                        continue
                else:
                    self.logger.info("Match already saved, skipping")
                    continue
            else:
                # Test for corrupt files
                except_at_open = False
                return_error = None
                try:
                    my_zip = ZipFile(
                        os.path.join(self.settings.MEDIA_ROOT, filepath), 'r')
                    return_error = my_zip.testzip()
                    my_zip.close()
                except (BadZipFile, NotImplementedError):
                    except_at_open = True
                if except_at_open or return_error:
                    self.logger.warning("File check on zipfile failed on file: {}, marking as corrupt.".format(filepath))
                    values = {
                        'title': title,
                        'title_jpn': '',
                        'zipped': filepath,
                        'crc32': crc32,
                        'match_type': 'corrupt',
                        'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                        'source_type': 'folder'
                    }
                    if self.settings.archive_reason:
                        values.update({'reason': self.settings.archive_reason})
                    if self.settings.archive_details:
                        values.update({'details': self.settings.archive_details})
                    if self.settings.archive_source:
                        values.update({'source_type': self.settings.archive_source})
                    Archive.objects.update_or_create_by_values_and_gid(
                        values, None, zipped=filepath)
                    continue
                # Look for previous matches
                archive = Archive.objects.filter(crc32=crc32).first()
                if archive:
                    if self.settings.copy_match_file:
                        self.logger.info("Found previous match by CRC32, copying its values")
                        values = {
                            'title': archive.title,
                            'title_jpn': archive.title_jpn,
                            'zipped': filepath,
                            'crc32': crc32,
                            'match_type': archive.match_type,
                            'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                            'gallery_id': archive.gallery_id,
                            'source_type': archive.source_type
                        }
                        if self.settings.archive_reason:
                            values.update({'reason': self.settings.archive_reason})
                        if self.settings.archive_details:
                            values.update({'details': self.settings.archive_details})
                        if self.settings.archive_source:
                            values.update({'source_type': self.settings.archive_source})
                        Archive.objects.add_or_update_from_values(
                            values, zipped=filepath)
                        continue
                    else:
                        self.logger.info("Matching independently and ignoring previous match")

            # Run matchers in priority order until the first hit, pausing
            # between providers to avoid hammering remote sites.
            match_result = False

            start_time = time.perf_counter()

            match_type = ''
            match_title = ''
            match_link = ''
            match_count = 0

            for i, matcher in enumerate(matchers_list):
                if i > 0:
                    time.sleep(self.settings.wait_timer)
                self.logger.info("Matching with: {}".format(matcher[0]))
                if matcher[0].start_match(filepath, crc32):
                    match_type = matcher[0].found_by
                    match_title = matcher[0].match_title or ''
                    match_link = matcher[0].match_link or ''
                    match_count = matcher[0].match_count
                    match_result = True
                    break

            end_time = time.perf_counter()

            self.logger.info("Time taken to match file {}: {:.2f} seconds.".format(filepath, (end_time - start_time)))

            if not match_result and not do_not_replace:
                # Record the file as a non-match (skipped when
                # do_not_replace was set by the wrong-filesize branch).
                self.logger.info('Could not match with any matcher, adding as non-match.')
                values = {
                    'title': title,
                    'title_jpn': '',
                    'zipped': filepath,
                    'crc32': crc32,
                    'match_type': 'non-match',
                    'filesize': get_zip_filesize(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                    'filecount': filecount_in_zip(os.path.join(self.settings.MEDIA_ROOT, filepath)),
                    'source_type': 'folder'
                }
                if self.settings.archive_reason:
                    values.update({'reason': self.settings.archive_reason})
                if self.settings.archive_details:
                    values.update({'details': self.settings.archive_details})
                if self.settings.archive_source:
                    values.update({'source_type': self.settings.archive_source})
                archive = Archive.objects.update_or_create_by_values_and_gid(
                    values, None, zipped=filepath)
                if self.settings.internal_matches_for_non_matches:
                    self.logger.info('Generating possible internal matches.')

                    archive.generate_possible_matches(cutoff=0.4, clear_title=True)
                    self.logger.info('Generated matches for {}, found {}'.format(
                        archive.zipped.path,
                        archive.possible_matches.count()
                    ))
            elif match_result:
                result_message = (
                    "Matched title: {}\n"
                    "Matched link: {}\n"
                    "Matched type: {}\n"
                    "Match count: {}\n".format(match_title, match_link, match_type, match_count)
                )
                self.logger.info(result_message)

    self.logger.info('Folder crawler done.')