def job(self) -> None:
    """Timed worker loop: sleep until the next scheduled run, then queue eligible galleries for a metadata update."""
    while not self.stop.is_set():
        seconds_to_wait = self.wait_until_next_run()
        if self.stop.wait(timeout=seconds_to_wait):
            return
        if self.settings.autoupdater.enable:
            current_settings = Settings(load_from_config=self.settings.config)
            current_settings.keep_dl_type = True
            current_settings.silent_processing = True
            current_settings.config['allowed']['replace_metadata'] = 'yes'
            # Close the DB connection inherited by this thread so Django opens a fresh one on next use.
            connection.close()
            start_date = django_tz.now() - timedelta(seconds=int(self.timer)) - timedelta(days=self.settings.autoupdater.buffer_back)
            end_date = django_tz.now() - timedelta(days=self.settings.autoupdater.buffer_after)
            to_update_providers = current_settings.autoupdater.providers
            galleries = Gallery.objects.eligible_for_use(
                posted__gte=start_date,
                posted__lte=end_date,
                provider__in=to_update_providers
            )
            if not galleries:
                logger.info(
                    "No galleries posted from {} to {} need updating. Providers: {}".format(
                        start_date,
                        end_date,
                        ", ".join(to_update_providers)
                    )
                )
            else:
                # Leave only info downloaders, then leave only the enabled auto-updated providers.
                downloaders = current_settings.provider_context.get_downloaders_name_priority(current_settings, filter_name='info')
                downloaders_names = [x[0] for x in downloaders if x[0].replace("_info", "") in to_update_providers]
                current_settings.allow_downloaders_only(downloaders_names, True, True, True)
                url_list = [x.get_link() for x in galleries]
                logger.info(
                    "Starting timed auto updater, updating {} galleries "
                    "posted from {} to {}. Providers: {}".format(
                        len(url_list),
                        start_date,
                        end_date,
                        ", ".join(to_update_providers)
                    )
                )
                url_list.append('--update-mode')
                self.web_queue.enqueue_args_list(url_list, override_options=current_settings)
        self.update_last_run(django_tz.now())
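# The loop above leans on plumbing from its scheduler base class that is not shown in this excerpt:
# a stop Event, wait_until_next_run(), self.timer and update_last_run(). A minimal sketch of that
# pattern follows; the class and attribute names here are illustrative assumptions, not the
# project's actual base class.
import threading
from datetime import datetime, timedelta, timezone


class BaseTimedWorker:
    """Sketch of the stop-event/timer pattern job() relies on (hypothetical names)."""

    def __init__(self, timer_seconds: float) -> None:
        self.timer = timer_seconds            # seconds between runs
        self.stop = threading.Event()         # set() from outside to shut the worker down
        self.last_run = datetime.now(timezone.utc)

    def wait_until_next_run(self) -> float:
        # Seconds left until last_run + timer, clamped at zero so we never wait a negative amount.
        next_run = self.last_run + timedelta(seconds=self.timer)
        return max(0.0, (next_run - datetime.now(timezone.utc)).total_seconds())

    def update_last_run(self, when: datetime) -> None:
        self.last_run = when

    def stop_running(self) -> None:
        # Wakes any pending stop.wait() immediately, so job() returns on its next check.
        self.stop.set()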
def process_downloaded_archive(self, archive: Archive) -> None:
    """Check a downloaded archive's zip integrity and stored values, re-queueing it from panda_archive when needed."""
    if os.path.isfile(archive.zipped.path):
        except_at_open = False
        return_error = None
        try:
            my_zip = ZipFile(archive.zipped.path, 'r')
            return_error = my_zip.testzip()
            my_zip.close()
        except (BadZipFile, NotImplementedError):
            except_at_open = True
        if except_at_open or return_error:
            if 'panda' in archive.source_type:
                self.logger.error(
                    "For archive: {}, file check on downloaded zipfile failed on file: {}, "
                    "forcing download as panda_archive to fix it.".format(archive, archive.zipped.path)
                )
                crc32 = calc_crc32(archive.zipped.path)
                Archive.objects.add_or_update_from_values({'crc32': crc32}, pk=archive.pk)
                if self.web_queue and archive.gallery:
                    temp_settings = Settings(load_from_config=self.settings.config)
                    temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                    self.web_queue.enqueue_args_list((archive.gallery.get_link(),), override_options=temp_settings)
                return
            else:
                self.logger.warning(
                    "For archive: {}, file check failed on downloaded zipfile: {}. "
                    "Check the file manually.".format(archive, archive.zipped.path)
                )
        crc32 = calc_crc32(archive.zipped.path)
        filesize = get_zip_filesize(archive.zipped.path)
        filecount = filecount_in_zip(archive.zipped.path)
        values = {
            'crc32': crc32,
            'filesize': filesize,
            'filecount': filecount,
        }
        updated_archive = Archive.objects.add_or_update_from_values(values, pk=archive.pk)
        if archive.gallery and updated_archive.filesize != updated_archive.gallery.filesize:
            if Archive.objects.filter(gallery=updated_archive.gallery, filesize=updated_archive.gallery.filesize):
                self.logger.info(
                    "For archive: {} size does not match gallery, "
                    "but there's already another archive that matches.".format(updated_archive)
                )
                return
            if 'panda' in archive.source_type:
                self.logger.info(
                    "For archive: {} size does not match gallery, "
                    "downloading again from panda_archive.".format(updated_archive)
                )
                if self.web_queue:
                    temp_settings = Settings(load_from_config=self.settings.config)
                    temp_settings.allow_downloaders_only(['panda_archive'], True, True, True)
                    self.web_queue.enqueue_args_list(
                        (updated_archive.gallery.get_link(), ),
                        override_options=temp_settings
                    )
            else:
                self.logger.warning(
                    "For archive: {} size does not match gallery. Check the file manually.".format(archive)
                )
def json_parser(request: HttpRequest) -> HttpResponse:
    """JSON API endpoint used by the internal pages, the userscript and the remotesite command."""
    response = {}
    if request.method == 'POST':
        if not request.body:
            response['error'] = 'Empty request'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        data = json.loads(request.body.decode("utf-8"))
        if 'api_key' not in data:
            response['error'] = 'Missing API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        elif data['api_key'] != crawler_settings.api_key:
            response['error'] = 'Incorrect API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        # send some 'ok' back
        else:
            if 'operation' not in data or 'args' not in data:
                response['error'] = 'Wrong format'
            else:
                args = data['args']
                response = {}
                # Used by internal pages and userscript
                if data['operation'] == 'webcrawler' and 'link' in args:
                    if not crawler_settings.workers.web_queue:
                        response['error'] = 'The webqueue is not running'
                    elif 'downloader' in args:
                        current_settings = Settings(load_from_config=crawler_settings.config)
                        if not current_settings.workers.web_queue:
                            response['error'] = 'The webqueue is not running'
                        else:
                            current_settings.allow_downloaders_only([args['downloader']], True, True, True)
                            archive = None
                            parsers = current_settings.provider_context.get_parsers(current_settings, crawler_logger)
                            for parser in parsers:
                                if parser.id_from_url_implemented():
                                    urls_filtered = parser.filter_accepted_urls((args['link'], ))
                                    for url_filtered in urls_filtered:
                                        gallery_gid = parser.id_from_url(url_filtered)
                                        if gallery_gid:
                                            archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                    if urls_filtered:
                                        break
                            current_settings.workers.web_queue.enqueue_args_list((args['link'],), override_options=current_settings)
                            if archive:
                                response['message'] = "Archive exists, crawling to check for redownload: " + args['link']
                            else:
                                response['message'] = "Crawling: " + args['link']
                    else:
                        if 'parentLink' in args:
                            parent_archive = None
                            parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                            for parser in parsers:
                                if parser.id_from_url_implemented():
                                    urls_filtered = parser.filter_accepted_urls((args['parentLink'],))
                                    for url_filtered in urls_filtered:
                                        gallery_gid = parser.id_from_url(url_filtered)
                                        if gallery_gid:
                                            parent_archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                    if urls_filtered:
                                        break
                            if parent_archive:
                                link = parent_archive.gallery.get_link()
                                if 'action' in args and args['action'] == 'replaceFound':
                                    parent_archive.gallery.mark_as_deleted()
                                    parent_archive.gallery = None
                                    parent_archive.delete_all_files()
                                    parent_archive.delete_files_but_archive()
                                    parent_archive.delete()
                                    response['message'] = "Crawling: " + args['link'] + ", deleting parent: " + link
                                    crawler_settings.workers.web_queue.enqueue_args(args['link'])
                                elif 'action' in args and args['action'] == 'queueFound':
                                    response['message'] = "Crawling: " + args['link'] + ", keeping parent: " + link
                                    crawler_settings.workers.web_queue.enqueue_args(args['link'])
                                else:
                                    response['message'] = "Please confirm deletion of parent: " + link
                                    response['action'] = 'confirmDeletion'
                            else:
                                archive = None
                                parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                                for parser in parsers:
                                    if parser.id_from_url_implemented():
                                        urls_filtered = parser.filter_accepted_urls((args['link'],))
                                        for url_filtered in urls_filtered:
                                            gallery_gid = parser.id_from_url(url_filtered)
                                            if gallery_gid:
                                                archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                        if urls_filtered:
                                            break
                                if archive:
                                    response['message'] = "Archive exists, crawling to check for redownload: " + args['link']
                                else:
                                    response['message'] = "Crawling: " + args['link']
                                crawler_settings.workers.web_queue.enqueue_args(args['link'])
                        else:
                            archive = None
                            parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                            for parser in parsers:
                                if parser.id_from_url_implemented():
                                    urls_filtered = parser.filter_accepted_urls((args['link'],))
                                    for url_filtered in urls_filtered:
                                        gallery_gid = parser.id_from_url(url_filtered)
                                        if gallery_gid:
                                            archive = Archive.objects.filter(gallery__gid=gallery_gid).first()
                                    if urls_filtered:
                                        break
                            if archive:
                                response['message'] = "Archive exists, crawling to check for redownload: " + args['link']
                            else:
                                response['message'] = "Crawling: " + args['link']
                            crawler_settings.workers.web_queue.enqueue_args(args['link'])
                    if not response:
                        response['error'] = 'Could not parse request'
                    return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
                # Used by remotesite command
                elif data['operation'] == 'archive_request':
                    archives_query = Archive.objects.filter_non_existent(crawler_settings.MEDIA_ROOT, gallery__gid__in=args)
                    archives = [
                        {
                            'gid': archive.gallery.gid,
                            'id': archive.id,
                            'zipped': archive.zipped.name,
                            'filesize': archive.filesize
                        } for archive in archives_query
                    ]
                    response_text = json.dumps({'result': archives})
                    return HttpResponse(response_text, content_type="application/json; charset=utf-8")
                # Used by remotesite command
                elif data['operation'] in ('queue_archives', 'queue_galleries'):
                    urls = args
                    new_urls_set = set()
                    gids_set = set()
                    parsers = crawler_settings.provider_context.get_parsers(crawler_settings, crawler_logger)
                    for parser in parsers:
                        if parser.id_from_url_implemented():
                            urls_filtered = parser.filter_accepted_urls(urls)
                            for url in urls_filtered:
                                gid = parser.id_from_url(url)
                                gids_set.add(gid)
                    gids_list = list(gids_set)
                    existing_galleries = Gallery.objects.filter(gid__in=gids_list)
                    for gallery_object in existing_galleries:
                        if gallery_object.is_submitted():
                            gallery_object.delete()
                        # Delete queued galleries that failed and do not have archives.
                        elif data['operation'] == 'queue_archives' and "failed" in gallery_object.dl_type and not gallery_object.archive_set.all():
                            gallery_object.delete()
                        elif data['operation'] == 'queue_archives' and not gallery_object.archive_set.all():
                            gallery_object.delete()
                    already_present_gids = list(Gallery.objects.filter(gid__in=gids_list).values_list('gid', flat=True))
                    # new_gids = list(gids_set - set(already_present_gids))
                    for parser in parsers:
                        if parser.id_from_url_implemented():
                            urls_filtered = parser.filter_accepted_urls(urls)
                            for url in urls_filtered:
                                gid = parser.id_from_url(url)
                                if gid not in already_present_gids:
                                    new_urls_set.add(url)
                    pages_links = list(new_urls_set)
                    if len(pages_links) > 0:
                        current_settings = Settings(load_from_config=crawler_settings.config)
                        if data['operation'] == 'queue_galleries':
                            current_settings.allow_type_downloaders_only('info')
                        elif data['operation'] == 'queue_archives':
                            if 'archive_reason' in data:
                                current_settings.archive_reason = data['archive_reason']
                            if 'archive_details' in data:
                                current_settings.archive_details = data['archive_details']
                            current_settings.allow_type_downloaders_only('fake')
                        if current_settings.workers.web_queue:
                            current_settings.workers.web_queue.enqueue_args_list(pages_links, override_options=current_settings)
                    else:
                        pages_links = []
                    return HttpResponse(json.dumps({'result': str(len(pages_links))}), content_type="application/json; charset=utf-8")
                # Used by remotesite command
                elif data['operation'] == 'links':
                    links = args
                    if len(links) > 0:
                        crawler_settings.workers.web_queue.enqueue_args_list(links)
                    return HttpResponse(json.dumps({'result': str(len(links))}), content_type="application/json; charset=utf-8")
                # Used by archive page
                elif data['operation'] == 'match_archive':
                    archive = Archive.objects.filter(pk=args['archive'])
                    if archive:
                        generate_possible_matches_for_archives(
                            archive,
                            filters=(args['match_filter'],),
                            logger=crawler_logger,
                            match_local=False,
                            match_web=True,
                        )
                    return HttpResponse(
                        json.dumps({'message': 'web matcher done, check the logs for results'}),
                        content_type="application/json; charset=utf-8"
                    )
                elif data['operation'] == 'match_archive_internally':
                    archive = Archive.objects.get(pk=args['archive'])
                    if archive:
                        clear_title = True if 'clear' in args else False
                        provider_filter = args.get('provider', '')
                        try:
                            cutoff = float(request.GET.get('cutoff', '0.4'))
                        except ValueError:
                            cutoff = 0.4
                        try:
                            max_matches = int(request.GET.get('max-matches', '10'))
                        except ValueError:
                            max_matches = 10
                        archive.generate_possible_matches(
                            clear_title=clear_title,
                            provider_filter=provider_filter,
                            cutoff=cutoff,
                            max_matches=max_matches
                        )
                        archive.save()
                    return HttpResponse(
                        json.dumps({'message': 'internal matcher done, check the archive for results'}),
                        content_type="application/json; charset=utf-8"
                    )
                else:
                    response['error'] = 'Unknown function'
    elif request.method == 'GET':
        data = request.GET
        if 'api_key' not in data:
            response['error'] = 'Missing API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        elif data['api_key'] != crawler_settings.api_key:
            response['error'] = 'Incorrect API key'
            return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
        # send some 'ok' back
        else:
            if 'gc' in data:
                args = data.copy()
                for k in gallery_filter_keys:
                    if k not in args:
                        args[k] = ''
                keys = ("sort", "asc_desc")
                for k in keys:
                    if k not in args:
                        args[k] = ''
                # args = data
                # Already authorized by API key.
                args['public'] = False
                results = filter_galleries_no_request(args)
                if not results:
                    return HttpResponse(json.dumps([]), content_type="application/json; charset=utf-8")
                response_text = json.dumps(
                    [{
                        'gid': gallery.gid,
                        'token': gallery.token,
                        'title': gallery.title,
                        'title_jpn': gallery.title_jpn,
                        'category': gallery.category,
                        'uploader': gallery.uploader,
                        'comment': gallery.comment,
                        'posted': int(timestamp_or_zero(gallery.posted)),
                        'filecount': gallery.filecount,
                        'filesize': gallery.filesize,
                        'expunged': gallery.expunged,
                        'rating': gallery.rating,
                        'hidden': gallery.hidden,
                        'fjord': gallery.fjord,
                        'public': gallery.public,
                        'provider': gallery.provider,
                        'dl_type': gallery.dl_type,
                        'tags': gallery.tag_list(),
                        'link': gallery.get_link(),
                        'thumbnail': request.build_absolute_uri(reverse('viewer:gallery-thumb', args=(gallery.pk,))) if gallery.thumbnail else '',
                        'thumbnail_url': gallery.thumbnail_url
                    } for gallery in results],
                    # indent=2,
                    sort_keys=True,
                    ensure_ascii=False,
                )
                return HttpResponse(response_text, content_type="application/json; charset=utf-8")
            else:
                response['error'] = 'Unknown function'
    else:
        response['error'] = 'Unsupported method: {}'.format(request.method)
    return HttpResponse(json.dumps(response), content_type="application/json; charset=utf-8")
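# For reference, a POST to this endpoint carries 'api_key', 'operation' and 'args' in a JSON body,
# as checked above. A minimal client call might look like the sketch below; the URL and the API key
# value are placeholders, since the actual routing for json_parser is not shown in this excerpt.
import json
import urllib.request

API_URL = 'http://localhost:8000/jsonapi/'   # hypothetical route for json_parser

payload = {
    'api_key': 'your-api-key',               # must match crawler_settings.api_key
    'operation': 'webcrawler',               # or 'archive_request', 'queue_galleries', 'queue_archives', 'links', ...
    'args': {'link': 'https://example.com/some-gallery/'},
}

req = urllib.request.Request(
    API_URL,
    data=json.dumps(payload).encode('utf-8'),
    headers={'Content-Type': 'application/json'},
)

with urllib.request.urlopen(req) as resp:
    # Expect something like {'message': 'Crawling: ...'} or {'error': '...'} back.
    print(json.loads(resp.read().decode('utf-8')))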