def ConvertAllParseResultsToFileSeeds(all_parse_results, source_url, file_import_options):
    """Build URL file seeds from a sequence of parse results.

    For every entry of ``all_parse_results`` the top-priority URLs of type
    ``HC.URL_TYPE_DESIRED`` are collected and de-duplicated. A URL that was
    already produced by an earlier entry is skipped, so each URL yields at
    most one seed.

    Args:
        all_parse_results: iterable of parse results, one entry per parsed item.
        source_url: set as the referral URL on every seed created.
        file_import_options: forwarded to ``FileSeed.AddParseResults``.

    Returns:
        A list of ``ClientImportFileSeeds.FileSeed`` objects, in discovery order.
    """
    seeds = []
    urls_already_used = set()

    for parse_results in all_parse_results:
        candidate_urls = HydrusData.DedupeList(
            ClientParsing.GetURLsFromParseResults(
                parse_results, (HC.URL_TYPE_DESIRED, ),
                only_get_top_priority=True))

        fresh_urls = []

        for candidate_url in candidate_urls:
            if candidate_url in urls_already_used:
                continue

            fresh_urls.append(candidate_url)

        urls_already_used.update(fresh_urls)

        # Seeds are created inside the per-parse_results loop on purpose:
        # this parse_results is only appropriate for these urls. Do not hoist
        # the seed creation out of the loop, or tags will be messed up.
        for fresh_url in fresh_urls:
            seed = ClientImportFileSeeds.FileSeed(
                ClientImportFileSeeds.FILE_SEED_TYPE_URL, fresh_url)

            seed.SetReferralURL(source_url)
            seed.AddParseResults(parse_results, file_import_options)

            seeds.append(seed)

    return seeds
def ForceLogins(self, domains_to_login):
    """Add domains to ``self._domains_to_login`` and de-duplicate the result.

    The whole update happens while holding ``self._lock``.

    Args:
        domains_to_login: iterable of domains to append to the existing list.
    """
    with self._lock:
        pending_domains = self._domains_to_login

        # Extend the existing list object in place, then rebind the attribute
        # to the de-duplicated list returned by the helper.
        pending_domains.extend(domains_to_login)

        self._domains_to_login = HydrusData.DedupeList(pending_domains)
def WorkOnURL( self, gallery_seed_log, file_seed_cache, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, max_new_urls_allowed = None, gallery_urls_seen_before = None ):
    """Download and parse this gallery seed's URL, queueing what it finds.
    
    The page for self.url is fetched through a network job and parsed. File URLs
    from the parse results are handed to file_seed_cache, and any 'next page'
    URLs not yet visited this run are added to gallery_seed_log as new gallery
    seeds. The outcome is recorded on this seed via self.SetStatus.
    
    Args:
        gallery_seed_log: receives new next-page GallerySeeds and is notified
            that this seed was updated.
        file_seed_cache: passed to ClientImporting.UpdateFileSeedCacheWithAllParseResults.
        status_hook: callable taking a short status string.
        title_hook: callable taking the parsed page title, called when one is found.
        network_job_factory: callable ( method, url, referral_url = ... ) producing
            the network job.
        network_job_presentation_context_factory: callable ( network_job ) returning
            a context manager that is held while the job runs.
        file_import_options: accepted for interface compatibility; not used here.
        max_new_urls_allowed: optional cap on new file urls. When the number added
            reaches it, no further gallery pages are queued.
        gallery_urls_seen_before: optional set of gallery urls already visited this
            run. It is mutated in place (self.url and any newly queued next-page
            urls are added). A fresh set is used when None.
    
    Returns:
        ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404 )
    
    Raises:
        HydrusExceptions.NetworkException: re-raised after the error status has
            been recorded, unless it was handled by one of the earlier, more
            specific handlers.
    """
    
    if gallery_urls_seen_before is None:
        
        gallery_urls_seen_before = set()
        
    
    gallery_urls_seen_before.add( self.url )
    
    # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
    
    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0
    result_404 = False
    
    try:
        
        ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.url )
        
        if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
            
            raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
            
        
        if not can_parse:
            
            raise HydrusExceptions.VetoException( 'Did not have a parser for this URL!' )
            
        
        ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( self.url )
        
        status_hook( 'downloading page' )
        
        # only send a referral url when it differs from the urls we are actually working on
        if self._referral_url not in ( self.url, url_to_check ):
            
            referral_url = self._referral_url
            
        else:
            
            referral_url = None
            
        
        network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
        
        network_job.OverrideBandwidth( 30 )
        
        HG.client_controller.network_engine.AddJob( network_job )
        
        with network_job_presentation_context_factory( network_job ):
            
            network_job.WaitUntilDone()
            
        
        data = network_job.GetContent()
        
        parsing_context = {}
        
        parsing_context[ 'gallery_url' ] = self.url
        parsing_context[ 'url' ] = url_to_check
        
        all_parse_results = parser.Parse( parsing_context, data )
        
        if len( all_parse_results ) == 0:
            
            raise HydrusExceptions.VetoException( 'Could not parse any data!' )
            
        
        title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
        
        if title is not None:
            
            title_hook( title )
            
        
        ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total ) = ClientImporting.UpdateFileSeedCacheWithAllParseResults( file_seed_cache, all_parse_results, self.url, max_new_urls_allowed )
        
        if max_new_urls_allowed is None:
            
            can_add_more_file_urls = True
            
        else:
            
            can_add_more_file_urls = num_urls_added < max_new_urls_allowed
            
        
        status = CC.STATUS_SUCCESSFUL_AND_NEW
        
        note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
        
        if num_urls_already_in_file_seed_cache > 0:
            
            note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
            
        
        if not can_add_more_file_urls:
            
            note += ' - hit file limit'
            
        
        # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
        can_add_more_gallery_urls = num_urls_total > 0 and can_add_more_file_urls
        
        if self._can_generate_more_pages and can_add_more_gallery_urls:
            
            flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
            
            next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
            
            if len( next_page_urls ) > 0:
                
                next_page_urls = HydrusData.DedupeList( next_page_urls )
                
                new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                
                # Compare against every parsed next-page url, not the already-filtered
                # 'new' list: intersecting with the new list is always empty, which made
                # the duplicate count permanently zero. This must happen before the
                # seen-set is updated with the new urls below.
                duplicate_next_page_urls = gallery_urls_seen_before.intersection( next_page_urls )
                
                num_new_next_page_urls = len( new_next_page_urls )
                num_dupe_next_page_urls = len( duplicate_next_page_urls )
                
                if num_new_next_page_urls > 0:
                    
                    next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                    
                    gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                    
                    # remember these so a later page pointing back at them does not loop
                    gallery_urls_seen_before.update( new_next_page_urls )
                    
                    if num_dupe_next_page_urls == 0:
                        
                        note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + ' next gallery pages found'
                        
                    else:
                        
                        note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + ' next gallery pages found, but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                        
                    
                else:
                    
                    note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' next gallery pages found, but they had already been visited this run and were not added'
                    
                
            
        
        self.SetStatus( status, note = note )
        
    except HydrusExceptions.ShutdownException:
        
        # deliberately leave the status untouched on shutdown
        pass
        
    except HydrusExceptions.VetoException as e:
        
        status = CC.STATUS_VETOED
        
        note = HydrusData.ToUnicode( e )
        
        self.SetStatus( status, note = note )
        
        if isinstance( e, HydrusExceptions.CancelledException ):
            
            status_hook( 'cancelled!' )
            
            time.sleep( 2 )
            
        
    except HydrusExceptions.ForbiddenException:
        
        status = CC.STATUS_VETOED
        note = '403'
        
        self.SetStatus( status, note = note )
        
        status_hook( '403' )
        
        time.sleep( 2 )
        
        # NOTE: a 403 is reported to the caller through the same flag as a 404
        result_404 = True
        
    except HydrusExceptions.NotFoundException:
        
        status = CC.STATUS_VETOED
        note = '404'
        
        self.SetStatus( status, note = note )
        
        status_hook( '404' )
        
        time.sleep( 2 )
        
        result_404 = True
        
    except Exception as e:
        
        status = CC.STATUS_ERROR
        
        self.SetStatus( status, exception = e )
        
        status_hook( 'error!' )
        
        time.sleep( 3 )
        
        if isinstance( e, HydrusExceptions.NetworkException ): # so the larger queue can set a delaywork or whatever
            
            raise
            
        
    
    gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
    
    return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404 )