Example #1
0
def ConvertAllParseResultsToFileSeeds(all_parse_results, source_url,
                                      file_import_options):
    """Build a FileSeed for every new desired URL found in the parse results.

    Each seed gets source_url as its referral URL and inherits the parse
    results of the specific result set that produced its URL. URLs are
    deduplicated across the whole batch; first occurrence wins.
    """

    file_seeds = []

    seen_urls = set()

    for parse_results in all_parse_results:

        desired_urls = ClientParsing.GetURLsFromParseResults(
            parse_results, (HC.URL_TYPE_DESIRED, ), only_get_top_priority=True)

        # note we do this per-result-set due to parse_results being appropriate
        # only for these urls--don't hoist the seed creation out, or tags will
        # be messed up

        for url in HydrusData.DedupeList(desired_urls):

            if url in seen_urls:

                continue

            seen_urls.add(url)

            file_seed = ClientImportFileSeeds.FileSeed(
                ClientImportFileSeeds.FILE_SEED_TYPE_URL, url)

            file_seed.SetReferralURL(source_url)

            file_seed.AddParseResults(parse_results, file_import_options)

            file_seeds.append(file_seed)

    return file_seeds
Example #2
0
    def ForceLogins(self, domains_to_login):
        """Queue the given domains for a forced login, keeping the pending
        list free of duplicates (first occurrence preserved)."""

        with self._lock:

            pending = list(self._domains_to_login)

            pending.extend(domains_to_login)

            self._domains_to_login = HydrusData.DedupeList(pending)
Example #3
0
 def WorkOnURL( self, gallery_seed_log, file_seed_cache, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, max_new_urls_allowed = None, gallery_urls_seen_before = None ):
     """
     Fetch this gallery/watchable page, parse it, push any file URLs found
     into file_seed_cache, and push any 'next page' gallery URLs into
     gallery_seed_log.
     
     gallery_urls_seen_before is the set of gallery urls visited this 'run';
     it is populated as we go so that loops of next-page links are not
     re-queued.
     
     Returns ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404 ).
     result_404 is True on a 403/404 so the caller can treat the page as dead.
     """
     
     if gallery_urls_seen_before is None:
         
         gallery_urls_seen_before = set()
         
     
     gallery_urls_seen_before.add( self.url )
     
     # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
     
     num_urls_added = 0
     num_urls_already_in_file_seed_cache = 0
     num_urls_total = 0
     result_404 = False
     
     try:
         
         ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.url )
         
         if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
             
             raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
             
         
         if not can_parse:
             
             raise HydrusExceptions.VetoException( 'Did not have a parser for this URL!' )
             
         
         ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( self.url )
         
         status_hook( 'downloading page' )
         
         # don't send a referral url that is just ourselves--that tells the server nothing useful
         if self._referral_url not in ( self.url, url_to_check ):
             
             referral_url = self._referral_url
             
         else:
             
             referral_url = None
             
         
         network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
         
         network_job.OverrideBandwidth( 30 )
         
         HG.client_controller.network_engine.AddJob( network_job )
         
         with network_job_presentation_context_factory( network_job ) as njpc:
             
             network_job.WaitUntilDone()
             
         
         data = network_job.GetContent()
         
         parsing_context = {}
         
         parsing_context[ 'gallery_url' ] = self.url
         parsing_context[ 'url' ] = url_to_check
         
         all_parse_results = parser.Parse( parsing_context, data )
         
         if len( all_parse_results ) == 0:
             
             raise HydrusExceptions.VetoException( 'Could not parse any data!' )
             
         
         title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
         
         if title is not None:
             
             title_hook( title )
             
         
         ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total ) = ClientImporting.UpdateFileSeedCacheWithAllParseResults( file_seed_cache, all_parse_results, self.url, max_new_urls_allowed )
         
         if max_new_urls_allowed is None:
             
             can_add_more_file_urls = True
             
         else:
             
             can_add_more_file_urls = num_urls_added < max_new_urls_allowed
             
         
         status = CC.STATUS_SUCCESSFUL_AND_NEW
         
         note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
         
         if num_urls_already_in_file_seed_cache > 0:
             
             note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
             
         
         if not can_add_more_file_urls:
             
             note += ' - hit file limit'
             
         
         # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
         can_add_more_gallery_urls = num_urls_total > 0 and can_add_more_file_urls
         
         if self._can_generate_more_pages and can_add_more_gallery_urls:
             
             flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
             
             next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
             
             if len( next_page_urls ) > 0:
                 
                 next_page_urls = HydrusData.DedupeList( next_page_urls )
                 
                 new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                 
                 # BUGFIX: the dupes must be computed from the full deduped list--
                 # new_next_page_urls has already had everything in
                 # gallery_urls_seen_before filtered out, so intersecting with it
                 # would always give an empty set and the 'already been visited'
                 # counts below would always be zero
                 duplicate_next_page_urls = gallery_urls_seen_before.intersection( next_page_urls )
                 
                 num_new_next_page_urls = len( new_next_page_urls )
                 num_dupe_next_page_urls = len( duplicate_next_page_urls )
                 
                 if num_new_next_page_urls > 0:
                     
                     next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                     
                     gallery_seed_log.AddGallerySeeds( next_gallery_seeds )
                     
                     gallery_urls_seen_before.update( new_next_page_urls )
                     
                     if num_dupe_next_page_urls == 0:
                         
                         note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + ' next gallery pages found'
                         
                     else:
                         
                         note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + ' next gallery pages found, but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                         
                     
                 else:
                     
                     note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' next gallery pages found, but they had already been visited this run and were not added'
                     
                 
             
         
         self.SetStatus( status, note = note )
         
     except HydrusExceptions.ShutdownException:
         
         # controller is going down--just stop quietly, no status change
         pass
         
     except HydrusExceptions.VetoException as e:
         
         status = CC.STATUS_VETOED
         
         note = HydrusData.ToUnicode( e )
         
         self.SetStatus( status, note = note )
         
         if isinstance( e, HydrusExceptions.CancelledException ):
             
             status_hook( 'cancelled!' )
             
             time.sleep( 2 )
             
         
     except HydrusExceptions.ForbiddenException:
         
         status = CC.STATUS_VETOED
         note = '403'
         
         self.SetStatus( status, note = note )
         
         status_hook( '403' )
         
         time.sleep( 2 )
         
         # treat 403 like 404 so the caller considers the page dead
         result_404 = True
         
     except HydrusExceptions.NotFoundException:
         
         status = CC.STATUS_VETOED
         note = '404'
         
         self.SetStatus( status, note = note )
         
         status_hook( '404' )
         
         time.sleep( 2 )
         
         result_404 = True
         
     except Exception as e:
         
         status = CC.STATUS_ERROR
         
         self.SetStatus( status, exception = e )
         
         status_hook( 'error!' )
         
         time.sleep( 3 )
         
         if isinstance( e, HydrusExceptions.NetworkException ): # so the larger queue can set a delaywork or whatever
             
             raise
             
         
     
     gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
     
     return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404 )