def __init__( self, name, url_type=None, preferred_scheme='https', netloc='hostname.com', allow_subdomains=False, keep_subdomains=False, path_components=None, parameters=None, example_url='https://hostname.com/post/page.php?id=123456&s=view'):

    if url_type is None:
        url_type = HC.URL_TYPE_POST

    if path_components is None:

        path_components = HydrusSerialisable.SerialisableList()

        path_components.append(ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='post', example_string='post'))
        path_components.append(ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='page.php', example_string='page.php'))

    if parameters is None:

        parameters = HydrusSerialisable.SerialisableDictionary()

        parameters['s'] = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='view', example_string='view')
        parameters['id'] = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FLEXIBLE, match_value=ClientParsing.NUMERIC, example_string='123456')

    # if the args are not serialisable stuff, lets overwrite here

    path_components = HydrusSerialisable.SerialisableList(path_components)
    parameters = HydrusSerialisable.SerialisableDictionary(parameters)

    HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

    self._url_type = url_type
    self._preferred_scheme = preferred_scheme
    self._netloc = netloc
    self._allow_subdomains = allow_subdomains
    self._keep_subdomains = keep_subdomains
    self._path_components = path_components
    self._parameters = parameters
    self._example_url = example_url
def __init__( self, name, preferred_scheme='https', netloc='hostname.com', subdomain_is_important=False, path_components=None, parameters=None, example_url='https://hostname.com/post/page.php?id=123456&s=view'):

    if path_components is None:

        path_components = HydrusSerialisable.SerialisableList()

        path_components.append(ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='post', example_string='post'))
        path_components.append(ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='page.php', example_string='page.php'))

    if parameters is None:

        parameters = HydrusSerialisable.SerialisableDictionary()

        parameters['s'] = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='view', example_string='view')
        parameters['id'] = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FLEXIBLE, match_value=ClientParsing.NUMERIC, example_string='123456')

    # an edit dialog panel for this that has example url and testing of current values
    # a parent panel or something that lists all current urls in the db that match and how they will be clipped, is this ok? kind of thing.

    HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

    self._preferred_scheme = preferred_scheme
    self._netloc = netloc
    self._subdomain_is_important = subdomain_is_important
    self._path_components = path_components
    self._parameters = parameters
    self._example_url = example_url
def _ParseGalleryPage(self, html, url_base):

    definitely_no_more_pages = False

    urls = []

    soup = ClientParsing.GetSoup(html)

    thumbnail_links = soup.find_all(class_='work')

    thumbnail_urls = [thumbnail_link['href'] for thumbnail_link in thumbnail_links]

    for thumbnail_url in thumbnail_urls:

        url = urlparse.urljoin(url_base, thumbnail_url)  # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690

        urls.append(url)

    urls_and_tags = [(url, set()) for url in urls]

    return (urls_and_tags, definitely_no_more_pages)
def ConvertAllParseResultsToFileSeeds(all_parse_results, source_url, file_import_options):

    file_seeds = []

    seen_urls = set()

    for parse_results in all_parse_results:

        parsed_urls = ClientParsing.GetURLsFromParseResults(parse_results, (HC.URL_TYPE_DESIRED,), only_get_top_priority=True)

        parsed_urls = HydrusData.DedupeList(parsed_urls)

        parsed_urls = [url for url in parsed_urls if url not in seen_urls]

        seen_urls.update(parsed_urls)

        # note we do this inside the loop, since each parse_results is only appropriate for its own urls--don't move this out again, or tags will be messed up

        for url in parsed_urls:

            file_seed = ClientImportFileSeeds.FileSeed(ClientImportFileSeeds.FILE_SEED_TYPE_URL, url)

            file_seed.SetReferralURL(source_url)

            file_seed.AddParseResults(parse_results, file_import_options)

            file_seeds.append(file_seed)

    return file_seeds
def _ParseImagePage( self, html, url_base ):

    soup = ClientParsing.GetSoup( html )

    tags = set()

    author_links = soup.find( 'ul', class_ = 'authorlinks' )

    if author_links is not None:

        authors = set()

        links = author_links.find_all( 'a' )

        for link in links:

            try:

                href = link[ 'href' ] # http://warlord-of-noodles.newgrounds.com

                creator = href.replace( 'http://', '' ).replace( '.newgrounds.com', '' )

                tags.add( u'creator:' + creator )

            except: pass

    try:

        title = soup.find( 'title' )

        tags.add( u'title:' + title.string )

    except: pass

    all_links = soup.find_all( 'a' )

    for link in all_links:

        try:

            href = link[ 'href' ]

            if '/browse/tag/' in href:
                tags.add( link.string )

        except: pass

    #

    flash_url = html.split( '"http:\/\/uploads.ungrounded.net\/', 1 )[1]

    flash_url = flash_url.split( '"', 1 )[0]

    flash_url = flash_url.replace( "\/", '/' )

    flash_url = 'http://uploads.ungrounded.net/' + flash_url

    return ( flash_url, tags )
def _UpdateSerialisableInfo(self, version, old_serialisable_info):

    if version == 1:

        (url_type, preferred_scheme, netloc, allow_subdomains, keep_subdomains, serialisable_path_components, serialisable_parameters, example_url) = old_serialisable_info

        url_match_key = HydrusData.GenerateKey()

        serialisable_url_match_key = url_match_key.encode('hex')

        api_lookup_converter = ClientParsing.StringConverter(example_string=example_url)

        serialisable_api_lookup_converter = api_lookup_converter.GetSerialisableTuple()

        new_serialisable_info = (serialisable_url_match_key, url_type, preferred_scheme, netloc, allow_subdomains, keep_subdomains, serialisable_path_components, serialisable_parameters, serialisable_api_lookup_converter, example_url)

        return (2, new_serialisable_info)
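# Editor's side note, not part of the original update code: str.encode('hex') above is a
# python 2-only idiom. A ported equivalent would be bytes.hex() or binascii.hexlify; a minimal
# sketch (key_to_serialisable_hex is a hypothetical helper, assuming the key is a bytes value
# as produced by HydrusData.GenerateKey()):
import binascii

def key_to_serialisable_hex(url_match_key):

    # hexlify works on both python 2 and 3; decode to get a text string either way
    return binascii.hexlify(url_match_key).decode('ascii')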
def LoginTumblrGDPR( self ):

    # t-thanks, EU
    # this is cribbed from poking around here https://github.com/johanneszab/TumblThree/commit/3563d6cebf1a467151d6b8d6eee9806ddd6e6364

    network_job = ClientNetworkingJobs.NetworkJob( 'GET', 'http://www.tumblr.com/' )

    network_job.SetForLogin( True )

    self.engine.AddJob( network_job )

    network_job.WaitUntilDone()

    html = network_job.GetContent()

    formula = ClientParsing.ParseFormulaHTML( tag_rules = [ ClientParsing.ParseRuleHTML( rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING, tag_name = 'meta', tag_attributes = { 'id' : 'tumblr_form_key' } ) ], content_to_fetch = ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch = "content" )

    results = formula.Parse( {}, html )

    if len( results ) != 1:
        raise HydrusExceptions.ParseException( 'Could not figure out the tumblr form key for the GDPR click-through.' )

    tumblr_form_key = results[0]

    #

    body = '{\"eu_resident\":true,\"gdpr_is_acceptable_age\":true,\"gdpr_consent_core\":true,\"gdpr_consent_first_party_ads\":true,\"gdpr_consent_third_party_ads\":true,\"gdpr_consent_search_history\":true,\"redirect_to\":\"\"}'

    referral_url = 'https://www.tumblr.com/privacy/consent?redirect='

    network_job = ClientNetworkingJobs.NetworkJob( 'POST', 'https://www.tumblr.com/svc/privacy/consent', body = body, referral_url = referral_url )

    network_job.SetForLogin( True )

    network_job.AddAdditionalHeader( 'Accept', 'application/json, text/javascript, */*; q=0.01' )
    network_job.AddAdditionalHeader( 'Content-Type', 'application/json' )
    network_job.AddAdditionalHeader( 'X-Requested-With', 'XMLHttpRequest' )
    network_job.AddAdditionalHeader( 'X-tumblr-form-key', tumblr_form_key )

    self.engine.AddJob( network_job )

    network_job.WaitUntilDone()

    # test cookies here or something

    HydrusData.ShowText( 'Looks like tumblr GDPR click-through worked! You should be good for a year, at which point we should have an automatic solution for this!' )
def THREADFetchTags(self, script, job_key, file_identifier, desired_content):

    content_results = script.DoQuery(job_key, file_identifier, desired_content)

    tags = ClientParsing.GetTagsFromContentResults(content_results)

    wx.CallAfter(self._SetTags, tags)
def _ParseGalleryPage( self, html, url_base ):

    definitely_no_more_pages = False

    urls_set = set()

    soup = ClientParsing.GetSoup( html )

    def correct_url( href ):

        if href is None:
            return False

        # a good url is in the form "/pictures/user/artist_name/file_id/title"

        if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):

            ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )

            # /pictures/user/artist_name/page/3
            if file_id != 'page':
                return True

        return False

    urls = []

    links = soup.find_all( 'a', href = correct_url )

    for link in links:

        url = 'http://www.hentai-foundry.com' + link['href']

        if url not in urls_set:

            urls_set.add( url )
            urls.append( url )

    # this is copied from old code. surely we can improve it?
    if 'class="next"' not in html:
        definitely_no_more_pages = True

    urls_and_tags = [ ( url, set() ) for url in urls ]

    return ( urls_and_tags, definitely_no_more_pages )
def _ParseImagePage(self, html, page_url):

    if 'member_illust.php?mode=manga' in html:

        manga_url = page_url.replace('medium', 'manga')

        raise HydrusExceptions.MimeException(page_url + ' was manga, not a single image, so could not be downloaded.')

    if 'member_illust.php?mode=ugoira_view' in html:

        raise HydrusExceptions.MimeException(page_url + ' was ugoira, not a single image, so could not be downloaded.')

    soup = ClientParsing.GetSoup(html)

    #

    original_image = soup.find(class_='original-image')

    image_url = original_image['data-src']  # http://i3.pixiv.net/img-original/img/2014/01/25/19/21/56/41171994_p0.jpg

    #

    tags_parent = soup.find('section', class_='work-tags')

    # <a href="/search.php?s_mode=s_tag_full&word=%E3%83%8F%E3%83%B3%E3%83%89%E3%83%A1%E3%82%A4%E3%83%89" class="text">[unicode tag here]</a>
    tags = [link.string for link in tags_parent.find_all('a', class_='text')]

    user = soup.find('h1', class_='user')

    if user is not None:
        tags.append('creator:' + user.string)

    title_parent = soup.find('section', class_=re.compile('work-info'))

    if title_parent is not None:

        title = title_parent.find('h1', class_='title')

        if title is not None:
            tags.append('title:' + title.string)

    return (image_url, tags)
def THREADFetchTags( self, script, job_key, file_identifier ):

    def wx_code( tags ):

        if not self:
            return

        self._SetTags( tags )

    parse_results = script.DoQuery( job_key, file_identifier )

    tags = ClientParsing.GetTagsFromParseResults( parse_results )

    wx.CallAfter( wx_code, tags )
def UpdateFileSeedCacheWithAllParseResults(file_seed_cache, all_parse_results, source_url, max_new_urls_allowed=None):

    new_file_seeds = []

    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0

    for parse_results in all_parse_results:

        parsed_urls = ClientParsing.GetURLsFromParseResults(parse_results, (HC.URL_TYPE_DESIRED,), only_get_top_priority=True)

        for url in parsed_urls:

            num_urls_total += 1

            if max_new_urls_allowed is not None and num_urls_added == max_new_urls_allowed:
                continue

            file_seed = ClientImportFileSeeds.FileSeed(ClientImportFileSeeds.FILE_SEED_TYPE_URL, url)

            file_seed.SetReferralURL(source_url)

            if file_seed_cache.HasFileSeed(file_seed):

                num_urls_already_in_file_seed_cache += 1

            else:

                num_urls_added += 1

                file_seed.AddParseResults(parse_results)

                new_file_seeds.append(file_seed)

    file_seed_cache.AddFileSeeds(new_file_seeds)

    return (num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total)
def LoginPixiv(self, network_context, pixiv_id, password):

    session = self.engine.session_manager.GetSession(network_context)

    response = session.get('https://accounts.pixiv.net/login')

    soup = ClientParsing.GetSoup(response.content)

    # some whocking 20kb bit of json tucked inside a hidden form input wew lad
    i = soup.find('input', id='init-config')

    raw_json = i['value']

    j = json.loads(raw_json)

    if 'pixivAccount.postKey' not in j:
        raise HydrusExceptions.ForbiddenException('When trying to log into Pixiv, I could not find the POST key! This is a problem with hydrus\'s pixiv parsing, not your login! Please contact hydrus dev!')

    post_key = j['pixivAccount.postKey']

    form_fields = {}

    form_fields['pixiv_id'] = pixiv_id
    form_fields['password'] = password
    form_fields['captcha'] = ''
    form_fields['g_recaptcha_response'] = ''
    form_fields['return_to'] = 'https://www.pixiv.net'
    form_fields['lang'] = 'en'
    form_fields['post_key'] = post_key
    form_fields['source'] = 'pc'

    headers = {}

    headers['referer'] = "https://accounts.pixiv.net/login?lang=en&source=pc&view_type=page&ref=wwwtop_accounts_index"
    headers['origin'] = "https://accounts.pixiv.net"

    session.post('https://accounts.pixiv.net/api/login?lang=en', data=form_fields, headers=headers)

    time.sleep(1)
def Parse4chanPostScreen(html):

    soup = ClientParsing.GetSoup(html)

    title_tag = soup.find('title')

    if title_tag.string == 'Post successful!':

        return ('success', None)

    elif title_tag.string == '4chan - Banned':

        HydrusData.Print(soup)

        text = 'You are banned from this board! html written to log.'

        HydrusData.ShowText(text)

        return ('big error', text)

    else:

        try:

            problem_tag = soup.find(id='errmsg')

            if problem_tag is None:

                HydrusData.Print(soup)

                text = 'Unknown problem; html written to log.'

                HydrusData.ShowText(text)

                return ('error', text)

            problem = HydrusData.ToUnicode(problem_tag)

            if 'CAPTCHA' in problem:
                return ('captcha', None)
            elif 'seconds' in problem:
                return ('too quick', None)
            elif 'Duplicate' in problem:
                return ('error', 'duplicate file detected')
            else:
                return ('error', problem)

        except:

            return ('error', 'unknown error')
def _ParseGalleryPage( self, html, url_base ):

    soup = ClientParsing.GetSoup( html )

    fatcol = soup.find( 'div', class_ = 'fatcol' )

    if fatcol is not None:
        links = fatcol.find_all( 'a' )
    else:
        links = []

    urls_set = set()

    urls = []

    for link in links:

        try:

            url = link[ 'href' ]

            if url not in urls_set:

                if url.startswith( 'http://www.newgrounds.com/portal/view/' ):

                    urls_set.add( url )
                    urls.append( url )

        except: pass

    definitely_no_more_pages = True

    urls_and_tags = [ ( url, set() ) for url in urls ]

    return ( urls_and_tags, definitely_no_more_pages )
def _ParseGalleryPage(self, html, url_base):

    definitely_no_more_pages = False

    urls_and_tags = []

    soup = ClientParsing.GetSoup(html)

    thumbs_container = soup.find('div', class_='torpedo-container')

    thumbs = thumbs_container.find_all('span', class_='thumb')

    for thumb in thumbs:

        url = thumb['href']  # something in the form of blah.da.com/art/blah-123456

        tags = []

        urls_and_tags.append((url, tags))

    return (urls_and_tags, definitely_no_more_pages)
def _ParseImagePage(self, html, url_base):

    # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
    # find http://pictures.hentai-foundry.com//
    # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144/image.jpg
    # the .jpg bit is what we really need, but whatever
    # an example of this:
    # http://www.hentai-foundry.com/pictures/user/Sparrow/440257/Meroulix-LeBeau

    # addendum:
    # some users put pictures.hentai-foundry.com links in their profile images, which then gets repeated up above in some <meta> tag
    # so, let's limit this search to a smaller bit of html
    # example of this:
    # http://www.hentai-foundry.com/pictures/user/teku/572881/Special-Gang-Bang

    try:

        image_soup = ClientParsing.GetSoup(html)

        image_html = unicode(image_soup.find('section', id='picBox'))

        index = image_html.index('pictures.hentai-foundry.com')

        image_url = image_html[index:index + 256]

        if '"' in image_url:
            (image_url, gumpf) = image_url.split('"', 1)

        if '&#039;' in image_url:
            (image_url, gumpf) = image_url.split('&#039;', 1)

        image_url = 'http://' + image_url

    except Exception as e:

        raise Exception('Could not parse image url!' + os.linesep + HydrusData.ToUnicode(e))

    soup = ClientParsing.GetSoup(html)

    tags = []

    try:

        title = soup.find('title')

        (data, nothing) = title.string.split(' - Hentai Foundry')

        data_reversed = data[::-1]  # want to do it right-side first, because title might have ' by ' in it

        (artist_reversed, title_reversed) = data_reversed.split(' yb ')

        artist = artist_reversed[::-1]

        title = title_reversed[::-1]

        tags.append('creator:' + artist)
        tags.append('title:' + title)

    except:
        pass

    return (image_url, tags)
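# Editor's side note, not part of the original parser: the reverse-and-split dance above exists
# to split the '<title> by <artist>' string on its *last* ' by ', since the title itself may
# contain ' by '. str.rsplit with maxsplit=1 achieves the same thing directly; a minimal sketch,
# with split_title_and_artist being a hypothetical helper name:
def split_title_and_artist(data):

    # 'Some Work by Someone by SomeArtist' -> ('Some Work by Someone', 'SomeArtist')
    (title, artist) = data.rsplit(' by ', 1)

    return (title, artist)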
def _ParseImagePage(self, html, referral_url):

    img_url = None

    soup = ClientParsing.GetSoup(html)

    download_button = soup.find('a', class_='dev-page-download')

    if download_button is None:

        # this method maxes out at 1024 width
        img = soup.find(class_='dev-content-full')

        if img is None:

            # nsfw

            # used to fetch this from a tumblr share url, now we grab from some hidden gubbins behind an age gate

            a_ismatures = soup.find_all('a', class_='ismature')

            imgs = []

            for a_ismature in a_ismatures:
                imgs.extend(a_ismature.find_all('img'))

            for img in imgs:

                # <img width="150" height="75" alt="Jelly gals by ArtInCase" src="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg" data-src="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg" srcset="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 150w,http://t00.deviantart.net/ELwFngzSW07znskrO2jToktP2Og=/fit-in/700x350/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 698w,http://t04.deviantart.net/53Saq2w0esrTTjZIfHap4ItNNkQ=/fit-in/800x400/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 1262w" sizes="150px">

                if img.has_attr('srcset'):

                    # http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 150w,http://t00.deviantart.net/ELwFngzSW07znskrO2jToktP2Og=/fit-in/700x350/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 698w,http://t04.deviantart.net/53Saq2w0esrTTjZIfHap4ItNNkQ=/fit-in/800x400/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 1262w
                    # the last url here is what we want

                    srcset = img['srcset']

                    # 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg
                    gubbins_and_url = srcset.split(' ')[-2]

                    img_url = gubbins_and_url.split(',')[1]

                    break

        else:

            img_url = img['src']

    else:

        # something like http://www.deviantart.com/download/518046750/varda_and_the_sacred_trees_of_valinor_by_implosinoatic-d8kfjfi.jpg?token=476cb73aa2ab22bb8554542bc9f14982e09bd534&ts=1445717843
        # given the right cookies, it redirects to the truly fullsize image_url
        # otherwise, it seems to redirect to a small interstitial redirect page that heads back to the original image page

        img_url = download_button['href']

    if img_url is None:
        raise HydrusExceptions.ParseException('Could not find a download link--maybe this work was text?')

    return img_url
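# Editor's side note, not part of the original parser: the srcset handling above assumes the
# last candidate in the srcset attribute is the largest rendition and digs it out with two
# positional splits. A slightly more direct sketch of the same idea (last_srcset_url is a
# hypothetical helper, assuming the comma-separated 'url width' candidate format shown above):
def last_srcset_url(srcset):

    # 'url_a 150w,url_b 698w,url_c 1262w' -> 'url_c'
    last_candidate = srcset.split(',')[-1].strip()

    return last_candidate.split(' ')[0]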
def _ParseImagePage(self, html, url_base):

    (search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces) = self._booru.GetData()

    soup = ClientParsing.GetSoup(html)

    image_url = None

    try:

        if image_id is not None:

            image = soup.find(id=image_id)

            if image is None:

                image_string = soup.find(text=re.compile('Save this file'))

                if image_string is None:
                    image_string = soup.find(text=re.compile('Save this video'))

                if image_string is None:

                    # catchall for rule34hentai.net's webms
                    if image_url is None:

                        a_tags = soup.find_all('a')

                        for a_tag in a_tags:

                            href = a_tag['href']

                            if href is not None:

                                if href.endswith('.webm'):

                                    image_url = href

                                    break

                    # catchall for rule34hentai.net's mp4s, which are loaded in a mickey-mouse flv player
                    if image_url is None:

                        magic_phrase = 'document.write("<source src=\''

                        if magic_phrase in html:

                            # /image/252605' type='video/mp4...
                            image_url_and_gumpf = html.split(magic_phrase, 1)[1]

                            image_url = image_url_and_gumpf.split('\'', 1)[0]

                else:

                    image = image_string.parent

                    image_url = image['href']

            else:

                if image.name in ('img', 'video'):

                    image_url = image['src']

                    if 'Running Danbooru' in html:

                        # possible danbooru resized image
                        possible_better_image = soup.find(id='image-resize-link')

                        if possible_better_image is not None:
                            image_url = possible_better_image['href']

                elif image.name == 'a':

                    image_url = image['href']

        if image_data is not None:

            links = soup.find_all('a')

            ok_link = None
            better_link = None

            for link in links:

                if link.string is not None:

                    if link.string.startswith(image_data) or link.string.endswith(image_data):
                        ok_link = link['href']

                    if link.string.startswith('Download PNG'):

                        better_link = link['href']

                        break

            if better_link is not None:
                image_url = better_link
            else:
                image_url = ok_link

    except Exception as e:

        raise HydrusExceptions.DataMissing('Could not parse a download link for ' + url_base + '!' + os.linesep + HydrusData.ToUnicode(e))

    if image_url is None:
        raise HydrusExceptions.DataMissing('Could not parse a download link for ' + url_base + '!')

    image_url = urlparse.urljoin(url_base, image_url)

    if 'gelbooru.com' in url_base:

        # giving 404 on some content servers for http, no redirect for some reason
        image_url = ClientNetworkingDomain.ConvertHTTPToHTTPS(image_url)

    tags = []

    for (tag_classname, namespace) in tag_classnames_to_namespaces.items():

        tag_list_entries = soup.find_all(class_=tag_classname)

        for tag_list_entry in tag_list_entries:

            links = tag_list_entry.find_all('a')

            if tag_list_entry.name == 'a':
                links.append(tag_list_entry)

            for link in links:

                if link.string is None:
                    continue

                try:

                    tag_string = HydrusData.ToUnicode(link.string)
                    tag_string = HydrusTags.CleanTag(tag_string)

                    if tag_string in ('?', '-', '+', u'\xe2\x80\x93', u'\u2013'):  # last two are a couple of amusing encodings of en-dash '-' from danbooru
                        continue

                    tag = HydrusTags.CombineTag(namespace, tag_string)

                    tags.append(tag)

                except Exception as e:

                    HydrusData.Print('Could not parse tag "' + repr(link.string) + '":')

                    HydrusData.PrintException(e)

    return (image_url, tags)
def _ParseGalleryPage(self, html, url_base):

    definitely_no_more_pages = False

    urls_set = set()
    urls = []

    soup = ClientParsing.GetSoup(html)

    # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
    def starts_with_classname(classname):
        return classname is not None and classname.startswith(self._thumb_classname)

    thumbnails = soup.find_all(class_=starts_with_classname)

    # this is a sankaku thing
    popular_thumbnail_parent = soup.find(id='popular-preview')

    if popular_thumbnail_parent is not None:

        popular_thumbnails = popular_thumbnail_parent.find_all(class_=starts_with_classname)

        thumbnails = thumbnails[len(popular_thumbnails):]

    for thumbnail in thumbnails:

        links = thumbnail.find_all('a')

        if thumbnail.name == 'a':
            links.append(thumbnail)

        for link in links:

            if link.string is not None and link.string == 'Image Only':
                continue  # rule 34 @ paheal fix

            url = link['href']

            url = urlparse.urljoin(url_base, url)

            if url not in urls_set:

                urls_set.add(url)
                urls.append(url)

    if len(urls) == 0:
        definitely_no_more_pages = True

    if self._booru_name not in gallery_advance_nums:

        if len(urls) > 0:
            gallery_advance_nums[self._booru_name] = len(urls)

    if 'gelbooru.com' in url_base:

        # they now use redirect urls for thumbs, wew lad

        bad_urls = urls

        urls = []

        session = requests.Session()

        for bad_url in bad_urls:

            # the garbage after the redirect.php is the redirect in base64
            # https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY5NDEyMg==

            if 'redirect.php' in bad_url:

                try:

                    encoded_location = bad_url.split('?s=')[1]

                    location = encoded_location.decode('base64')

                    url = urlparse.urljoin(bad_url, location)

                    urls.append(url)

                except Exception as e:

                    HydrusData.ShowText('gelbooru parsing problem!')
                    HydrusData.ShowException(e)

                    time.sleep(2)

                    break

            else:

                urls.append(bad_url)

        # giving 404 on some content servers for http, no redirect for some reason
        urls = [ClientNetworkingDomain.ConvertHTTPToHTTPS(url) for url in urls]

    urls_and_tags = [(url, set()) for url in urls]

    return (urls_and_tags, definitely_no_more_pages)
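# Editor's side note, not part of the original parser: a self-contained sketch of the gelbooru
# redirect decoding used above, with the base64 module instead of the python 2-only
# str.decode('base64'). decode_gelbooru_redirect is a hypothetical helper; the example url is
# the one quoted in the comment above.
import base64
import urlparse

def decode_gelbooru_redirect(redirect_url):

    # everything after '?s=' is the destination url, base64-encoded
    encoded_location = redirect_url.split('?s=')[1]

    # typically decodes to a protocol-relative url like '//gelbooru.com/index.php?page=post&s=view&id=3694122'
    location = base64.b64decode(encoded_location)

    # urljoin fills the scheme back in from the redirect url
    return urlparse.urljoin(redirect_url, location)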
def _ParseGalleryPage(self, data, url_base):

    def ConvertRegularToRawURL(regular_url):

        # convert this:
        # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
        # to this:
        # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
        # the 500 part can be a bunch of stuff, including letters

        url_components = regular_url.split('_')

        last_component = url_components[-1]

        (number_gubbins, file_ext) = last_component.split('.')

        raw_last_component = 'raw.' + file_ext

        url_components[-1] = raw_last_component

        raw_url = '_'.join(url_components)

        return raw_url

    def Remove68Subdomain(long_url):

        # sometimes the 68 subdomain gives a 404 on the raw url, so:
        # convert this:
        # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
        # to this:
        # http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg

        # I am not sure if it is always 68, but let's not assume
        # Indeed, this is apparently now 78, wew!

        (scheme, rest) = long_url.split('://', 1)

        if rest.startswith('media.tumblr.com'):
            return long_url

        (gumpf, shorter_rest) = rest.split('.', 1)

        shorter_url = scheme + '://' + shorter_rest

        return shorter_url

    def MediaToDataSubdomain(url):

        return url.replace('media', 'data', 1)

    definitely_no_more_pages = False

    if data.startswith('<!DOCTYPE html>'):

        message = 'The tumblr downloader received an unexpected HTML page when it tried to download JSON post information. It is likely that you are an EU/EEA user and have been hit by a GDPR click-through issue.'
        message += os.linesep * 2
        message += 'In order to get the hydrus client to \'click ok\' on that page, please hit _network->logins->DEBUG: misc->do tumblr GDPR click-through_ and try this gallery search again.'
        message += os.linesep * 2
        message += 'If you still have problems, please let hydrus dev know.'

        HydrusData.ShowText(message)

        raise Exception('Tumblr GDPR issue.')

    processed_raw_json = data.split('var tumblr_api_read = ')[1][:-2]  # -1 takes a js ';' off the end

    json_object = json.loads(processed_raw_json)

    urls_and_tags = []

    if 'posts' in json_object:

        for post in json_object['posts']:

            # 2012-06-20 15:59:00 GMT
            date = post['date-gmt']

            date_struct = time.strptime(date, '%Y-%m-%d %H:%M:%S %Z')

            raw_url_available = date_struct.tm_year > 2012

            if 'tags' in post:
                tags = post['tags']
            else:
                tags = []

            post_type = post['type']

            if post_type == 'photo':

                if len(post['photos']) == 0:
                    photos = [post]
                else:
                    photos = post['photos']

                for photo in photos:

                    try:

                        url = photo['photo-url-1280']

                        # some urls are given in the form:
                        # https://68.media.tumblr.com/tumblr_m5yb5m2O6A1rso2eyo1_540.jpg
                        # which is missing the hex key in the middle
                        # these urls are unavailable as raws from the main media server
                        # these seem to all be the pre-2013 files, but we'll double-check just in case anyway

                        unusual_hexless_url = url.count('/') == 3

                        if not unusual_hexless_url:

                            if raw_url_available:

                                url = ConvertRegularToRawURL(url)

                                url = Remove68Subdomain(url)

                                url = MediaToDataSubdomain(url)

                        url = ClientNetworkingDomain.ConvertHTTPToHTTPS(url)

                        urls_and_tags.append((url, tags))

                    except:
                        pass

            elif post_type == 'video':

                if 'video-player' in post:

                    video_player_html = post['video-player']

                    try:

                        vp_soup = ClientParsing.GetSoup(video_player_html)

                        vp_source = vp_soup.find('source')

                        url = vp_source['src']

                        urls_and_tags.append((url, tags))

                    except:
                        pass

    return (urls_and_tags, definitely_no_more_pages)
def ConvertBooruToNewObjects(booru):

    name = booru.GetName()

    name = 'zzz - auto-generated from legacy booru system - ' + name

    (search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces) = booru.GetData()

    if advance_by_page_num:
        search_url = search_url.replace('%index%', '1')
    else:
        search_url = search_url.replace('%index%', '0')

    gug = ClientNetworkingDomain.GalleryURLGenerator(name + ' search', url_template=search_url, replacement_phrase='%tags%', search_terms_separator=search_separator, initial_search_text='tag search', example_search_text='blonde_hair blue_eyes')

    #

    tag_rules = []

    rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
    tag_name = None
    tag_attributes = {'class': thumb_classname}
    tag_index = None

    tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

    rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
    tag_name = 'a'
    tag_attributes = None
    tag_index = None

    tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

    formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch='href')

    url_type = HC.URL_TYPE_DESIRED
    priority = 50

    additional_info = (url_type, priority)

    thumb_content_parser = ClientParsing.ContentParser(name='get post urls (based on old booru thumb search)', content_type=HC.CONTENT_TYPE_URLS, formula=formula, additional_info=additional_info)

    gallery_parser = ClientParsing.PageParser(name + ' gallery page parser', content_parsers=[thumb_content_parser], example_urls=[gug.GetExampleURL()])

    #

    content_parsers = []

    if image_id is not None:

        tag_rules = []

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = 'a'
        tag_attributes = {'id': image_id}
        tag_index = None

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

        formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch='href')

        url_type = HC.URL_TYPE_DESIRED
        priority = 75

        additional_info = (url_type, priority)

        image_link_content_parser = ClientParsing.ContentParser(name='get image file link url (based on old booru parser)', content_type=HC.CONTENT_TYPE_URLS, formula=formula, additional_info=additional_info)

        content_parsers.append(image_link_content_parser)

        #

        tag_rules = []

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = 'img'
        tag_attributes = {'id': image_id}
        tag_index = None

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

        formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch='src')

        url_type = HC.URL_TYPE_DESIRED
        priority = 50

        additional_info = (url_type, priority)

        image_src_content_parser = ClientParsing.ContentParser(name='get image file src url (based on old booru parser)', content_type=HC.CONTENT_TYPE_URLS, formula=formula, additional_info=additional_info)

        content_parsers.append(image_src_content_parser)

    elif image_data is not None:

        tag_rules = []

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = 'a'
        tag_attributes = None
        tag_index = None

        string_match = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value=image_data, example_string=image_data)

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index, should_test_tag_string=True, tag_string_string_match=string_match))

        formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch='href')

        url_type = HC.URL_TYPE_DESIRED
        priority = 50

        additional_info = (url_type, priority)

        image_link_content_parser = ClientParsing.ContentParser(name='get image file url (based on old booru parser)', content_type=HC.CONTENT_TYPE_URLS, formula=formula, additional_info=additional_info)

        content_parsers.append(image_link_content_parser)

    for (classname, namespace) in tag_classnames_to_namespaces.items():

        tag_rules = []

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = None
        tag_attributes = {'class': classname}
        tag_index = None

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = 'a'
        tag_attributes = None
        tag_index = None

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

        formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_STRING)

        additional_info = namespace

        tag_content_parser = ClientParsing.ContentParser(name='get "' + namespace + '" tags', content_type=HC.CONTENT_TYPE_MAPPINGS, formula=formula, additional_info=additional_info)

        content_parsers.append(tag_content_parser)

    post_parser = ClientParsing.PageParser(name + ' post page parser', content_parsers=content_parsers, example_urls=[])

    #

    return (gug, gallery_parser, post_parser)
def WorkOnURL( self, gallery_seed_log, file_seed_cache, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, max_new_urls_allowed = None, gallery_urls_seen_before = None ):

    if gallery_urls_seen_before is None:
        gallery_urls_seen_before = set()

    gallery_urls_seen_before.add( self.url )

    # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
    # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop

    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0
    result_404 = False

    try:

        ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.url )

        if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
            raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )

        if not can_parse:
            raise HydrusExceptions.VetoException( 'Did not have a parser for this URL!' )

        ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( self.url )

        status_hook( 'downloading page' )

        if self._referral_url not in ( self.url, url_to_check ):
            referral_url = self._referral_url
        else:
            referral_url = None

        network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )

        network_job.OverrideBandwidth( 30 )

        HG.client_controller.network_engine.AddJob( network_job )

        with network_job_presentation_context_factory( network_job ) as njpc:
            network_job.WaitUntilDone()

        data = network_job.GetContent()

        parsing_context = {}

        parsing_context[ 'gallery_url' ] = self.url
        parsing_context[ 'url' ] = url_to_check

        all_parse_results = parser.Parse( parsing_context, data )

        if len( all_parse_results ) == 0:
            raise HydrusExceptions.VetoException( 'Could not parse any data!' )

        title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )

        if title is not None:
            title_hook( title )

        ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total ) = ClientImporting.UpdateFileSeedCacheWithAllParseResults( file_seed_cache, all_parse_results, self.url, max_new_urls_allowed )

        if max_new_urls_allowed is None:
            can_add_more_file_urls = True
        else:
            can_add_more_file_urls = num_urls_added < max_new_urls_allowed

        status = CC.STATUS_SUCCESSFUL_AND_NEW

        note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'

        if num_urls_already_in_file_seed_cache > 0:
            note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'

        if not can_add_more_file_urls:
            note += ' - hit file limit'

        # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
        can_add_more_gallery_urls = num_urls_total > 0 and can_add_more_file_urls

        if self._can_generate_more_pages and can_add_more_gallery_urls:

            flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )

            next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )

            if len( next_page_urls ) > 0:

                next_page_urls = HydrusData.DedupeList( next_page_urls )

                new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]

                duplicate_next_page_urls = gallery_urls_seen_before.intersection( new_next_page_urls )

                num_new_next_page_urls = len( new_next_page_urls )
                num_dupe_next_page_urls = len( duplicate_next_page_urls )

                if num_new_next_page_urls > 0:

                    next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]

                    gallery_seed_log.AddGallerySeeds( next_gallery_seeds )

                    gallery_urls_seen_before.update( new_next_page_urls )

                    if num_dupe_next_page_urls == 0:
                        note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + ' next gallery pages found'
                    else:
                        note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + ' next gallery pages found, but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'

                else:

                    note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' next gallery pages found, but they had already been visited this run and were not added'

        self.SetStatus( status, note = note )

    except HydrusExceptions.ShutdownException:

        pass

    except HydrusExceptions.VetoException as e:

        status = CC.STATUS_VETOED

        note = HydrusData.ToUnicode( e )

        self.SetStatus( status, note = note )

        if isinstance( e, HydrusExceptions.CancelledException ):

            status_hook( 'cancelled!' )

            time.sleep( 2 )

    except HydrusExceptions.ForbiddenException:

        status = CC.STATUS_VETOED
        note = '403'

        self.SetStatus( status, note = note )

        status_hook( '403' )

        time.sleep( 2 )

        result_404 = True

    except HydrusExceptions.NotFoundException:

        status = CC.STATUS_VETOED
        note = '404'

        self.SetStatus( status, note = note )

        status_hook( '404' )

        time.sleep( 2 )

        result_404 = True

    except Exception as e:

        status = CC.STATUS_ERROR

        self.SetStatus( status, exception = e )

        status_hook( 'error!' )

        time.sleep( 3 )

        if isinstance( e, HydrusExceptions.NetworkException ):  # so the larger queue can set a delaywork or whatever
            raise

    gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )

    return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404 )
def TestPixiv(self, pixiv_id, password):

    # this is just an ugly copy, but f**k it for the minute
    # we'll figure out a proper testing engine later with the login engine and tie the manage gui into it as well

    session = requests.Session()

    response = session.get('https://accounts.pixiv.net/login')

    soup = ClientParsing.GetSoup(response.content)

    # some whocking 20kb bit of json tucked inside a hidden form input wew lad
    i = soup.find('input', id='init-config')

    raw_json = i['value']

    j = json.loads(raw_json)

    if 'pixivAccount.postKey' not in j:
        return (False, 'When trying to log into Pixiv, I could not find the POST key! This is a problem with hydrus\'s pixiv parsing, not your login! Please contact hydrus dev!')

    post_key = j['pixivAccount.postKey']

    form_fields = {}

    form_fields['pixiv_id'] = pixiv_id
    form_fields['password'] = password
    form_fields['captcha'] = ''
    form_fields['g_recaptcha_response'] = ''
    form_fields['return_to'] = 'https://www.pixiv.net'
    form_fields['lang'] = 'en'
    form_fields['post_key'] = post_key
    form_fields['source'] = 'pc'

    headers = {}

    headers['referer'] = "https://accounts.pixiv.net/login?lang=en&source=pc&view_type=page&ref=wwwtop_accounts_index"
    headers['origin'] = "https://accounts.pixiv.net"

    r = session.post('https://accounts.pixiv.net/api/login?lang=en', data=form_fields, headers=headers)

    if not r.ok:

        HydrusData.ShowText(r.content)

        return (False, 'Login request failed! Info printed to log.')

    cookies = session.cookies

    cookies.clear_expired_cookies()

    domains = cookies.list_domains()

    for domain in domains:

        if domain.endswith('pixiv.net'):

            d = cookies.get_dict(domain)

            if 'PHPSESSID' not in d:

                HydrusData.ShowText(r.content)

                return (False, 'Pixiv login failed to establish session! Info printed to log.')

            return (True, '')

    HydrusData.ShowText(r.content)

    return (False, 'Pixiv login failed to establish session! Info printed to log.')