def LoginTumblrGDPR( self ):
    """
    Perform the tumblr GDPR consent click-through.

    Fetches the tumblr front page, parses the 'tumblr_form_key' meta value out of it, then POSTs
    a consent body to the privacy consent endpoint with that key sent in a header. A text
    notification is shown once both jobs have finished.

    Raises HydrusExceptions.ParseException if the front page does not yield exactly one form key.
    """

    # the request sequence is cribbed from poking around here:
    # https://github.com/johanneszab/TumblThree/commit/3563d6cebf1a467151d6b8d6eee9806ddd6e6364

    # step one: get the front page and dig the form key out of <meta id="tumblr_form_key" content="...">

    front_page_job = ClientNetworkingJobs.NetworkJob( 'GET', 'http://www.tumblr.com/' )

    front_page_job.SetForLogin( True )

    self.engine.AddJob( front_page_job )

    front_page_job.WaitUntilDone()

    front_page_html = front_page_job.GetContent()

    form_key_rule = ClientParsing.ParseRuleHTML(
        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING,
        tag_name = 'meta',
        tag_attributes = { 'id' : 'tumblr_form_key' }
    )

    form_key_formula = ClientParsing.ParseFormulaHTML(
        tag_rules = [ form_key_rule ],
        content_to_fetch = ClientParsing.HTML_CONTENT_ATTRIBUTE,
        attribute_to_fetch = "content"
    )

    form_keys = form_key_formula.Parse( {}, front_page_html )

    if len( form_keys ) != 1:

        raise HydrusExceptions.ParseException( 'Could not figure out the tumblr form key for the GDPR click-through.' )

    tumblr_form_key = form_keys[0]

    # step two: post the consent json, carrying the form key in a header

    consent_body = '{\"eu_resident\":true,\"gdpr_is_acceptable_age\":true,\"gdpr_consent_core\":true,\"gdpr_consent_first_party_ads\":true,\"gdpr_consent_third_party_ads\":true,\"gdpr_consent_search_history\":true,\"redirect_to\":\"\"}'

    consent_referral_url = 'https://www.tumblr.com/privacy/consent?redirect='

    consent_job = ClientNetworkingJobs.NetworkJob( 'POST', 'https://www.tumblr.com/svc/privacy/consent', body = consent_body, referral_url = consent_referral_url )

    consent_job.SetForLogin( True )

    # headers are added in this fixed order
    consent_headers = (
        ( 'Accept', 'application/json, text/javascript, */*; q=0.01' ),
        ( 'Content-Type', 'application/json' ),
        ( 'X-Requested-With', 'XMLHttpRequest' ),
        ( 'X-tumblr-form-key', tumblr_form_key )
    )

    for ( header_name, header_value ) in consent_headers:

        consent_job.AddAdditionalHeader( header_name, header_value )

    self.engine.AddJob( consent_job )

    consent_job.WaitUntilDone()

    # nothing here inspects the response or the resulting cookies--testing them would be a good addition

    HydrusData.ShowText( 'Looks like tumblr GDPR click-through worked! You should be good for a year, at which point we should have an automatic solution for this!' )
def _ParseContent( self, root ):
    """
    Extract one piece of content from the given tag, test it, and return it converted.

    When no content rule is set, the tag's string is used. Otherwise the content rule names the
    attribute to read; a list-valued attribute is joined with single spaces.

    Raises HydrusExceptions.ParseException if the content is missing or an empty string.
    The content is passed to the string match's Test before the string converter's Convert
    result is returned.
    """

    attribute_name = self._content_rule

    if attribute_name is None:

        content = root.string

    elif not root.has_attr( attribute_name ):

        content = None

    else:

        attribute_value = root[ attribute_name ]

        if isinstance( attribute_value, list ):

            # multi-valued attributes such as 'class' come back as a list rather than a string
            content = ' '.join( attribute_value ) if len( attribute_value ) > 0 else None

        else:

            content = attribute_value

    if content is None or content == '':

        raise HydrusExceptions.ParseException( 'No results found!' )

    self._string_match.Test( content )

    return self._string_converter.Convert( content )
def _ParseImagePage(self, html, referral_url):
    """
    Find the best available image URL on an image page.

    (The example URLs in the comments below are from deviantart.)

    Sources are tried in this order:
      1. the href of the 'dev-page-download' link
      2. the src of the 'dev-content-full' element
      3. the last URL in the srcset of the first usable <img> inside an 'ismature' link

    html is the page content handed to ClientParsing.GetSoup.
    referral_url is accepted for interface compatibility and is not used here.

    Returns the URL string.
    Raises HydrusExceptions.ParseException if no URL could be found, including when the
    expected attribute is absent or a srcset holds no candidates.
    """

    def last_srcset_url(srcset):
        # Return the URL of the final candidate in a srcset string, or None if it has none.
        # A URL is a run of non-whitespace characters, so commas inside a URL
        # (e.g. '.../filters:fixed_height(100,100):origin()/...') are kept intact.
        # A descriptor such as '798w' runs up to the comma that closes its candidate.
        last_url = None
        pos = 0
        end = len(srcset)

        while True:
            # skip the separators between candidates
            while pos < end and (srcset[pos].isspace() or srcset[pos] == ','):
                pos += 1

            if pos >= end:
                return last_url

            url_start = pos

            while pos < end and not srcset[pos].isspace():
                pos += 1

            url = srcset[url_start:pos]

            if url.endswith(','):
                # 'url, next_url ...': no descriptor, the trailing comma closed this candidate
                url = url.rstrip(',')
            else:
                # step over the descriptor(s) to the comma that closes this candidate
                while pos < end and srcset[pos] != ',':
                    pos += 1

            last_url = url

    soup = ClientParsing.GetSoup(html)

    download_button = soup.find('a', class_='dev-page-download')

    if download_button is not None:
        # something like http://www.deviantart.com/download/518046750/varda_and_the_sacred_trees_of_valinor_by_implosinoatic-d8kfjfi.jpg?token=...&ts=1445717843
        # given the right cookies, it redirects to the truly fullsize image url
        # otherwise, it seems to redirect to a small interstitial redirect page that heads back to the original image page
        # .get() so that a link with no href ends in the ParseException below rather than a KeyError
        img_url = download_button.get('href')
    else:
        # this method maxes out at 1024 width
        full_img = soup.find(class_='dev-content-full')

        if full_img is not None:
            img_url = full_img.get('src')
        else:
            # nsfw
            # used to fetch this from a tumblr share url, now we grab from some hidden gubbins behind an age gate
            img_url = None

            mature_imgs = [
                mature_img
                for a_ismature in soup.find_all('a', class_='ismature')
                for mature_img in a_ismature.find_all('img')
            ]

            for mature_img in mature_imgs:
                # the srcset looks like:
                # <150px thumb url> 150w,<url> 698w,<url> 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 1262w
                # the last url there is what we want
                if not mature_img.has_attr('srcset'):
                    continue

                img_url = last_srcset_url(mature_img['srcset'])

                if img_url is not None:
                    break

    if img_url is None:
        raise HydrusExceptions.ParseException( 'Could not find a download link--maybe this work was text?')

    return img_url