def __init__( self, name, url_type=None, preferred_scheme='https', netloc='hostname.com', allow_subdomains=False, keep_subdomains=False, path_components=None, parameters=None, example_url='https://hostname.com/post/page.php?id=123456&s=view'):

    if url_type is None:
        url_type = HC.URL_TYPE_POST

    if path_components is None:

        path_components = HydrusSerialisable.SerialisableList()

        path_components.append(ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='post', example_string='post'))
        path_components.append(ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='page.php', example_string='page.php'))

    if parameters is None:

        parameters = HydrusSerialisable.SerialisableDictionary()

        parameters['s'] = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='view', example_string='view')
        parameters['id'] = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FLEXIBLE, match_value=ClientParsing.NUMERIC, example_string='123456')

    # if the args are not serialisable stuff, lets overwrite here

    path_components = HydrusSerialisable.SerialisableList(path_components)
    parameters = HydrusSerialisable.SerialisableDictionary(parameters)

    HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

    self._url_type = url_type
    self._preferred_scheme = preferred_scheme
    self._netloc = netloc
    self._allow_subdomains = allow_subdomains
    self._keep_subdomains = keep_subdomains
    self._path_components = path_components
    self._parameters = parameters
    self._example_url = example_url
def __init__( self, name, preferred_scheme='https', netloc='hostname.com', subdomain_is_important=False, path_components=None, parameters=None, example_url='https://hostname.com/post/page.php?id=123456&s=view'):

    if path_components is None:

        path_components = HydrusSerialisable.SerialisableList()

        path_components.append(ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='post', example_string='post'))
        path_components.append(ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='page.php', example_string='page.php'))

    if parameters is None:

        parameters = HydrusSerialisable.SerialisableDictionary()

        parameters['s'] = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value='view', example_string='view')
        parameters['id'] = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FLEXIBLE, match_value=ClientParsing.NUMERIC, example_string='123456')

    # an edit dialog panel for this that has example url and testing of current values
    # a parent panel or something that lists all current urls in the db that match and how they will be clipped, is this ok? kind of thing.

    HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

    self._preferred_scheme = preferred_scheme
    self._netloc = netloc
    self._subdomain_is_important = subdomain_is_important
    self._path_components = path_components
    self._parameters = parameters
    self._example_url = example_url
def _ParseGalleryPage(self, html, url_base):

    definitely_no_more_pages = False

    urls = []

    soup = ClientParsing.GetSoup(html)

    thumbnail_links = soup.find_all(class_='work')

    thumbnail_urls = [thumbnail_link['href'] for thumbnail_link in thumbnail_links]

    for thumbnail_url in thumbnail_urls:

        url = urlparse.urljoin(url_base, thumbnail_url)  # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690

        urls.append(url)

    urls_and_tags = [(url, set()) for url in urls]

    return (urls_and_tags, definitely_no_more_pages)
def ConvertAllParseResultsToFileSeeds(all_parse_results, source_url, file_import_options):

    file_seeds = []

    seen_urls = set()

    for parse_results in all_parse_results:

        parsed_urls = ClientParsing.GetURLsFromParseResults(parse_results, (HC.URL_TYPE_DESIRED,), only_get_top_priority=True)

        parsed_urls = HydrusData.DedupeList(parsed_urls)

        parsed_urls = [url for url in parsed_urls if url not in seen_urls]

        seen_urls.update(parsed_urls)

        # note we do this inside the loop, since each parse_results is only appropriate for its own urls--don't move this out again, or tags will be messed up

        for url in parsed_urls:

            file_seed = ClientImportFileSeeds.FileSeed(ClientImportFileSeeds.FILE_SEED_TYPE_URL, url)

            file_seed.SetReferralURL(source_url)

            file_seed.AddParseResults(parse_results, file_import_options)

            file_seeds.append(file_seed)

    return file_seeds
def _ParseImagePage( self, html, url_base ):

    soup = ClientParsing.GetSoup( html )

    tags = set()

    author_links = soup.find( 'ul', class_ = 'authorlinks' )

    if author_links is not None:

        authors = set()

        links = author_links.find_all( 'a' )

        for link in links:

            try:

                href = link[ 'href' ] # http://warlord-of-noodles.newgrounds.com

                creator = href.replace( 'http://', '' ).replace( '.newgrounds.com', '' )

                tags.add( u'creator:' + creator )

            except: pass

    try:

        title = soup.find( 'title' )

        tags.add( u'title:' + title.string )

    except: pass

    all_links = soup.find_all( 'a' )

    for link in all_links:

        try:

            href = link[ 'href' ]

            if '/browse/tag/' in href:
                tags.add( link.string )

        except: pass

    #

    flash_url = html.split( '"http:\/\/uploads.ungrounded.net\/', 1 )[1]

    flash_url = flash_url.split( '"', 1 )[0]

    flash_url = flash_url.replace( "\/", '/' )

    flash_url = 'http://uploads.ungrounded.net/' + flash_url

    return ( flash_url, tags )
def _UpdateSerialisableInfo(self, version, old_serialisable_info):

    if version == 1:

        (url_type, preferred_scheme, netloc, allow_subdomains, keep_subdomains, serialisable_path_components, serialisable_parameters, example_url) = old_serialisable_info

        url_match_key = HydrusData.GenerateKey()

        serialisable_url_match_key = url_match_key.encode('hex')

        api_lookup_converter = ClientParsing.StringConverter(example_string=example_url)

        serialisable_api_lookup_converter = api_lookup_converter.GetSerialisableTuple()

        new_serialisable_info = (serialisable_url_match_key, url_type, preferred_scheme, netloc, allow_subdomains, keep_subdomains, serialisable_path_components, serialisable_parameters, serialisable_api_lookup_converter, example_url)

        return (2, new_serialisable_info)
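# Editor's side note, not part of the original update code: str.encode('hex') above is a
# python 2-only idiom. A ported equivalent would be bytes.hex() or binascii.hexlify; a minimal
# sketch (key_to_serialisable_hex is a hypothetical helper, assuming the key is a bytes value
# as produced by HydrusData.GenerateKey()):
import binascii

def key_to_serialisable_hex(url_match_key):

    # hexlify works on both python 2 and 3; decode to get a text string either way
    return binascii.hexlify(url_match_key).decode('ascii')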
def LoginTumblrGDPR( self ):

    # t-thanks, EU
    # this is cribbed from poking around here https://github.com/johanneszab/TumblThree/commit/3563d6cebf1a467151d6b8d6eee9806ddd6e6364

    network_job = ClientNetworkingJobs.NetworkJob( 'GET', 'http://www.tumblr.com/' )

    network_job.SetForLogin( True )

    self.engine.AddJob( network_job )

    network_job.WaitUntilDone()

    html = network_job.GetContent()

    formula = ClientParsing.ParseFormulaHTML( tag_rules = [ ClientParsing.ParseRuleHTML( rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING, tag_name = 'meta', tag_attributes = { 'id' : 'tumblr_form_key' } ) ], content_to_fetch = ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch = "content" )

    results = formula.Parse( {}, html )

    if len( results ) != 1:
        raise HydrusExceptions.ParseException( 'Could not figure out the tumblr form key for the GDPR click-through.' )

    tumblr_form_key = results[0]

    #

    body = '{\"eu_resident\":true,\"gdpr_is_acceptable_age\":true,\"gdpr_consent_core\":true,\"gdpr_consent_first_party_ads\":true,\"gdpr_consent_third_party_ads\":true,\"gdpr_consent_search_history\":true,\"redirect_to\":\"\"}'

    referral_url = 'https://www.tumblr.com/privacy/consent?redirect='

    network_job = ClientNetworkingJobs.NetworkJob( 'POST', 'https://www.tumblr.com/svc/privacy/consent', body = body, referral_url = referral_url )

    network_job.SetForLogin( True )

    network_job.AddAdditionalHeader( 'Accept', 'application/json, text/javascript, */*; q=0.01' )
    network_job.AddAdditionalHeader( 'Content-Type', 'application/json' )
    network_job.AddAdditionalHeader( 'X-Requested-With', 'XMLHttpRequest' )
    network_job.AddAdditionalHeader( 'X-tumblr-form-key', tumblr_form_key )

    self.engine.AddJob( network_job )

    network_job.WaitUntilDone()

    # test cookies here or something

    HydrusData.ShowText( 'Looks like tumblr GDPR click-through worked! You should be good for a year, at which point we should have an automatic solution for this!' )
def THREADFetchTags(self, script, job_key, file_identifier, desired_content):

    content_results = script.DoQuery(job_key, file_identifier, desired_content)

    tags = ClientParsing.GetTagsFromContentResults(content_results)

    wx.CallAfter(self._SetTags, tags)
def _ParseGalleryPage( self, html, url_base ):

    definitely_no_more_pages = False

    urls_set = set()

    soup = ClientParsing.GetSoup( html )

    def correct_url( href ):

        if href is None:
            return False

        # a good url is in the form "/pictures/user/artist_name/file_id/title"

        if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):

            ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )

            # /pictures/user/artist_name/page/3
            if file_id != 'page':
                return True

        return False

    urls = []

    links = soup.find_all( 'a', href = correct_url )

    for link in links:

        url = 'http://www.hentai-foundry.com' + link['href']

        if url not in urls_set:

            urls_set.add( url )
            urls.append( url )

    # this is copied from old code. surely we can improve it?
    if 'class="next"' not in html:
        definitely_no_more_pages = True

    urls_and_tags = [ ( url, set() ) for url in urls ]

    return ( urls_and_tags, definitely_no_more_pages )
def _ParseImagePage(self, html, page_url):

    if 'member_illust.php?mode=manga' in html:

        manga_url = page_url.replace('medium', 'manga')

        raise HydrusExceptions.MimeException(page_url + ' was manga, not a single image, so could not be downloaded.')

    if 'member_illust.php?mode=ugoira_view' in html:

        raise HydrusExceptions.MimeException(page_url + ' was ugoira, not a single image, so could not be downloaded.')

    soup = ClientParsing.GetSoup(html)

    #

    original_image = soup.find(class_='original-image')

    image_url = original_image['data-src']  # http://i3.pixiv.net/img-original/img/2014/01/25/19/21/56/41171994_p0.jpg

    #

    tags_parent = soup.find('section', class_='work-tags')

    # <a href="/search.php?s_mode=s_tag_full&word=%E3%83%8F%E3%83%B3%E3%83%89%E3%83%A1%E3%82%A4%E3%83%89" class="text">[unicode tag here]</a>
    tags = [link.string for link in tags_parent.find_all('a', class_='text')]

    user = soup.find('h1', class_='user')

    if user is not None:
        tags.append('creator:' + user.string)

    title_parent = soup.find('section', class_=re.compile('work-info'))

    if title_parent is not None:

        title = title_parent.find('h1', class_='title')

        if title is not None:
            tags.append('title:' + title.string)

    return (image_url, tags)
def THREADFetchTags( self, script, job_key, file_identifier ):

    def wx_code( tags ):

        if not self:
            return

        self._SetTags( tags )

    parse_results = script.DoQuery( job_key, file_identifier )

    tags = ClientParsing.GetTagsFromParseResults( parse_results )

    wx.CallAfter( wx_code, tags )
def UpdateFileSeedCacheWithAllParseResults(file_seed_cache, all_parse_results, source_url, max_new_urls_allowed=None):

    new_file_seeds = []

    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0

    for parse_results in all_parse_results:

        parsed_urls = ClientParsing.GetURLsFromParseResults(parse_results, (HC.URL_TYPE_DESIRED,), only_get_top_priority=True)

        for url in parsed_urls:

            num_urls_total += 1

            if max_new_urls_allowed is not None and num_urls_added == max_new_urls_allowed:
                continue

            file_seed = ClientImportFileSeeds.FileSeed(ClientImportFileSeeds.FILE_SEED_TYPE_URL, url)

            file_seed.SetReferralURL(source_url)

            if file_seed_cache.HasFileSeed(file_seed):

                num_urls_already_in_file_seed_cache += 1

            else:

                num_urls_added += 1

                file_seed.AddParseResults(parse_results)

                new_file_seeds.append(file_seed)

    file_seed_cache.AddFileSeeds(new_file_seeds)

    return (num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total)
def LoginPixiv(self, network_context, pixiv_id, password):

    session = self.engine.session_manager.GetSession(network_context)

    response = session.get('https://accounts.pixiv.net/login')

    soup = ClientParsing.GetSoup(response.content)

    # some whocking 20kb bit of json tucked inside a hidden form input wew lad
    i = soup.find('input', id='init-config')

    raw_json = i['value']

    j = json.loads(raw_json)

    if 'pixivAccount.postKey' not in j:
        raise HydrusExceptions.ForbiddenException('When trying to log into Pixiv, I could not find the POST key! This is a problem with hydrus\'s pixiv parsing, not your login! Please contact hydrus dev!')

    post_key = j['pixivAccount.postKey']

    form_fields = {}

    form_fields['pixiv_id'] = pixiv_id
    form_fields['password'] = password
    form_fields['captcha'] = ''
    form_fields['g_recaptcha_response'] = ''
    form_fields['return_to'] = 'https://www.pixiv.net'
    form_fields['lang'] = 'en'
    form_fields['post_key'] = post_key
    form_fields['source'] = 'pc'

    headers = {}

    headers['referer'] = "https://accounts.pixiv.net/login?lang=en&source=pc&view_type=page&ref=wwwtop_accounts_index"
    headers['origin'] = "https://accounts.pixiv.net"

    session.post('https://accounts.pixiv.net/api/login?lang=en', data=form_fields, headers=headers)

    time.sleep(1)
def Parse4chanPostScreen(html):

    soup = ClientParsing.GetSoup(html)

    title_tag = soup.find('title')

    if title_tag.string == 'Post successful!':

        return ('success', None)

    elif title_tag.string == '4chan - Banned':

        HydrusData.Print(soup)

        text = 'You are banned from this board! html written to log.'

        HydrusData.ShowText(text)

        return ('big error', text)

    else:

        try:

            problem_tag = soup.find(id='errmsg')

            if problem_tag is None:

                HydrusData.Print(soup)

                text = 'Unknown problem; html written to log.'

                HydrusData.ShowText(text)

                return ('error', text)

            problem = HydrusData.ToUnicode(problem_tag)

            if 'CAPTCHA' in problem:
                return ('captcha', None)
            elif 'seconds' in problem:
                return ('too quick', None)
            elif 'Duplicate' in problem:
                return ('error', 'duplicate file detected')
            else:
                return ('error', problem)

        except:

            return ('error', 'unknown error')
def _ParseGalleryPage( self, html, url_base ):

    soup = ClientParsing.GetSoup( html )

    fatcol = soup.find( 'div', class_ = 'fatcol' )

    if fatcol is not None:
        links = fatcol.find_all( 'a' )
    else:
        links = []

    urls_set = set()

    urls = []

    for link in links:

        try:

            url = link[ 'href' ]

            if url not in urls_set:

                if url.startswith( 'http://www.newgrounds.com/portal/view/' ):

                    urls_set.add( url )
                    urls.append( url )

        except: pass

    definitely_no_more_pages = True

    urls_and_tags = [ ( url, set() ) for url in urls ]

    return ( urls_and_tags, definitely_no_more_pages )
def _ParseGalleryPage(self, html, url_base):

    definitely_no_more_pages = False

    urls_and_tags = []

    soup = ClientParsing.GetSoup(html)

    thumbs_container = soup.find('div', class_='torpedo-container')

    thumbs = thumbs_container.find_all('span', class_='thumb')

    for thumb in thumbs:

        url = thumb['href']  # something in the form of blah.da.com/art/blah-123456

        tags = []

        urls_and_tags.append((url, tags))

    return (urls_and_tags, definitely_no_more_pages)
def _ParseImagePage(self, html, url_base):

    # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
    # find http://pictures.hentai-foundry.com//
    # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144/image.jpg
    # the .jpg bit is what we really need, but whatever
    # an example of this:
    # http://www.hentai-foundry.com/pictures/user/Sparrow/440257/Meroulix-LeBeau

    # addendum:
    # some users put pictures.hentai-foundry.com links in their profile images, which then gets repeated up above in some <meta> tag
    # so, let's limit this search to a smaller bit of html
    # example of this:
    # http://www.hentai-foundry.com/pictures/user/teku/572881/Special-Gang-Bang

    try:

        image_soup = ClientParsing.GetSoup(html)

        image_html = unicode(image_soup.find('section', id='picBox'))

        index = image_html.index('pictures.hentai-foundry.com')

        image_url = image_html[index:index + 256]

        if '"' in image_url:
            (image_url, gumpf) = image_url.split('"', 1)

        if '&#039;' in image_url:
            (image_url, gumpf) = image_url.split('&#039;', 1)

        image_url = 'http://' + image_url

    except Exception as e:

        raise Exception('Could not parse image url!' + os.linesep + HydrusData.ToUnicode(e))

    soup = ClientParsing.GetSoup(html)

    tags = []

    try:

        title = soup.find('title')

        (data, nothing) = title.string.split(' - Hentai Foundry')

        data_reversed = data[::-1]  # want to do it right-side first, because title might have ' by ' in it

        (artist_reversed, title_reversed) = data_reversed.split(' yb ')

        artist = artist_reversed[::-1]

        title = title_reversed[::-1]

        tags.append('creator:' + artist)
        tags.append('title:' + title)

    except:
        pass

    return (image_url, tags)
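# Editor's side note, not part of the original parser: the reverse-and-split dance above exists
# to split the '<title> by <artist>' string on its *last* ' by ', since the title itself may
# contain ' by '. str.rsplit with maxsplit=1 achieves the same thing directly; a minimal sketch,
# with split_title_and_artist being a hypothetical helper name:
def split_title_and_artist(data):

    # 'Some Work by Someone by SomeArtist' -> ('Some Work by Someone', 'SomeArtist')
    (title, artist) = data.rsplit(' by ', 1)

    return (title, artist)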
def _ParseImagePage(self, html, referral_url):

    img_url = None

    soup = ClientParsing.GetSoup(html)

    download_button = soup.find('a', class_='dev-page-download')

    if download_button is None:

        # this method maxes out at 1024 width
        img = soup.find(class_='dev-content-full')

        if img is None:

            # nsfw

            # used to fetch this from a tumblr share url, now we grab from some hidden gubbins behind an age gate

            a_ismatures = soup.find_all('a', class_='ismature')

            imgs = []

            for a_ismature in a_ismatures:
                imgs.extend(a_ismature.find_all('img'))

            for img in imgs:

                # <img width="150" height="75" alt="Jelly gals by ArtInCase" src="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg" data-src="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg" srcset="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 150w,http://t00.deviantart.net/ELwFngzSW07znskrO2jToktP2Og=/fit-in/700x350/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 698w,http://t04.deviantart.net/53Saq2w0esrTTjZIfHap4ItNNkQ=/fit-in/800x400/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 1262w" sizes="150px">

                if img.has_attr('srcset'):

                    # http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 150w,http://t00.deviantart.net/ELwFngzSW07znskrO2jToktP2Og=/fit-in/700x350/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 698w,http://t04.deviantart.net/53Saq2w0esrTTjZIfHap4ItNNkQ=/fit-in/800x400/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 1262w
                    # the last url here is what we want

                    srcset = img['srcset']

                    # 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg
                    gubbins_and_url = srcset.split(' ')[-2]

                    img_url = gubbins_and_url.split(',')[1]

                    break

        else:

            img_url = img['src']

    else:

        # something like http://www.deviantart.com/download/518046750/varda_and_the_sacred_trees_of_valinor_by_implosinoatic-d8kfjfi.jpg?token=476cb73aa2ab22bb8554542bc9f14982e09bd534&ts=1445717843
        # given the right cookies, it redirects to the truly fullsize image_url
        # otherwise, it seems to redirect to a small interstitial redirect page that heads back to the original image page

        img_url = download_button['href']

    if img_url is None:
        raise HydrusExceptions.ParseException('Could not find a download link--maybe this work was text?')

    return img_url
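# Editor's side note, not part of the original parser: the srcset handling above assumes the
# last candidate in the srcset attribute is the largest rendition and digs it out with two
# positional splits. A slightly more direct sketch of the same idea (last_srcset_url is a
# hypothetical helper, assuming the comma-separated 'url width' candidate format shown above):
def last_srcset_url(srcset):

    # 'url_a 150w,url_b 698w,url_c 1262w' -> 'url_c'
    last_candidate = srcset.split(',')[-1].strip()

    return last_candidate.split(' ')[0]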
def _ParseImagePage(self, html, url_base):

    (search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces) = self._booru.GetData()

    soup = ClientParsing.GetSoup(html)

    image_url = None

    try:

        if image_id is not None:

            image = soup.find(id=image_id)

            if image is None:

                image_string = soup.find(text=re.compile('Save this file'))

                if image_string is None:
                    image_string = soup.find(text=re.compile('Save this video'))

                if image_string is None:

                    # catchall for rule34hentai.net's webms
                    if image_url is None:

                        a_tags = soup.find_all('a')

                        for a_tag in a_tags:

                            href = a_tag['href']

                            if href is not None:

                                if href.endswith('.webm'):

                                    image_url = href

                                    break

                    # catchall for rule34hentai.net's mp4s, which are loaded in a mickey-mouse flv player
                    if image_url is None:

                        magic_phrase = 'document.write("<source src=\''

                        if magic_phrase in html:

                            # /image/252605' type='video/mp4...
                            image_url_and_gumpf = html.split(magic_phrase, 1)[1]

                            image_url = image_url_and_gumpf.split('\'', 1)[0]

                else:

                    image = image_string.parent

                    image_url = image['href']

            else:

                if image.name in ('img', 'video'):

                    image_url = image['src']

                    if 'Running Danbooru' in html:

                        # possible danbooru resized image
                        possible_better_image = soup.find(id='image-resize-link')

                        if possible_better_image is not None:
                            image_url = possible_better_image['href']

                elif image.name == 'a':

                    image_url = image['href']

        if image_data is not None:

            links = soup.find_all('a')

            ok_link = None
            better_link = None

            for link in links:

                if link.string is not None:

                    if link.string.startswith(image_data) or link.string.endswith(image_data):
                        ok_link = link['href']

                    if link.string.startswith('Download PNG'):

                        better_link = link['href']

                        break

            if better_link is not None:
                image_url = better_link
            else:
                image_url = ok_link

    except Exception as e:

        raise HydrusExceptions.DataMissing('Could not parse a download link for ' + url_base + '!' + os.linesep + HydrusData.ToUnicode(e))

    if image_url is None:
        raise HydrusExceptions.DataMissing('Could not parse a download link for ' + url_base + '!')

    image_url = urlparse.urljoin(url_base, image_url)

    if 'gelbooru.com' in url_base:

        # giving 404 on some content servers for http, no redirect for some reason
        image_url = ClientNetworkingDomain.ConvertHTTPToHTTPS(image_url)

    tags = []

    for (tag_classname, namespace) in tag_classnames_to_namespaces.items():

        tag_list_entries = soup.find_all(class_=tag_classname)

        for tag_list_entry in tag_list_entries:

            links = tag_list_entry.find_all('a')

            if tag_list_entry.name == 'a':
                links.append(tag_list_entry)

            for link in links:

                if link.string is None:
                    continue

                try:

                    tag_string = HydrusData.ToUnicode(link.string)
                    tag_string = HydrusTags.CleanTag(tag_string)

                    if tag_string in ('?', '-', '+', u'\xe2\x80\x93', u'\u2013'):  # last two are a couple of amusing encodings of en-dash '-' from danbooru
                        continue

                    tag = HydrusTags.CombineTag(namespace, tag_string)

                    tags.append(tag)

                except Exception as e:

                    HydrusData.Print('Could not parse tag "' + repr(link.string) + '":')

                    HydrusData.PrintException(e)

    return (image_url, tags)
def _ParseGalleryPage(self, html, url_base):

    definitely_no_more_pages = False

    urls_set = set()
    urls = []

    soup = ClientParsing.GetSoup(html)

    # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
    def starts_with_classname(classname):
        return classname is not None and classname.startswith(self._thumb_classname)

    thumbnails = soup.find_all(class_=starts_with_classname)

    # this is a sankaku thing
    popular_thumbnail_parent = soup.find(id='popular-preview')

    if popular_thumbnail_parent is not None:

        popular_thumbnails = popular_thumbnail_parent.find_all(class_=starts_with_classname)

        thumbnails = thumbnails[len(popular_thumbnails):]

    for thumbnail in thumbnails:

        links = thumbnail.find_all('a')

        if thumbnail.name == 'a':
            links.append(thumbnail)

        for link in links:

            if link.string is not None and link.string == 'Image Only':
                continue  # rule 34 @ paheal fix

            url = link['href']

            url = urlparse.urljoin(url_base, url)

            if url not in urls_set:

                urls_set.add(url)
                urls.append(url)

    if len(urls) == 0:
        definitely_no_more_pages = True

    if self._booru_name not in gallery_advance_nums:

        if len(urls) > 0:
            gallery_advance_nums[self._booru_name] = len(urls)

    if 'gelbooru.com' in url_base:

        # they now use redirect urls for thumbs, wew lad

        bad_urls = urls

        urls = []

        session = requests.Session()

        for bad_url in bad_urls:

            # the garbage after the redirect.php is the redirect in base64
            # https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY5NDEyMg==

            if 'redirect.php' in bad_url:

                try:

                    encoded_location = bad_url.split('?s=')[1]

                    location = encoded_location.decode('base64')

                    url = urlparse.urljoin(bad_url, location)

                    urls.append(url)

                except Exception as e:

                    HydrusData.ShowText('gelbooru parsing problem!')
                    HydrusData.ShowException(e)

                    time.sleep(2)

                    break

            else:

                urls.append(bad_url)

        # giving 404 on some content servers for http, no redirect for some reason
        urls = [ClientNetworkingDomain.ConvertHTTPToHTTPS(url) for url in urls]

    urls_and_tags = [(url, set()) for url in urls]

    return (urls_and_tags, definitely_no_more_pages)
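# Editor's side note, not part of the original parser: a self-contained sketch of the gelbooru
# redirect decoding used above, with the base64 module instead of the python 2-only
# str.decode('base64'). decode_gelbooru_redirect is a hypothetical helper; the example url is
# the one quoted in the comment above.
import base64
import urlparse

def decode_gelbooru_redirect(redirect_url):

    # everything after '?s=' is the destination url, base64-encoded
    encoded_location = redirect_url.split('?s=')[1]

    # typically decodes to a protocol-relative url like '//gelbooru.com/index.php?page=post&s=view&id=3694122'
    location = base64.b64decode(encoded_location)

    # urljoin fills the scheme back in from the redirect url
    return urlparse.urljoin(redirect_url, location)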
def _ParseGalleryPage(self, data, url_base):

    def ConvertRegularToRawURL(regular_url):

        # convert this:
        # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
        # to this:
        # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
        # the 500 part can be a bunch of stuff, including letters

        url_components = regular_url.split('_')

        last_component = url_components[-1]

        (number_gubbins, file_ext) = last_component.split('.')

        raw_last_component = 'raw.' + file_ext

        url_components[-1] = raw_last_component

        raw_url = '_'.join(url_components)

        return raw_url

    def Remove68Subdomain(long_url):

        # sometimes the 68 subdomain gives a 404 on the raw url, so:
        # convert this:
        # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
        # to this:
        # http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg

        # I am not sure if it is always 68, but let's not assume
        # Indeed, this is apparently now 78, wew!

        (scheme, rest) = long_url.split('://', 1)

        if rest.startswith('media.tumblr.com'):
            return long_url

        (gumpf, shorter_rest) = rest.split('.', 1)

        shorter_url = scheme + '://' + shorter_rest

        return shorter_url

    def MediaToDataSubdomain(url):

        return url.replace('media', 'data', 1)

    definitely_no_more_pages = False

    if data.startswith('<!DOCTYPE html>'):

        message = 'The tumblr downloader received an unexpected HTML page when it tried to download JSON post information. It is likely that you are an EU/EEA user and have been hit by a GDPR click-through issue.'
        message += os.linesep * 2
        message += 'In order to get the hydrus client to \'click ok\' on that page, please hit _network->logins->DEBUG: misc->do tumblr GDPR click-through_ and try this gallery search again.'
        message += os.linesep * 2
        message += 'If you still have problems, please let hydrus dev know.'

        HydrusData.ShowText(message)

        raise Exception('Tumblr GDPR issue.')

    processed_raw_json = data.split('var tumblr_api_read = ')[1][:-2]  # -1 takes a js ';' off the end

    json_object = json.loads(processed_raw_json)

    urls_and_tags = []

    if 'posts' in json_object:

        for post in json_object['posts']:

            # 2012-06-20 15:59:00 GMT
            date = post['date-gmt']

            date_struct = time.strptime(date, '%Y-%m-%d %H:%M:%S %Z')

            raw_url_available = date_struct.tm_year > 2012

            if 'tags' in post:
                tags = post['tags']
            else:
                tags = []

            post_type = post['type']

            if post_type == 'photo':

                if len(post['photos']) == 0:
                    photos = [post]
                else:
                    photos = post['photos']

                for photo in photos:

                    try:

                        url = photo['photo-url-1280']

                        # some urls are given in the form:
                        # https://68.media.tumblr.com/tumblr_m5yb5m2O6A1rso2eyo1_540.jpg
                        # which is missing the hex key in the middle
                        # these urls are unavailable as raws from the main media server
                        # these seem to all be the pre-2013 files, but we'll double-check just in case anyway

                        unusual_hexless_url = url.count('/') == 3

                        if not unusual_hexless_url:

                            if raw_url_available:

                                url = ConvertRegularToRawURL(url)

                                url = Remove68Subdomain(url)

                                url = MediaToDataSubdomain(url)

                        url = ClientNetworkingDomain.ConvertHTTPToHTTPS(url)

                        urls_and_tags.append((url, tags))

                    except:
                        pass

            elif post_type == 'video':

                if 'video-player' in post:

                    video_player_html = post['video-player']

                    try:

                        vp_soup = ClientParsing.GetSoup(video_player_html)

                        vp_source = vp_soup.find('source')

                        url = vp_source['src']

                        urls_and_tags.append((url, tags))

                    except:
                        pass

    return (urls_and_tags, definitely_no_more_pages)
def ConvertBooruToNewObjects(booru):

    name = booru.GetName()

    name = 'zzz - auto-generated from legacy booru system - ' + name

    (search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces) = booru.GetData()

    if advance_by_page_num:
        search_url = search_url.replace('%index%', '1')
    else:
        search_url = search_url.replace('%index%', '0')

    gug = ClientNetworkingDomain.GalleryURLGenerator(name + ' search', url_template=search_url, replacement_phrase='%tags%', search_terms_separator=search_separator, initial_search_text='tag search', example_search_text='blonde_hair blue_eyes')

    #

    tag_rules = []

    rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
    tag_name = None
    tag_attributes = {'class': thumb_classname}
    tag_index = None

    tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

    rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
    tag_name = 'a'
    tag_attributes = None
    tag_index = None

    tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

    formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch='href')

    url_type = HC.URL_TYPE_DESIRED
    priority = 50

    additional_info = (url_type, priority)

    thumb_content_parser = ClientParsing.ContentParser(name='get post urls (based on old booru thumb search)', content_type=HC.CONTENT_TYPE_URLS, formula=formula, additional_info=additional_info)

    gallery_parser = ClientParsing.PageParser(name + ' gallery page parser', content_parsers=[thumb_content_parser], example_urls=[gug.GetExampleURL()])

    #

    content_parsers = []

    if image_id is not None:

        tag_rules = []

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = 'a'
        tag_attributes = {'id': image_id}
        tag_index = None

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

        formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch='href')

        url_type = HC.URL_TYPE_DESIRED
        priority = 75

        additional_info = (url_type, priority)

        image_link_content_parser = ClientParsing.ContentParser(name='get image file link url (based on old booru parser)', content_type=HC.CONTENT_TYPE_URLS, formula=formula, additional_info=additional_info)

        content_parsers.append(image_link_content_parser)

        #

        tag_rules = []

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = 'img'
        tag_attributes = {'id': image_id}
        tag_index = None

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

        formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch='src')

        url_type = HC.URL_TYPE_DESIRED
        priority = 50

        additional_info = (url_type, priority)

        image_src_content_parser = ClientParsing.ContentParser(name='get image file src url (based on old booru parser)', content_type=HC.CONTENT_TYPE_URLS, formula=formula, additional_info=additional_info)

        content_parsers.append(image_src_content_parser)

    elif image_data is not None:

        tag_rules = []

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = 'a'
        tag_attributes = None
        tag_index = None

        string_match = ClientParsing.StringMatch(match_type=ClientParsing.STRING_MATCH_FIXED, match_value=image_data, example_string=image_data)

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index, should_test_tag_string=True, tag_string_string_match=string_match))

        formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch='href')

        url_type = HC.URL_TYPE_DESIRED
        priority = 50

        additional_info = (url_type, priority)

        image_link_content_parser = ClientParsing.ContentParser(name='get image file url (based on old booru parser)', content_type=HC.CONTENT_TYPE_URLS, formula=formula, additional_info=additional_info)

        content_parsers.append(image_link_content_parser)

    for (classname, namespace) in tag_classnames_to_namespaces.items():

        tag_rules = []

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = None
        tag_attributes = {'class': classname}
        tag_index = None

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

        rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING
        tag_name = 'a'
        tag_attributes = None
        tag_index = None

        tag_rules.append(ClientParsing.ParseRuleHTML(rule_type=rule_type, tag_name=tag_name, tag_attributes=tag_attributes, tag_index=tag_index))

        formula = ClientParsing.ParseFormulaHTML(tag_rules=tag_rules, content_to_fetch=ClientParsing.HTML_CONTENT_STRING)

        additional_info = namespace

        tag_content_parser = ClientParsing.ContentParser(name='get "' + namespace + '" tags', content_type=HC.CONTENT_TYPE_MAPPINGS, formula=formula, additional_info=additional_info)

        content_parsers.append(tag_content_parser)

    post_parser = ClientParsing.PageParser(name + ' post page parser', content_parsers=content_parsers, example_urls=[])

    #

    return (gug, gallery_parser, post_parser)
def WorkOnURL( self, gallery_seed_log, file_seed_cache, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, max_new_urls_allowed = None, gallery_urls_seen_before = None ):

    if gallery_urls_seen_before is None:
        gallery_urls_seen_before = set()

    gallery_urls_seen_before.add( self.url )

    # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
    # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop

    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0
    result_404 = False

    try:

        ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.url )

        if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
            raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )

        if not can_parse:
            raise HydrusExceptions.VetoException( 'Did not have a parser for this URL!' )

        ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( self.url )

        status_hook( 'downloading page' )

        if self._referral_url not in ( self.url, url_to_check ):
            referral_url = self._referral_url
        else:
            referral_url = None

        network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )

        network_job.OverrideBandwidth( 30 )

        HG.client_controller.network_engine.AddJob( network_job )

        with network_job_presentation_context_factory( network_job ) as njpc:
            network_job.WaitUntilDone()

        data = network_job.GetContent()

        parsing_context = {}

        parsing_context[ 'gallery_url' ] = self.url
        parsing_context[ 'url' ] = url_to_check

        all_parse_results = parser.Parse( parsing_context, data )

        if len( all_parse_results ) == 0:
            raise HydrusExceptions.VetoException( 'Could not parse any data!' )

        title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )

        if title is not None:
            title_hook( title )

        ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total ) = ClientImporting.UpdateFileSeedCacheWithAllParseResults( file_seed_cache, all_parse_results, self.url, max_new_urls_allowed )

        if max_new_urls_allowed is None:
            can_add_more_file_urls = True
        else:
            can_add_more_file_urls = num_urls_added < max_new_urls_allowed

        status = CC.STATUS_SUCCESSFUL_AND_NEW

        note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'

        if num_urls_already_in_file_seed_cache > 0:
            note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'

        if not can_add_more_file_urls:
            note += ' - hit file limit'

        # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
        can_add_more_gallery_urls = num_urls_total > 0 and can_add_more_file_urls

        if self._can_generate_more_pages and can_add_more_gallery_urls:

            flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )

            next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )

            if len( next_page_urls ) > 0:

                next_page_urls = HydrusData.DedupeList( next_page_urls )

                new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]

                duplicate_next_page_urls = gallery_urls_seen_before.intersection( new_next_page_urls )

                num_new_next_page_urls = len( new_next_page_urls )
                num_dupe_next_page_urls = len( duplicate_next_page_urls )

                if num_new_next_page_urls > 0:

                    next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]

                    gallery_seed_log.AddGallerySeeds( next_gallery_seeds )

                    gallery_urls_seen_before.update( new_next_page_urls )

                    if num_dupe_next_page_urls == 0:
                        note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + ' next gallery pages found'
                    else:
                        note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + ' next gallery pages found, but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'

                else:

                    note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' next gallery pages found, but they had already been visited this run and were not added'

        self.SetStatus( status, note = note )

    except HydrusExceptions.ShutdownException:

        pass

    except HydrusExceptions.VetoException as e:

        status = CC.STATUS_VETOED

        note = HydrusData.ToUnicode( e )

        self.SetStatus( status, note = note )

        if isinstance( e, HydrusExceptions.CancelledException ):

            status_hook( 'cancelled!' )

            time.sleep( 2 )

    except HydrusExceptions.ForbiddenException:

        status = CC.STATUS_VETOED
        note = '403'

        self.SetStatus( status, note = note )

        status_hook( '403' )

        time.sleep( 2 )

        result_404 = True

    except HydrusExceptions.NotFoundException:

        status = CC.STATUS_VETOED
        note = '404'

        self.SetStatus( status, note = note )

        status_hook( '404' )

        time.sleep( 2 )

        result_404 = True

    except Exception as e:

        status = CC.STATUS_ERROR

        self.SetStatus( status, exception = e )

        status_hook( 'error!' )

        time.sleep( 3 )

        if isinstance( e, HydrusExceptions.NetworkException ):  # so the larger queue can set a delaywork or whatever
            raise

    gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )

    return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404 )
def TestPixiv(self, pixiv_id, password):

    # this is just an ugly copy, but f**k it for the minute
    # we'll figure out a proper testing engine later with the login engine and tie the manage gui into it as well

    session = requests.Session()

    response = session.get('https://accounts.pixiv.net/login')

    soup = ClientParsing.GetSoup(response.content)

    # some whocking 20kb bit of json tucked inside a hidden form input wew lad
    i = soup.find('input', id='init-config')

    raw_json = i['value']

    j = json.loads(raw_json)

    if 'pixivAccount.postKey' not in j:
        return (False, 'When trying to log into Pixiv, I could not find the POST key! This is a problem with hydrus\'s pixiv parsing, not your login! Please contact hydrus dev!')

    post_key = j['pixivAccount.postKey']

    form_fields = {}

    form_fields['pixiv_id'] = pixiv_id
    form_fields['password'] = password
    form_fields['captcha'] = ''
    form_fields['g_recaptcha_response'] = ''
    form_fields['return_to'] = 'https://www.pixiv.net'
    form_fields['lang'] = 'en'
    form_fields['post_key'] = post_key
    form_fields['source'] = 'pc'

    headers = {}

    headers['referer'] = "https://accounts.pixiv.net/login?lang=en&source=pc&view_type=page&ref=wwwtop_accounts_index"
    headers['origin'] = "https://accounts.pixiv.net"

    r = session.post('https://accounts.pixiv.net/api/login?lang=en', data=form_fields, headers=headers)

    if not r.ok:

        HydrusData.ShowText(r.content)

        return (False, 'Login request failed! Info printed to log.')

    cookies = session.cookies

    cookies.clear_expired_cookies()

    domains = cookies.list_domains()

    for domain in domains:

        if domain.endswith('pixiv.net'):

            d = cookies.get_dict(domain)

            if 'PHPSESSID' not in d:

                HydrusData.ShowText(r.content)

                return (False, 'Pixiv login failed to establish session! Info printed to log.')

            return (True, '')

    HydrusData.ShowText(r.content)

    return (False, 'Pixiv login failed to establish session! Info printed to log.')