def get_for_author(self, author):
    logger.info(lambda: "Fetching quotes from Goodreads for author=%s" % author)
    url = iri2uri("https://www.goodreads.com/quotes/search?utf8=\u2713&q=%s" % author)
    soup = Util.html_soup(url)
    # Collect the numeric pagination links so we can jump to a random results page
    page_links = list(
        Util.safe_map(
            int,
            [
                pagelink.contents[0]
                for pagelink in soup.find_all(href=re.compile("quotes/search.*page="))
            ],
        )
    )
    if page_links:
        page = random.randint(1, max(page_links))
        url = iri2uri(
            "https://www.goodreads.com/quotes/search?utf8=\u2713&q=%s&page=%d" % (author, page)
        )
        soup = Util.html_soup(url)
    return self.get_from_soup(url, soup)
def get_for_keyword(self, keyword):
    logger.info(lambda: "Fetching quotes from Goodreads for keyword=%s" % keyword)
    url = iri2uri("https://www.goodreads.com/quotes/tag?utf8=\u2713&id=%s" % keyword)
    soup = Util.html_soup(url)
    page_links = list(
        Util.safe_map(
            int,
            [
                pagelink.contents[0]
                for pagelink in soup.find_all(href=re.compile("quotes/tag.*page="))
            ],
        )
    )
    if page_links:
        page = random.randint(1, max(page_links))
        url = iri2uri(
            "https://www.goodreads.com/quotes/tag?utf8=\u2713&id=%s&page=%d" % (keyword, page)
        )
        soup = Util.html_soup(url)
    return self.get_from_soup(url, soup)
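# Note (assumption, not part of the original code): both Goodreads functions above feed
# the pagination link texts through Util.safe_map(int, ...). The name suggests a map()
# variant that silently skips elements for which the callable raises, so non-numeric
# links such as "Next »" do not abort the int() conversion. A minimal sketch of that
# assumed behaviour, not the actual Util implementation:
def safe_map(func, iterable):
    # Yield func(element) for each element, skipping elements where func raises.
    for element in iterable:
        try:
            yield func(element)
        except Exception:
            continue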
def fetch_goodreads_quotes(self):
    BASE_URL = "https://www.goodreads.com/quotes"
    self.quotes = []
    # iterate through goodreads pagination
    for i in range(1, 20):
        query = "?page=" + str(i)
        url = BASE_URL + query
        bs = Util.html_soup(url)
        # this is the element that contains the quote text
        quoteElems = bs.find_all("div", {"class": "quoteText"})
        # process the selected elems to get the quote text
        for tag in quoteElems:
            # ignore the tag that contains the author info
            quote_stripped_author = tag.contents[:-1]
            quoteFragments = []
            self._create_quote_fragments(quoteFragments, quote_stripped_author)
            # create a new entry
            new_quote = self.assemble_quote(tag, quoteFragments, url)
            self.quotes.append(new_quote)
    if not self.quotes:
        logger.warning("Could not find quotes for URL " + BASE_URL)
def get_inspired_quotes(self):
    self.quotes = []
    url = "http://feeds.lds.org/lds-inspirational-messages-eng"
    bs = Util.html_soup(url)
    items = bs.select("item")
    for item in items:
        formatted_item = BeautifulSoup(item.prettify(formatter=None))
        quote = formatted_item.find("blockquote")
        # get_text() already returns str; keep it as text so the slicing below works
        sQuote = quote.get_text()
        author = formatted_item.find("p")
        link = author.find("a").get("href")
        sAuthor = author.get_text()
        # strip the leading prefix and trailing ", ..." / "Topics ..." segments
        sAuthor = sAuthor[3:]
        if "," in sAuthor:
            sAuthor = sAuthor[: sAuthor.index(",")]
        if "Topics" in sAuthor:
            sAuthor = sAuthor[: sAuthor.index("Topics")]
        tempitem = {"quote": sQuote, "author": sAuthor, "sourceName": "LDS.org", "link": link}
        self.quotes.append(tempitem)
    if not self.quotes:
        logger.warning("Could not find quotes for URL " + url)
def fill_queue(self):
    logger.info(lambda: "ArtStation URL: " + self.config)
    queue = []
    # json_url = ArtStationDownloader.build_json_url(self.config)
    url = self.config
    s = Util.html_soup(url)
    author = s.find("channel").find("title").get_text().strip()
    author_url = s.find("channel").find("link").next.strip()
    items = s.findAll("item")
    for index, item in enumerate(items):
        try:
            extra_metadata = {
                "headline": item.find("title").get_text().strip(),
                "description": item.find("description").get_text().strip().replace("]]>", ""),
                "author": author,
                "authorURL": author_url,
            }
            src_url = item.find("guid").text + "#" + str(index)
            image_urls = [img["src"] for img in item.findAll("img")]
            for image_url in image_urls:
                queue.append((src_url, image_url, extra_metadata))
        except Exception:
            logger.exception(lambda: "Could not process an item in the ArtStation rss result")
    random.shuffle(queue)
    return queue
def download_queue_item(self, queue_item):
    wallpaper_url = queue_item
    logger.info(lambda: "Wallpaper URL: " + wallpaper_url)

    s = Util.html_soup(wallpaper_url)
    src_url = s.find("img", id="wallpaper")["src"]
    logger.info(lambda: "Image src URL: " + src_url)

    extra_metadata = {}
    try:
        extra_metadata["keywords"] = [
            el.text.strip() for el in s.find_all("a", {"class": "tagname"})
        ]
    except Exception:
        pass

    try:
        # Map Wallhaven's purity label to a numeric safe-for-work rating
        purity = s.find("div", "sidebar-content").find("label", "purity").text.lower()
        sfw_rating = {"sfw": 100, "sketchy": 50, "nsfw": 0}[purity]
        extra_metadata["sfwRating"] = sfw_rating
        if self.is_safe_mode_enabled() and sfw_rating < 100:
            logger.info(
                lambda: "Skipping non-safe download from Wallhaven. "
                "Is the source %s suitable for Safe mode?" % self.config
            )
            return None
    except Exception:
        pass

    return self.save_locally(wallpaper_url, src_url, extra_metadata=extra_metadata)
def download_one(self):
    min_download_interval, min_fill_queue_interval = self.parse_server_options("wallhaven", 0, 0)

    if time.time() - WallhavenDownloader.last_download_time < min_download_interval:
        logger.info(
            lambda: "Minimal interval between Wallhaven downloads is %d, skip this attempt"
            % min_download_interval
        )
        return None

    logger.info(lambda: "Downloading an image from Wallhaven.cc, " + self.location)
    logger.info(lambda: "Queue size: %d" % len(self.queue))

    if not self.queue:
        if time.time() - self.last_fill_time < min_fill_queue_interval:
            logger.info(
                lambda: "Wallhaven queue empty, but minimal interval between fill attempts is %d, "
                "will try again later" % min_fill_queue_interval
            )
            return None
        self.fill_queue()

    if not self.queue:
        logger.info(lambda: "Wallhaven queue still empty after fill request")
        return None

    WallhavenDownloader.last_download_time = time.time()

    wallpaper_url = self.queue.pop()
    logger.info(lambda: "Wallpaper URL: " + wallpaper_url)

    s = Util.html_soup(wallpaper_url)
    src_url = s.find("img", id="wallpaper")["src"]
    logger.info(lambda: "Image src URL: " + src_url)

    extra_metadata = {}
    try:
        extra_metadata["keywords"] = [
            el.text.strip() for el in s.find_all("a", {"class": "tagname"})
        ]
    except Exception:
        pass

    try:
        purity = s.find("div", "sidebar-content").find("label", "purity").text.lower()
        sfw_rating = {"sfw": 100, "sketchy": 50, "nsfw": 0}[purity]
        extra_metadata["sfwRating"] = sfw_rating
        if self.parent and self.parent.options.safe_mode and sfw_rating < 100:
            logger.info(
                lambda: "Skipping non-safe download from Wallhaven. "
                "Is the source %s suitable for Safe mode?" % self.location
            )
            return None
    except Exception:
        pass

    return self.save_locally(wallpaper_url, src_url, extra_metadata=extra_metadata)
def fill_queue(self):
    logger.info(lambda: "Filling APOD queue from Archive")

    s = Util.html_soup(self.ROOT_URL + "archivepix.html")
    urls = [
        self.ROOT_URL + x["href"]
        for x in s.findAll("a")
        if x["href"].startswith("ap") and x["href"].endswith(".html")
    ]
    urls = urls[:730]  # leave only last 2 years' pics
    urls = [x for x in urls if not self.is_in_banned(x)]

    queue = urls[:3]  # always put the latest 3 first
    urls = urls[3:]
    random.shuffle(urls)  # shuffle the rest
    queue.extend(urls)
    return queue
def search(self, page=None):
    url = self.url
    if page:
        url = url + ("&" if "?" in self.url else "?") + "page=" + str(page)

    logger.info(lambda: "Performing wallhaven search: url=%s" % url)
    soup = Util.html_soup(url)

    result_count = None
    try:
        # The result count in the listing header may contain thousands separators,
        # so strip commas before converting to int
        result_count = int(
            soup.find("header", {"class": "listing-header"})
            .find("h1")
            .text.split()[0]
            .replace(",", "")
        )
    except Exception:
        pass

    return soup, result_count
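# Hypothetical caller sketch (not part of the original code): the result_count returned
# by search() could be used to pick a random results page. The page size of 24 thumbnails
# per page is an assumption about Wallhaven, and pick_random_result_page is an invented
# name; it relies on the same module-level random import used by the functions above.
def pick_random_result_page(self):
    RESULTS_PER_PAGE = 24  # assumed Wallhaven page size, not confirmed by the code above
    soup, result_count = self.search()
    if not result_count:
        return soup  # fall back to the first page if the count could not be parsed
    pages = max(1, (result_count + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE)
    soup, _ = self.search(page=random.randint(1, pages))
    return soup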
def download_queue_item(self, queue_item):
    origin_url = queue_item
    logger.info(lambda: "APOD URL: " + origin_url)

    s = Util.html_soup(origin_url)
    img_url = None
    try:
        link = s.find("img").parent["href"]
        if link.startswith("image/"):
            img_url = self.ROOT_URL + link
            logger.info(lambda: "Image URL: " + img_url)
    except Exception:
        pass

    if img_url:
        return self.save_locally(origin_url, img_url, source_location=self.ROOT_URL)
    else:
        logger.info(lambda: "No image url found for this APOD URL")
        return None
def fetch(url, xml=False):
    return Util.xml_soup(url) if xml else Util.html_soup(url)
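# Usage sketch for fetch() above (illustrative only, not part of the original code):
# pass xml=True when the URL points to an XML/RSS document so it is parsed with
# Util.xml_soup instead of the default Util.html_soup. The feed URL is the one used
# by get_inspired_quotes above; the second URL appears in fetch_goodreads_quotes.
# rss_soup = fetch("http://feeds.lds.org/lds-inspirational-messages-eng", xml=True)
# html_soup = fetch("https://www.goodreads.com/quotes")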