def run(self):
    """Walk the RSS feed of deviations and download each image deviation.

    This is a generator that delegates with ``yield from`` (presumably
    driven as a coroutine by an event loop -- confirm against the caller).

    For every entry produced by ``scrape_deviations_list``:

    * entries rated ``"adult"`` are skipped with a warning;
    * the deviation page is fetched;
    * ``"image"`` deviations are downloaded, ``"document"`` deviations and
      entries without a medium only produce a warning.

    Raises:
        ScrapingException: if an entry has a non-empty medium other than
            ``"image"`` or ``"document"``.
    """
    check_or_make_dir(self.deviant_dir)

    # The RSS feed is the index of deviations to visit.
    feed_xml = yield from self.fetch_rss()
    deviations = self.scrape_deviations_list(feed_xml, self._rss_namespaces)

    # Deviations are handled one at a time, in the order they are listed.
    for deviation in deviations:
        self.info(deviation.url)

        if deviation.rating == "adult":
            # TODO: Handle mature deviations; they are ignored for now.
            self.warn("Ignoring mature deviation [%s]" % deviation.url)
            continue

        # The page is fetched for every non-adult entry, whatever its medium.
        page_html = yield from self.fetch_deviation_page(deviation.url)

        medium = deviation.medium
        if not medium:
            self.warn("Media type not specified %s" % deviation.url)
        elif medium == "document":
            # TODO: Handle text deviation
            self.warn("Ignoring text deviation %s" % deviation.url)
        elif medium == "image":
            image_url = self.scrape_deviation_image_url(
                deviation.guid, page_html
            )
            yield from self.download_deviation(
                image_url, filename_from_url(image_url)
            )
        else:
            raise ScrapingException(
                "Unknown medium type %s for %s" % (medium, deviation.url)
            )

        # Short pause after each processed deviation (skipped for adult ones).
        yield from sleep(0.001)

    self.info("Done")
def run(self):
    """Download every image of every project into ``self.artist_dir``.

    This is a generator that delegates with ``yield from`` (presumably
    driven as a coroutine by an event loop -- confirm against the caller).

    The target directory is created if needed, the project list is
    fetched, and for each project its title is logged and each of its image
    URLs is downloaded to ``<artist_dir>/<file name taken from the URL>``,
    passing ``self.overwrite`` through to ``self.download``. A 0.1 second
    pause follows each download.
    """
    # Imported locally and used by its qualified name so the pause below is
    # a real coroutine no matter which ``sleep`` the module itself imports.
    import asyncio

    check_or_make_dir(self.artist_dir)

    projects = yield from self.fetch_projects()
    for project in projects:
        self.info(project.title)

        image_urls = yield from self.fetch_project_image_url(project)
        for image_url in image_urls:
            filename = filename_from_url(image_url)
            filepath = os.path.join(self.artist_dir, filename)
            yield from self.download(image_url, filepath, self.overwrite)

            # The pause has to be delegated to with ``yield from``. Calling
            # a coroutine function without it only creates an object and
            # never waits, and a blocking sleep would stall the driver.
            yield from asyncio.sleep(0.1)
def run(self):
    """Download the images of every post, walking the listing page by page.

    This is a generator that delegates with ``yield from`` (presumably
    driven as a coroutine by an event loop -- confirm against the caller).

    Pages are requested starting at page 1. For each page the posts are
    scraped, every post is fetched, and every image found in the post is
    downloaded under the file name taken from its URL. The walk ends at the
    first page that yields no posts.
    """
    check_or_make_dir(self.tumblr_dir)

    page_index = 0
    while True:
        page_index += 1
        listing_html = yield from self.fetch_page(page_index)
        entries = self.scrape_posts(listing_html)

        if not entries:
            # No posts were found. We've probably reached an out of range
            # page, so there is nothing left to do.
            return

        for entry in entries:
            entry_html = yield from self.fetch_post(entry.url)
            for image_url in self.scrape_images(entry_html):
                yield from self.download_image(
                    image_url, filename_from_url(image_url)
                )
def run(self):
    """Download the original image of every project into ``self.project_dir``.

    This is a generator that delegates with ``yield from`` (presumably
    driven as a coroutine by an event loop -- confirm against the caller).

    For each fetched project the original image URL is logged and
    downloaded to ``<project_dir>/<file name taken from the URL>``, passing
    ``self.overwrite`` through to ``self.download``. When that file name
    contains no ``"."``, the extension is guessed from the downloaded file
    and the file is renamed to include it.
    """
    check_or_make_dir(self.project_dir)

    projects = yield from self.fetch_projects()
    for project in projects:
        image_url = project.original_image
        self.info(image_url)

        filename = filename_from_url(image_url)
        file_path = os.path.join(self.project_dir, filename)
        yield from self.download(image_url, file_path, self.overwrite)

        # Can only guess file extension after file is done downloading
        if "." not in filename:
            # NOTE(review): assumes guess_img_ext always returns a string;
            # a None result would make the concatenation below raise.
            file_ext = self.guess_img_ext(file_path)
            move(file_path, file_path + "." + file_ext)

        # Short pause after each project.
        yield from sleep(0.001)

    self.info("Done")