class ReviewSpider(scrapy.Spider):
    """Scrape Amazon-style product reviews into reviews.csv, following the
    pagination bar until no next page remains."""

    name = 'reviewSpider'
    # One start URL per configured site (second element of each site entry).
    start_urls = [site[1] for site in sites.get()]

    def parse(self, response):
        """Append every review on this page to reviews.csv, then follow
        the next pagination link (if any)."""
        # Use a context manager so the handle is closed after each page —
        # the original left the file open, leaking a descriptor per call.
        # newline='' is the csv-module-documented way to open the file.
        with open('reviews.csv', 'a', newline='') as outfile:
            csvwriter = csv.writer(outfile)

            for review in response.css('[data-hook=review]'):
                # Selector .get() returns None on a miss; guard before
                # .encode() to avoid AttributeError on partial markup.
                def enc(css_sel):
                    return (review.css(css_sel).get() or '').encode('utf-8', 'ignore')

                csvwriter.writerow([
                    enc('.a-profile .a-profile-name::text'),
                    review.css('.a-profile::attr("href")').get(),
                    enc('[data-hook=review-title] .cr-original-review-content::text'),
                    enc('[data-hook=review-title]::attr("href")'),
                    enc('[data-hook=review-body] .cr-original-review-content::text'),
                ])

        next_page = response.css(
            '[data-hook=pagination-bar] li.a-last a::attr("href")').get()
        if next_page is not None:
            print("Getting next_page!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            yield response.follow(next_page, self.parse)
def leech(url, session, filename=None, args=None):
    """Resolve a site handler for *url*, extract the story, and return the
    result of ebook.generate_epub (cover options come from leech.json)."""
    # sites.get both picks the handler class and normalises the URL.
    site, url = sites.get(url)
    if not site:
        raise Exception("No site handler found")
    logger.info("Handler: %s (%s)", site, url)

    with open('leech.json') as fh:
        settings = json.load(fh)

    handler = site(session, args=args)
    # Log in only when credentials for this handler exist in the config.
    credentials = settings.get('logins', {}).get(site.__name__, False)
    if credentials:
        handler.login(credentials)

    story = handler.extract(url)
    if not story:
        raise Exception("Couldn't extract story")

    return ebook.generate_epub(
        story, filename, cover_options=settings.get('cover', {}))
def leech(url, session, filename=None, args=None):
    """Download the story at *url* and hand-assemble an EPUB (cover, front
    matter, chapters, stylesheet); return the filename actually written."""
    # we have: a page, which could be absolutely any part of a story, or not a story at all
    # check a bunch of things which are completely ff.n specific, to get text from it
    site, url = sites.get(url)
    if not site:
        raise Exception("No site handler found")
    # Use the module logger instead of a bare print(), matching the logging
    # style of the other leech/download entry points in this file.
    logger.info("Handler: %s (%s)", site, url)
    handler = site(session, args=args)

    with open('leech.json') as store_file:
        store = json.load(store_file)
    login = store.get('logins', {}).get(site.__name__, False)
    if login:
        handler.login(login)

    story = handler.extract(url)
    if not story:
        raise Exception("Couldn't extract story")

    # Earliest/latest chapter dates become the started/updated metadata.
    dates = list(story.dates())
    metadata = {
        'title': story.title,
        'author': story.author,
        'unique_id': url,
        'started': min(dates),
        'updated': max(dates),
    }

    # The cover is static, and the only change comes from the image which we generate
    html = [('Cover', 'cover.html', cover_template)]
    cover_image = ('images/cover.png',
                   cover.make_cover(story.title, story.author).read(),
                   'image/png')
    html.append(('Front Matter', 'frontmatter.html',
                 frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
    html.extend(chapter_html(story))

    # Stylesheet is fetched through the (possibly caching) session.
    css = ('Styles/base.css',
           session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text,
           'text/css')

    filename = filename or story.title + '.epub'
    filename = epub.make_epub(filename, html, metadata,
                              extra_files=(css, cover_image))
    return filename
def download(url, site_options, cache, verbose, **other_flags):
    """Downloads a story and saves it on disk as an EPUB ebook."""
    configure_logging(verbose)
    session = create_session(cache)
    site, url = sites.get(url)
    options, login = create_options(site, site_options, other_flags)
    story = open_story(site, url, session, login, options)
    # Pass the merged options through so output settings take effect — they
    # were previously computed but dropped at this call site (the sibling
    # download() variant in this file passes them).
    filename = ebook.generate_epub(story, options)
    # Lazy %-args: the message is only formatted when INFO is enabled.
    logger.info("File created: %s", filename)
def download(url, site_options, cache, verbose, **other_flags):
    """Downloads a story and saves it on disk as an EPUB ebook."""
    configure_logging(verbose)
    session = create_session(cache)
    site, url = sites.get(url)
    options, login = create_options(site, site_options, other_flags)
    story = open_story(site, url, session, login, options)
    filename = ebook.generate_epub(story, options)
    # Lazy %-args: the message is only formatted when INFO is enabled.
    logger.info("File created: %s", filename)
def leech(url, session, filename=None, args=None):
    # Download the story at *url*, build an EPUB by hand (cover, front
    # matter, chapters, stylesheet) and return the filename written.
    # we have: a page, which could be absolutely any part of a story, or not a story at all
    # check a bunch of things which are completely ff.n specific, to get text from it
    # NOTE(review): here sites.get returns only a handler class, while the
    # other leech variants in this file unpack (site, url) — confirm which
    # sites.get contract is current.
    site = sites.get(url)
    if not site:
        raise Exception("No site handler found")
    handler = site(session, args=args)
    # Per-handler login credentials live under 'logins' in leech.json,
    # keyed by the handler class name.
    with open('leech.json') as store_file:
        store = json.load(store_file)
    login = store.get('logins', {}).get(site.__name__, False)
    if login:
        handler.login(login)
    story = handler.extract(url)
    if not story:
        raise Exception("Couldn't extract story")
    # Earliest/latest chapter dates become the started/updated metadata.
    dates = list(story.dates())
    metadata = {
        'title': story.title,
        'author': story.author,
        'unique_id': url,
        'started': min(dates),
        'updated': max(dates),
    }
    # The cover is static, and the only change comes from the image which we generate
    html = [('Cover', 'cover.html', cover_template)]
    cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png')
    html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
    html.extend(chapter_html(story))
    # Stylesheet is fetched through the (possibly caching) session.
    css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
    # Fall back to "<title>.epub" when no explicit filename was given.
    filename = filename or story.title + '.epub'
    # print([c[0:-1] for c in html])
    filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image))
    return filename
def snapin_site_choice(ident, choices):
    """Render the per-snapin site dropdown and return the list of sites to
    restrict to, or None when all sites apply / no dropdown is shown."""
    saved = config.user.load_file("sidebar_sites", {})
    site = saved.get(ident, "")
    only_sites = [site] if site != "" else None

    site_choices = config.get_event_console_site_choices()
    # With at most one site there is nothing to pick; render no dropdown.
    if len(site_choices) <= 1:
        return None

    site_choices = [("", _("All sites"))] + site_choices
    onchange = "set_snapin_site(event, %s, this)" % json.dumps(ident)
    html.dropdown("site", site_choices, deflt=site, onchange=onchange)
    return only_sites