Example #1
0
class ReviewSpider(scrapy.Spider):
    """Scrape product reviews from each start URL and append them to reviews.csv.

    Follows the pagination "next" link until none is present.
    """

    name = 'reviewSpider'
    # sites.get() appears to yield (something, url) pairs -- TODO confirm
    start_urls = [site[1] for site in sites.get()]

    def parse(self, response):
        """Write one CSV row per review on this page, then follow pagination.

        Row layout: reviewer name, profile link, review title, review link,
        review body.
        """
        # Open via a context manager so the handle is closed even on error
        # (the old code leaked one handle per parse() call).  newline='' is
        # what the csv module requires; encoding='utf-8' lets us write str
        # directly -- the previous .encode() calls produced bytes, which
        # csv renders as literal "b'...'" strings in Python 3.
        with open('reviews.csv', 'a', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)
            for review in response.css('[data-hook=review]'):
                csvwriter.writerow([
                    # default='' guards against AttributeError/None when a
                    # selector matches nothing.
                    review.css('.a-profile .a-profile-name::text').get(default=''),
                    review.css('.a-profile::attr("href")').get(default=''),
                    review.css(
                        '[data-hook=review-title] .cr-original-review-content::text'
                    ).get(default=''),
                    review.css(
                        '[data-hook=review-title]::attr("href")').get(default=''),
                    review.css(
                        '[data-hook=review-body] .cr-original-review-content::text'
                    ).get(default=''),
                ])

        next_page = response.css(
            '[data-hook=pagination-bar] li.a-last a::attr("href")').get()

        if next_page is not None:
            print("Getting next_page!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            yield response.follow(next_page, self.parse)
Example #2
0
def leech(url, session, filename=None, args=None):
    """Fetch the story behind *url* and build an EPUB from it.

    Returns whatever ebook.generate_epub produces (the output filename).
    Raises Exception when no site handler matches or extraction fails.
    """
    # We might be handed any page of a story -- or not a story at all.
    # The site registry decides whether it recognises the URL (and may
    # normalize it in the process).
    site, url = sites.get(url)
    if not site:
        raise Exception("No site handler found")

    logger.info("Handler: %s (%s)", site, url)

    handler = site(session, args=args)

    # leech.json carries optional per-site credentials plus cover settings.
    with open('leech.json') as fh:
        settings = json.load(fh)

    credentials = settings.get('logins', {}).get(site.__name__, False)
    if credentials:
        handler.login(credentials)

    cover_options = settings.get('cover', {})

    story = handler.extract(url)
    if not story:
        raise Exception("Couldn't extract story")

    return ebook.generate_epub(story, filename, cover_options=cover_options)
Example #3
0
def leech(url, session, filename=None, args=None):
    """Download the story behind *url* and assemble it into an EPUB file.

    Returns the filename produced by epub.make_epub.  Raises Exception
    when no site handler matches or the story cannot be extracted.
    """
    # We might be handed any page of a story -- or not a story at all.
    # Let the site registry decide whether it can handle the URL.
    site, url = sites.get(url)
    if not site:
        raise Exception("No site handler found")

    print("Handler", site, url)

    handler = site(session, args=args)

    # Optional per-site login credentials live in leech.json.
    with open('leech.json') as fh:
        login = json.load(fh).get('logins', {}).get(site.__name__, False)
        if login:
            handler.login(login)

    story = handler.extract(url)
    if not story:
        raise Exception("Couldn't extract story")

    chapter_dates = list(story.dates())
    metadata = {
        'title': story.title,
        'author': story.author,
        'unique_id': url,
        'started': min(chapter_dates),
        'updated': max(chapter_dates),
    }

    # The cover page markup is static; only the generated image varies.
    cover_image = (
        'images/cover.png',
        cover.make_cover(story.title, story.author).read(),
        'image/png',
    )

    html = [('Cover', 'cover.html', cover_template)]
    frontmatter = frontmatter_template.format(
        now=datetime.datetime.now(), **metadata)
    html.append(('Front Matter', 'frontmatter.html', frontmatter))
    html.extend(chapter_html(story))

    # The base stylesheet is fetched live from the epub-css-starter-kit repo.
    css = (
        'Styles/base.css',
        session.get(
            'https://raw.githubusercontent.com/mattharrison/'
            'epub-css-starter-kit/master/css/base.css'
        ).text,
        'text/css',
    )

    if not filename:
        filename = story.title + '.epub'

    return epub.make_epub(filename, html, metadata,
                          extra_files=(css, cover_image))
Example #4
0
def download(url, site_options, cache, verbose, **other_flags):
    """Downloads a story and saves it on disk as an epub ebook."""
    configure_logging(verbose)
    session = create_session(cache)

    site, url = sites.get(url)
    options, login = create_options(site, site_options, other_flags)
    story = open_story(site, url, session, login, options)

    filename = ebook.generate_epub(story)
    # Lazy %-args defer string formatting to the logging framework
    # (consistent with the logger.info calls elsewhere in this project).
    logger.info("File created: %s", filename)
示例#5
0
File: leech.py  Project: kemayo/leech
def download(url, site_options, cache, verbose, **other_flags):
    """Downloads a story and saves it on disk as an epub ebook."""
    configure_logging(verbose)
    session = create_session(cache)

    site, url = sites.get(url)
    options, login = create_options(site, site_options, other_flags)
    story = open_story(site, url, session, login, options)

    filename = ebook.generate_epub(story, options)
    # Lazy %-args defer string formatting to the logging framework
    # (consistent with the logger.info calls elsewhere in this project).
    logger.info("File created: %s", filename)
Example #6
0
def leech(url, session, filename=None, args=None):
    """Download the story behind *url* and assemble it into an EPUB file.

    Returns the filename produced by epub.make_epub.  Raises Exception
    when no site handler matches or the story cannot be extracted.
    """
    # We might be handed any page of a story -- or not a story at all.
    # Ask the site registry whether it recognises the URL.
    site = sites.get(url)
    if not site:
        raise Exception("No site handler found")

    handler = site(session, args=args)

    # Optional per-site login credentials come from leech.json.
    with open('leech.json') as fh:
        login = json.load(fh).get('logins', {}).get(site.__name__, False)
        if login:
            handler.login(login)

    story = handler.extract(url)
    if not story:
        raise Exception("Couldn't extract story")

    chapter_dates = list(story.dates())
    metadata = {
        'title': story.title,
        'author': story.author,
        'unique_id': url,
        'started': min(chapter_dates),
        'updated': max(chapter_dates),
    }

    # The cover page markup is static; only the generated image varies.
    cover_image = (
        'images/cover.png',
        cover.make_cover(story.title, story.author).read(),
        'image/png',
    )

    html = [('Cover', 'cover.html', cover_template)]
    html.append((
        'Front Matter',
        'frontmatter.html',
        frontmatter_template.format(now=datetime.datetime.now(), **metadata),
    ))
    html.extend(chapter_html(story))

    # The base stylesheet is fetched live from the epub-css-starter-kit repo.
    css = (
        'Styles/base.css',
        session.get(
            'https://raw.githubusercontent.com/mattharrison/'
            'epub-css-starter-kit/master/css/base.css'
        ).text,
        'text/css',
    )

    if not filename:
        filename = story.title + '.epub'

    return epub.make_epub(filename, html, metadata,
                          extra_files=(css, cover_image))
Example #7
0
def snapin_site_choice(ident, choices):
    """Render the per-snapin site dropdown and return the site filter.

    Returns None when all sites are selected (or when fewer than two sites
    exist, in which case no dropdown is rendered at all); otherwise a
    single-element list holding the stored site id.
    """
    stored_sites = config.user.load_file("sidebar_sites", {})
    site = stored_sites.get(ident, "")
    # Empty string means "all sites" -> no filtering.
    only_sites = None if site == "" else [site]

    available = config.get_event_console_site_choices()
    # With at most one site there is nothing worth choosing.
    if len(available) <= 1:
        return None

    dropdown_choices = [("", _("All sites"))] + available
    onchange = "set_snapin_site(event, %s, this)" % json.dumps(ident)
    html.dropdown("site", dropdown_choices, deflt=site, onchange=onchange)

    return only_sites