예제 #1
0
def main():
    """ Run the conversion: configure, build the job queue, do all jobs.

    Returns 1 if the configuration file cannot be read, 0 otherwise.

    """

    try:
        config()
    except configparser.Error as what:
        error("Error in configuration file: %s", str(what))
        return 1

    Logger.set_log_level(options.verbose)

    # no types requested means build everything
    if not options.types:
        options.types = ['all']
    options.types = CommonCode.add_dependencies(
        options.types, DEPENDENCIES, BUILD_ORDER)
    debug("Building types: %s" % ' '.join(options.types))

    ParserFactory.load_parsers()
    WriterFactory.load_writers()
    PackagerFactory.load_packagers()

    if not options.is_job_queue:
        options.dc = get_dc(options.url)

        # kindle jobs are converted from the epub of the same flavor, so
        # their input is the output file of the matching epub job
        kindle_sources = {
            'kindle.images': 'epub.images',
            'kindle.noimages': 'epub.noimages',
        }

        job_queue = []
        output_files = {}
        for type_ in options.types:
            job = CommonCode.Job(type_)
            job.url = options.url
            job.ebook = options.ebook
            job.dc = options.dc
            job.outputdir = options.outputdir
            job.outputfile = (options.outputfile
                              or make_output_filename(type_, options.dc))
            output_files[type_] = job.outputfile

            source_type = kindle_sources.get(job.type)
            if source_type is not None:
                job.url = os.path.join(job.outputdir,
                                       output_files[source_type])

            job_queue.append(job)
    else:
        # NOTE: the job queue is unpickled from stdin; unpickling runs
        # arbitrary code, so stdin must come from a trusted producer.
        job_queue = cPickle.load(sys.stdin.buffer)  # read bytes

    for queued_job in job_queue:
        do_job(queued_job)

    packager = PackagerFactory.create(options.packager, 'push')
    if not packager:
        return 0

    # HACK: the WWers ever only convert one ebook at a time
    first_job = job_queue[0]
    first_job.outputfile = '%d-final.zip' % options.dc.project_gutenberg_id
    packager.package(first_job)

    return 0
예제 #2
0
def _spider_job_sources(job):
    """ Crawl the sources starting at job.url and attach the spider to job.

    Configures a spider from the global options, parses recursively from
    job.url, elects a coverpage, then stores the redirected url, the base
    url and the spider itself on the job.

    """

    spider = Spider.Spider()
    dirpath = os.path.dirname(job.url)  # platform native path
    # use for parser only: default to everything below the start directory
    spider.include_urls += (options.include_urls
                            or [parsers.webify_url(dirpath) + '/*'])

    spider.include_mediatypes += options.include_mediatypes
    if job.subtype == '.images' or job.type == 'rst.gen':
        spider.include_mediatypes.append('image/*')

    spider.exclude_urls += options.exclude_urls
    spider.exclude_mediatypes += options.exclude_mediatypes

    # a falsy max_depth option means no depth limit
    spider.max_depth = options.max_depth or six.MAXSIZE

    # each rewrite option has the form 'from_url>to_url'
    for rewrite in options.rewrite:
        from_url, to_url = rewrite.split('>')
        spider.add_redirection(from_url, to_url)

    attribs = parsers.ParserAttributes()
    attribs.url = parsers.webify_url(job.url)
    attribs.id = 'start'

    if options.input_mediatype:
        attribs.orig_mediatype = attribs.HeaderElement.from_str(
            options.input_mediatype)

    spider.recursive_parse(attribs)
    elect_coverpage(spider, job.url)
    job.url = spider.redirect(job.url)
    job.base_url = job.url
    job.spider = spider


def do_job(job):
    """ Do one job.

    Spiders the sources if the job has an url, then builds, optionally
    validates, and packages the output for job.type.

    SkipOutputFormat is logged as a warning and any other Exception is
    logged with its traceback; neither propagates to the caller.  The
    per-job log handler, if one was opened, is closed on every exit path.

    """

    log_handler = None
    Logger.ebook = job.ebook
    if job.logfile:
        log_handler = open_log(
            os.path.join(os.path.abspath(job.outputdir), job.logfile))

    try:
        debug('=== Building %s ===' % job.type)
        start_time = datetime.datetime.now()
        try:
            if job.url:
                _spider_job_sources(job)

            writer = WriterFactory.create(job.maintype)
            writer.build(job)

            if options.validate:
                writer.validate(job)

            packager = PackagerFactory.create(options.packager, job.type)
            if packager:
                packager.package(job)

            if job.type == 'html.images':
                # FIXME: hack for push packager
                options.html_images_list = list(job.spider.aux_file_iter())

        except SkipOutputFormat as what:
            warning("%s" % what)

        except Exception as what:
            # one failed job must not abort the jobs queued after it
            exception("%s" % what)

        end_time = datetime.datetime.now()
        info(' %s made in %s' % (job.type, end_time - start_time))

    finally:
        # also reached on KeyboardInterrupt / SystemExit, which the
        # handlers above do not catch, so the log handler never leaks
        if log_handler:
            close_log(log_handler)