Пример #1
0
    def getUserConfig(self,user,url,fileformat):
        # Build a Configuration for `url`/`fileformat`: the sections come from
        # adapters.getConfigSectionsFor(url), the bundled
        # 'fanficfare/defaults.ini' is read first, and the text stored in a
        # UserConfig record is then read on top of it.  The populated
        # Configuration is returned.

        configuration = Configuration(adapters.getConfigSectionsFor(url),fileformat)

        logging.debug('reading defaults.ini config file')
        configuration.read('fanficfare/defaults.ini')

        ## Pull user's config record.
        # NOTE(review): the next line looks redacted/truncated in this copy
        # (`'user ='******`): the rest of the query and the statement that
        # binds `uconfig` are missing, so this method does not parse as
        # shown.  Restore the original lines before relying on it.
        l = UserConfig.all().filter('user ='******'reading config from UserConfig(%s)'%uconfig.config)
            configuration.readfp(StringIO(uconfig.config))

        return configuration
 def setup_method(self):
     """Store the story/chapter URLs, a lightweight EPUB configuration for
     chireads.com, and a ``chiread`` instance built from them."""
     story_url = 'https://chireads.com/category/translatedtales/some-story/'
     self.url = story_url
     self.chapter_url = 'https://chireads.com/translatedtales/chapitre-1-some-title/2020/02/08/'

     config = Configuration(["chireads.com"], "EPUB", lightweight=True)
     self.configuration = config
     self.chireads = chiread(config, story_url)
Пример #3
0
 def setup_method(self, adapter, url, sections, path_adapter,
                  adapter_fixture):
     """Store the supplied URL and fixture data and build the adapter.

     ``adapter`` is called with a lightweight EPUB ``Configuration`` created
     from ``sections`` and with ``url``; the result is kept on
     ``self.adapter``.
     """
     self.url = url
     config = Configuration(sections, "EPUB", lightweight=True)
     self.configuration = config
     self.adapter = adapter(config, url)
     self.path_adapter, self.fixture = path_adapter, adapter_fixture
Пример #4
0
def get_urls_from_text(data,configuration=None,normalize=False):
    """
    Find story URLs in free text.

    Every http(s) URL found in ``data`` is offered to
    ``adapters.getAdapter``; URLs that an adapter accepts are grouped by the
    adapter's ``storyUrl`` metadata value, in order of first appearance.

    :param data: text (or utf8 encoded bytes) to scan.
    :param configuration: Configuration handed to the adapters; a lightweight
        ``test1.com`` EPUB configuration is created when omitted.
    :param normalize: when true, return the ``storyUrl`` values; otherwise
        return, for each story, the longest URL that was found for it.
    :return: list of URLs.
    """
    urls = collections.OrderedDict()
    try:
        data = unicode(data)
    except UnicodeDecodeError:
        data=data.decode('utf8') ## for when called outside calibre.

    if not configuration:
        configuration = Configuration(["test1.com"],"EPUB",lightweight=True)

    # Raw string: the pattern contains '\(' and '\)', which are invalid
    # escape sequences in a normal string literal.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    for href in re.findall(url_pattern, data):
        # this (should) catch normal story links, some javascript
        # 'are you old enough' links, and 'Report This' links.
        if 'story.php' in href:
            m = re.search(r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",href)
            if m is not None:
                href = form_url(href,m.group('sid'))
        try:
            href = href.replace('&index=1','')
            adapter = adapters.getAdapter(configuration,href)
            story_url = adapter.story.getMetadata('storyUrl')
            urls.setdefault(story_url, []).append(href)
        except Exception:
            # Best effort: a URL that no adapter accepts is simply skipped.
            # 'Exception' (not a bare except) so that KeyboardInterrupt and
            # SystemExit still propagate.
            pass

    if normalize:
        # list() so callers get a real list on Python 3 as well.
        return list(urls)
    # Simply return the longest URL with the assumption that it contains the
    # most user readable metadata, if not normalized
    return [max(hrefs, key=len) for hrefs in urls.values()]
Пример #5
0
 def get_config(self):
     """
     Return the fanficfare configuration, building it on first use.

     The configuration is stored on C{self.config}; later calls return
     that same object.

     @return: the configuration, possibly cached
     @rtype: L{fanficfare.configurable.Configuration}
     """
     if self.config is not None:
         return self.config

     self.config = Configuration(["zimhtml"], "epub")
     try:
         self.config.add_section('overrides')
     except configparser.DuplicateSectionError:
         # the section generally already exists (from defaults.ini)
         pass
     if self.include_images:
         self.config.set("overrides", "include_images", "true")
     return self.config
Пример #6
0
def fetch_metadata(url: str, chapters=True) -> dict:
    """Return the metadata of the story at ``url``.

    An adapter is looked up for ``url`` with an 'epub' Configuration and the
    result of ``getStoryMetadataOnly().getAllMetadata()`` is returned.

    :param url: story URL; ``adapters`` decides which site adapter handles it.
    :param chapters: when true, a ``'zchapters'`` entry is added holding
        ``(number, chapter)`` tuples, numbered from 1, for everything yielded
        by ``adapter.get_chapters()``.
    :return: the metadata mapping (not bytes: the mapping is item-assigned
        below and returned as is).
    """
    configuration = Configuration(adapters.getConfigSectionsFor(url), 'epub')
    adapter = adapters.getAdapter(configuration, url)
    # NOTE(review): presumably lets age-restricted stories be read without an
    # interactive confirmation -- confirm against the adapter implementation.
    adapter.is_adult = True
    metadata = adapter.getStoryMetadataOnly().getAllMetadata()

    if chapters:
        metadata['zchapters'] = list(enumerate(adapter.get_chapters(), start=1))

    return metadata
Пример #7
0
 def __init__(self, url):
     """Create a target from a URL string or from another ``Target``.

     The site abbreviation and story id are taken from the metadata of the
     adapter found for the URL; ``"unknown"`` and ``self._id_from_url(url)``
     are the respective fallbacks when the metadata value is ``None``.

     Raises ``NotAValidTarget`` when no adapter knows the URL's site.
     """
     url = url.url if isinstance(url, Target) else url
     self.url = url

     dummy_config = Configuration(["test1.com"], "HTML", lightweight=True)
     try:
         adapter = adapters.getAdapter(dummy_config, url)
     except UnknownSite:
         raise NotAValidTarget(url)

     story = adapter.story
     abbrev = story.getMetadata("siteabbrev")
     self.abbrev = "unknown" if abbrev is None else abbrev

     self.id = story.getMetadata("storyId")
     if self.id is None:
         self.id = self._id_from_url(url)
Пример #8
0
def from_url(url: HttpUrl):
    """Pick the generator to use for ``url``.

    A generator registered in ``mapping`` for ``url.host`` takes precedence.
    Otherwise ``fanficfare_generator`` is returned if FanFicFare knows the
    site, i.e. building a Configuration for the URL does not raise
    ``exceptions.UnknownSite``.

    :raises Exception: when neither source can handle the URL.
    """
    try:
        # Only used as a probe for site support; the instance is not needed.
        Configuration(adapters.getConfigSectionsFor(str(url)), 'epub')
    except exceptions.UnknownSite:
        fff_works = False
    else:
        fff_works = True

    generator = mapping.get(url.host)
    if generator:
        return generator
    if fff_works:
        return fanficfare_generator
    raise Exception(f"no generator available for URL: {url}")
Пример #9
0
def get_urls_from_page(url,configuration=None,normalize=False):
    """
    Download the page at ``url`` and return the story URLs found in it.

    The page is fetched through the site adapter when one accepts the URL
    (``anyurl=True``), otherwise through a plain cookie/gzip aware opener.
    The downloaded data is passed on to ``get_urls_from_html``.
    """
    if not configuration:
        configuration = Configuration(["test1.com"],"EPUB",lightweight=True)

    try:
        adapter = adapters.getAdapter(configuration,url,anyurl=True)

        # special stuff to log into archiveofourown.org, if possible.
        # Unlike most that show the links to 'adult' stories, but protect
        # them, AO3 doesn't even show them if not logged in.  Only works
        # with saved user/pass--not going to prompt for list.
        if 'archiveofourown.org' in url and adapter.getConfig("username"):
            if not adapter.getConfig("is_adult"):
                adult_suffix = ""
            elif '?' in url:
                adult_suffix = "&view_adult=true"
            else:
                adult_suffix = "?view_adult=true"
            # just to get an authenticity_token.
            token_page = adapter._fetchUrl(url+adult_suffix)
            # login the session; the list page itself is fetched below
            # with the logged in session.
            adapter.performLogin(url,token_page)

        if 'fimfiction.net' in url and adapter.getConfig("is_adult"):
            adapter._fetchUrl(url)
            adapter.set_adult_cookie()

        # this way it uses User-Agent or other special settings.  Only AO3
        # is doing login.
        page = adapter._fetchUrl(url,usecache=False)
    except UnknownSite:
        # no adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
        page = opener.open(url).read()

    # kludge because I don't see it on enough sites to be worth generalizing yet.
    restrictsearch = ('div',{'id':'mainpage'}) if 'scarvesandcoffee.net' in url else None

    return get_urls_from_html(page,url,configuration,normalize,restrictsearch)
Пример #10
0
 def getNormalStoryURLSite(url):
     """Return ``(url, site, storyid)`` for a story URL, or ``None``.

     The values come from the adapter that ``adapters.getAdapter`` returns
     for ``url``: its ``url`` attribute, ``getSiteDomain()`` and its
     ``storyId`` metadata.  ``None`` is returned when no adapter can be
     created, which is treated as "bad url".

     A single lightweight Configuration is created on first use and kept on
     ``adapters.getNormalStoryURL`` so it is shared between calls.
     """
     # Was misspelled 'gerNormalStoryURL', which raised AttributeError on
     # every call before any URL was looked at.
     if not adapters.getNormalStoryURL.__dummyconfig:
         adapters.getNormalStoryURL.__dummyconfig = Configuration(
             ["test1.com"], "EPUB", lightweight=True)
     # pulling up an adapter is pretty low over-head.  If
     # it fails, it's a bad url.
     try:
         adapter = adapters.getAdapter(
             adapters.getNormalStoryURL.__dummyconfig, url)
         return (adapter.url,
                 adapter.getSiteDomain(),
                 adapter.story.getMetadata('storyId'))
     except Exception:
         # 'Exception' rather than a bare except so KeyboardInterrupt and
         # SystemExit are not reported as a bad URL.
         return None
Пример #11
0
def get_fff_config(url,fileform="epub",personalini=None):
    """
    Build the Configuration for ``url`` and output format ``fileform``.

    The config sections are looked up for the URL, with ``['unknown']`` as
    the fallback when that lookup fails.  The bundled ``plugin-defaults.ini``
    is read first and ``personalini`` second; ``get_fff_personalini()``
    supplies the latter when it is not given.
    """
    if not personalini:
        personalini = get_fff_personalini()

    config_sections = ['unknown']
    try:
        config_sections = adapters.getConfigSectionsFor(url)
    except Exception as e:
        logger.debug("Failed trying to get ini config for url(%s): %s, using section %s instead"%(url,e,config_sections))

    configuration = Configuration(config_sections,fileform)
    # defaults first, then the personal ini read on top of them.
    for ini_text in (get_resources("plugin-defaults.ini"), personalini):
        configuration.readfp(StringIO(ensure_text(ini_text)))

    return configuration
Пример #12
0
    def getUserConfig(self,user,url,fileformat):
        # Build a Configuration for `url`/`fileformat`: the sections come from
        # adapters.getConfigSectionsFor(url), the bundled
        # 'fanficfare/defaults.ini' is read first, and the text stored in a
        # UserConfig record is then read on top of it.  The populated
        # Configuration is returned.

        configuration = Configuration(adapters.getConfigSectionsFor(url),fileformat)

        logging.debug('reading defaults.ini config file')
        configuration.read('fanficfare/defaults.ini')

        ## Pull user's config record.
        # NOTE(review): the next line looks redacted/truncated in this copy
        # (`'user ='******`): the rest of the query and the statement that
        # binds `uconfig` are missing, so this method does not parse as
        # shown.  Restore the original lines before relying on it.
        l = UserConfig.all().filter('user ='******'reading config from UserConfig(%s)'%uconfig.config)
            configuration.readfp(StringIO(uconfig.config))

        return configuration
Пример #13
0
def get_urls_from_html(data,url=None,configuration=None,normalize=False,restrictsearch=None):
    """
    Find story URLs among the links of an HTML document.

    Each ``<a href>`` is resolved with ``form_url`` against ``url`` and
    offered to ``adapters.getAdapter``; links that an adapter accepts are
    grouped by the adapter's ``storyUrl`` metadata value, in order of first
    appearance.

    :param data: HTML source, parsed with BeautifulSoup/html5lib.
    :param url: URL the document came from, used to resolve the links.
    :param configuration: Configuration handed to the adapters; a lightweight
        ``test1.com`` EPUB configuration is created when omitted.
    :param normalize: when true, return the ``storyUrl`` values; otherwise
        return, for each story, the longest link that was found for it.
    :param restrictsearch: optional argument tuple for ``soup.find``; only
        links inside the element it returns are considered.
    :return: list of URLs.
    """
    urls = collections.OrderedDict()

    if not configuration:
        configuration = Configuration(["test1.com"],"EPUB",lightweight=True)

    soup = BeautifulSoup(data,"html5lib")
    if restrictsearch:
        soup = soup.find(*restrictsearch)

    for a in soup.findAll('a'):
        if not a.has_attr('href'):
            continue
        href = form_url(url,a['href'])
        # this (should) catch normal story links, some javascript
        # 'are you old enough' links, and 'Report This' links.
        if 'story.php' in a['href']:
            m = re.search(r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)",a['href'])
            if m is not None:
                href = form_url(a['href'] if '//' in a['href'] else url,
                                m.group('sid'))

        try:
            href = href.replace('&index=1','')
            adapter = adapters.getAdapter(configuration,href)
            story_url = adapter.story.getMetadata('storyUrl')
            urls.setdefault(story_url, []).append(href)
        except Exception:
            # Best effort: most links on a page are not story links, so a
            # link that no adapter accepts is simply skipped.
            pass

    if normalize:
        return list(urls)
    # Return the longest link per story with the assumption that it
    # contains the most user readable metadata, same as get_urls_from_text.
    return [max(hrefs, key=len) for hrefs in urls.values()]
Пример #14
0
def get_configuration(url,
                      passed_defaultsini,
                      passed_personalini,
                      options,
                      chaptercount=None,
                      output_filename=None):
    """
    Build the Configuration used to download ``url`` from the command line.

    INI sources are applied in this order: ``passed_defaultsini`` (or, when
    it is empty, the ``defaults.ini`` files), ``passed_personalini``, the
    ``personal.ini`` files and finally ``options.configfile``.  Settings
    derived from ``options`` are then written to the ``overrides`` section.

    The browser cache, basic cache and cookiejar are created on the first
    call and stored on ``options`` (``browser_cache``, ``basic_cache``,
    ``cookiejar``); later calls with the same ``options`` object reuse them.

    :param url: story URL; selects the config sections.
    :param passed_defaultsini: text of a defaults.ini, or a false value to
        read the defaults.ini files from disk.
    :param passed_personalini: text of a personal.ini, or a false value.
    :param options: parsed command line options.  Read here: ``format``,
        ``list``, ``normalize``, ``downloadlist``, ``configfile``, ``force``,
        ``update``, ``updatecover``, ``options``, ``progressbar`` and
        ``save_cache``.
    :param chaptercount: together with ``output_filename`` and
        ``options.update``, enables the ``output_filename`` override.
    :param output_filename: value for the ``output_filename`` override.
    :return: the populated Configuration.
    :raises exceptions.UnknownSite: for an unsupported site, unless one of
        ``options.list``, ``options.normalize`` or ``options.downloadlist``
        is set.
    """
    try:
        configuration = Configuration(adapters.getConfigSectionsFor(url),
                                      options.format)
    except exceptions.UnknownSite:
        if options.list or options.normalize or options.downloadlist:
            # list for page doesn't have to be a supported site.
            configuration = Configuration(['unknown'], options.format)
        else:
            raise

    conflist = []
    homepath = join(expanduser('~'), '.fanficdownloader')
    ## also look for .fanficfare now, give higher priority than old dir.
    homepath2 = join(expanduser('~'), '.fanficfare')
    xdgpath = os.environ.get('XDG_CONFIG_HOME', join(expanduser('~'),
                                                     '.config'))
    xdgpath = join(xdgpath, 'fanficfare')

    if passed_defaultsini:
        # new StringIO each time rather than pass StringIO and rewind
        # for case of list download.  Just makes more sense to me.
        configuration.readfp(StringIO(unicode(passed_defaultsini)))
    else:
        # don't need to check existence for ourselves.
        conflist.append(join(dirname(__file__), 'defaults.ini'))
        conflist.append(join(homepath, 'defaults.ini'))
        conflist.append(join(homepath2, 'defaults.ini'))
        conflist.append(join(xdgpath, 'defaults.ini'))
        conflist.append('defaults.ini')

    if passed_personalini:
        # new StringIO each time rather than pass StringIO and rewind
        # for case of list download.  Just makes more sense to me.
        configuration.readfp(StringIO(unicode(passed_personalini)))

    conflist.append(join(homepath, 'personal.ini'))
    conflist.append(join(homepath2, 'personal.ini'))
    conflist.append(join(xdgpath, 'personal.ini'))
    conflist.append('personal.ini')

    if options.configfile:
        conflist.extend(options.configfile)

    configuration.read(conflist)

    try:
        configuration.add_section('overrides')
    except configparser.DuplicateSectionError:
        # generally already exists in defaults.ini
        pass

    if options.force:
        configuration.set('overrides', 'always_overwrite', 'true')

    if options.update and chaptercount and output_filename:
        configuration.set('overrides', 'output_filename', output_filename)

    if options.update and not options.updatecover:
        configuration.set('overrides', 'never_make_cover', 'true')

    # images only for epub and html, even if the user mistakenly turned
    # it on else where.
    if options.format not in ('epub', 'html'):
        configuration.set('overrides', 'include_images', 'false')

    if options.options:
        for opt in options.options:
            # Split on the first '=' only, so values that contain '='
            # themselves (e.g. URLs with query strings) are kept whole
            # instead of raising ValueError on unpacking.
            (var, val) = opt.split('=', 1)
            configuration.set('overrides', var, val)

    if options.progressbar:
        configuration.set('overrides', 'progressbar', 'true')

    ## do page cache and cookie load after reading INI files because
    ## settings (like use_basic_cache) matter.

    ## only need browser cache if one of the URLs needs it, and it
    ## isn't saved or dependent on options.save_cache.  This needs to
    ## be above basic_cache to avoid loading more than once anyway.
    if configuration.getConfig('use_browser_cache'):
        if not hasattr(options, 'browser_cache'):
            configuration.get_fetcher()  # force browser cache read.
            options.browser_cache = configuration.get_browser_cache()
        else:
            configuration.set_browser_cache(options.browser_cache)

    ## Share basic_cache between multiple downloads.
    if not hasattr(options, 'basic_cache'):
        options.basic_cache = configuration.get_basic_cache()
        if options.save_cache:
            try:
                options.basic_cache.load_cache(global_cache)
            except Exception as e:
                logger.warning(
                    "Didn't load --save-cache %s\nContinue without loading BasicCache"
                    % e)
            options.basic_cache.set_autosave(True, filename=global_cache)
    else:
        configuration.set_basic_cache(options.basic_cache)
    # logger.debug(options.basic_cache.basic_cache.keys())

    ## All CLI downloads are sequential and share one cookiejar,
    ## loaded the first time through here.
    if not hasattr(options, 'cookiejar'):
        options.cookiejar = configuration.get_cookiejar()
        if options.save_cache:
            try:
                options.cookiejar.load_cookiejar(global_cookies)
            except Exception as e:
                logger.warning(
                    "Didn't load --save-cache %s\nContinue without loading cookies"
                    % e)
            options.cookiejar.set_autosave(True, filename=global_cookies)
    else:
        configuration.set_cookiejar(options.cookiejar)

    return configuration
 def setup_method(self):
     """Store the story URL, a lightweight EPUB configuration for
     chireads.com, and a ``chiread`` instance built from them."""
     story_url = 'https://chireads.com/category/translatedtales/some-story/'
     self.url = story_url

     config = Configuration(["chireads.com"], "EPUB", lightweight=True)
     self.configuration = config
     self.chireads = chiread(config, story_url)
Пример #16
0
class Html2EpubConverter(object):
    """
    The HTML to EPUB converter.

    Reads C{story.html} and C{metadata.json} from a directory, fills a
    fanficfare story with them and writes it out with the epub writer.
    The instance passes itself to the writer as the adapter, which is why
    it also offers C{getStory()} and C{getStoryMetadataOnly()}.

    @param path: path to dir containing story and metadata
    @type path: L{str}
    @param include_images: if nonzero, include images in epub
    @type include_images: L{bool}

    @cvar IGNORE_METADATA_KEYS: tuple of metadata keys to ignore
    @type IGNORE_METADATA_KEYS: L{tuple} of L{str}
    @cvar CHAPTER_NAME_REGEX: regex matching the name attribute of chapter anchors
    @type CHAPTER_NAME_REGEX: compiled regex
    @cvar CHAPTER_NAME_LINK_REGEX: regex to use to identify hrefs of chapter links
    @type CHAPTER_NAME_LINK_REGEX: compiled regex
    """

    IGNORE_METADATA_KEYS = ("output_filename", "zchapters")
    CHAPTER_NAME_REGEX = re.compile("^section[0-9]+$")
    CHAPTER_NAME_LINK_REGEX = re.compile("#section[0-9]+")

    def __init__(self, path, include_images=True):
        self.path = path
        self.include_images = include_images
        # lazily created by get_soup() / get_story() / get_config()
        self.soup = None
        self.story = None
        self.config = None

    def get_soup(self):
        """
        Return the content soup, generating it if necessary.

        @return: the soup, possibly cached
        @rtype: L{bs4.BeautifulSoup}
        """
        if self.soup is None:
            sp = os.path.join(self.path, "story.html")
            with open(sp, "r") as fin:
                content = fin.read()
            self.soup = bs4.BeautifulSoup(content, "html.parser")
        return self.soup

    def get_story(self):
        """
        Return the fanficfare story, instancing it if necessary.

        @return: the story, possibly cached
        @rtype: L{fanficfare.story.Story}
        """
        if self.story is None:
            self.story = Story(self.get_config())
        return self.story

    def get_config(self):
        """
        Return the fanficfare configuration, creating it if necessary.

        @return: the configuration, possibly cached
        @rtype: L{fanficfare.configurable.Configuration}
        """
        if self.config is None:
            self.config = Configuration(["zimhtml"], "epub")
            try:
                self.config.add_section('overrides')
            except configparser.DuplicateSectionError:
                # generally already exists in defaults.ini
                pass
            if self.include_images:
                self.config.set("overrides", "include_images", "true")
        return self.config

    def parse_metadata(self):
        """
        Read C{metadata.json} and set the metadata of the story.

        Keys listed in C{IGNORE_METADATA_KEYS} are skipped.  Date values are
        converted to L{datetime.datetime}; a published/updated date in none
        of the known formats becomes C{datetime.datetime(1, 1, 1)}.
        """
        story = self.get_story()
        mp = os.path.join(self.path, "metadata.json")
        with open(mp, "r") as fin:
            content = json.load(fin)
        for key, value in content.items():
            # check if key is blacklisted
            if key in self.IGNORE_METADATA_KEYS:
                continue

            # parse values if necessary
            if key == "dateCreated":
                value = datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
            elif key in ("datePublished", "dateUpdated"):
                for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"):
                    try:
                        value = datetime.datetime.strptime(value, fmt)
                        break
                    except ValueError:
                        continue
                else:
                    # invalid format
                    value = datetime.datetime(1, 1, 1)

            story.setMetadata(key, value)

    def add_images(self):
        """
        Register every image of the story HTML with the story.

        Each C{<img>} tag gets its C{src} replaced by the value returned by
        C{story.addImgUrl()}; an image whose C{alt} is C{"cover"} is
        registered as the cover.
        """
        soup = self.get_soup()
        story = self.get_story()

        for imgtag in soup.find_all("img"):
            is_cover = imgtag.get("alt") == "cover"
            img_url = imgtag["src"]
            # fanficfare does not like 'ffdl-' present in the URL, replace it with a placeholder
            sub_img_url = img_url.replace("ffdl-", FFDL_PLACEHOLDER)
            newsrc, imgurl = story.addImgUrl(
                "file://{}/story.html".format(self.path),
                sub_img_url,
                self.fetch_image,
                cover=is_cover,
            )
            # rewrite image tag
            imgtag["src"] = newsrc
            if not imgtag.has_attr("longdesc"):
                imgtag["longdesc"] = imgurl

    def fetch_image(self, url, **kwargs):
        """
        This method is called by the story to fetch the images.

        Extra keyword arguments are accepted and ignored.

        @param url: C{file://} url to fetch
        @type url: L{str}
        @return: the content of the file
        @rtype: L{bytes}
        """
        scheme = "file://"
        assert url.startswith(scheme)
        # strip only the leading scheme; str.replace() would also remove any
        # later occurrence inside the path.
        url = url[len(scheme):]
        # fanficfare does not like 'ffdl-' present in the URL, we replaced it with a placeholder earlier
        url = url.replace(FFDL_PLACEHOLDER, "ffdl-")
        with open(url, "rb") as fin:
            return fin.read()

    def parse_chapter_contents(self):
        """
        Parse the chapter contents and add the chapters to the story.

        The first C{<a>} whose name matches C{CHAPTER_NAME_REGEX} is the
        title of the first chapter.  Among its following siblings every
        C{<a>} is a further chapter title and every C{<div>} a chapter
        body; other siblings are ignored.  Titles and bodies are paired in
        document order.

        @raise ValueError: if the HTML contains no chapter anchor
        """
        soup = self.get_soup()
        story = self.get_story()
        first_title_tag = soup.find("a", {"name": self.CHAPTER_NAME_REGEX})
        if first_title_tag is None:
            # would otherwise fail below with an opaque AttributeError
            raise ValueError(
                "No chapter anchor found in the story HTML of '{}'".format(
                    self.path))

        all_chapter_names = [first_title_tag.text]
        all_storytexts = []
        for tag in first_title_tag.next_siblings:
            if tag.name == "a":
                all_chapter_names.append(tag.text)
            elif tag.name == "div":
                all_storytexts.append(str(tag))

        for title, html in zip(all_chapter_names, all_storytexts):
            story.addChapter({
                "title": title,
                "html": html,
            })

    def parse(self):
        """
        Parse the whole input story.
        """
        self.parse_metadata()
        self.add_images()  # <-- must be before parse_chapter_contents()
        self.parse_chapter_contents()

    def write(self, path=None):
        """
        Write the fanfic to an epub.

        @param path: path to write to, defaults to C{story.epub} inside the
            input directory
        @type path: L{str}
        """
        if path is None:
            path = os.path.join(self.path, "story.epub")
        config = self.get_config()
        logging.getLogger("fanficfare").setLevel(logging.WARNING)
        # this object stands in for the adapter the writer expects
        writer = writers.getWriter("epub", config, self)
        writer.writeStory(
            outfilename=path,
            metaonly=False,
            forceOverwrite=True,
        )

    def getStory(self):
        """
        Return the story. Part of the fake adapter api.

        @return: the story
        @rtype: L{fanficfare.story.Story}
        """
        return self.get_story()

    def getStoryMetadataOnly(self):
        """
        Return the story. Part of the fake adapter api.

        @return: the story
        @rtype: L{fanficfare.story.Story}
        """
        # We implement no difference from getStory()
        return self.getStory()