def getUserConfig(self, user, url, fileformat):
    configuration = Configuration(adapters.getConfigSectionsFor(url), fileformat)
    logging.debug('reading defaults.ini config file')
    configuration.read('fanficfare/defaults.ini')

    ## Pull user's config record.
    l = UserConfig.all().filter('user =', user).fetch(1)
    if l:
        uconfig = l[0]
        logging.debug('reading config from UserConfig(%s)' % uconfig.config)
        configuration.readfp(StringIO(uconfig.config))

    return configuration
def setup_method(self):
    self.url = 'https://chireads.com/category/translatedtales/some-story/'
    self.chapter_url = 'https://chireads.com/translatedtales/chapitre-1-some-title/2020/02/08/'
    self.configuration = Configuration(["chireads.com"], "EPUB", lightweight=True)
    self.chireads = chiread(self.configuration, self.url)
def setup_method(self, adapter, url, sections, path_adapter, adapter_fixture):
    self.url = url
    self.configuration = Configuration(sections, "EPUB", lightweight=True)
    self.adapter = adapter(self.configuration, self.url)
    self.path_adapter = path_adapter
    self.fixture = adapter_fixture
def get_urls_from_text(data, configuration=None, normalize=False):
    urls = collections.OrderedDict()

    try:
        data = unicode(data)
    except UnicodeDecodeError:
        data = data.decode('utf8')  ## for when called outside calibre.

    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)

    for href in re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', data):
        # this (should) catch normal story links, some javascript
        # 'are you old enough' links, and 'Report This' links.
        if 'story.php' in href:
            m = re.search(r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)", href)
            if m is not None:
                href = form_url(href, m.group('sid'))
        try:
            href = href.replace('&index=1', '')
            adapter = adapters.getAdapter(configuration, href)
            if adapter.story.getMetadata('storyUrl') not in urls:
                urls[adapter.story.getMetadata('storyUrl')] = [href]
            else:
                urls[adapter.story.getMetadata('storyUrl')].append(href)
        except:
            pass

    # Simply return the longest URL with the assumption that it contains the
    # most user readable metadata, if not normalized
    return urls.keys() if normalize else [max(value, key=len) for key, value in urls.items()]
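# Usage sketch for get_urls_from_text() above. The input text and the URL
# in it are hypothetical placeholders; with normalize=False the function
# returns the longest href seen for each distinct story.
def _demo_get_urls_from_text():
    text = "Recs: https://www.fanfiction.net/s/1234/1/ and some other links"
    for u in get_urls_from_text(text, normalize=False):
        print(u)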
def get_config(self):
    """
    Return the fanficfare configuration, creating it if necessary.

    @return: the configuration, possibly cached
    @rtype: L{fanficfare.configurable.Configuration}
    """
    if self.config is None:
        self.config = Configuration(["zimhtml"], "epub")
        try:
            self.config.add_section('overrides')
        except configparser.DuplicateSectionError:
            # generally already exists in defaults.ini
            pass
        if self.include_images:
            self.config.set("overrides", "include_images", "true")
    return self.config
def fetch_metadata(url: str, chapters=True) -> dict:
    configuration = Configuration(adapters.getConfigSectionsFor(url), 'epub')
    adapter = adapters.getAdapter(configuration, url)
    adapter.is_adult = True
    metadata = adapter.getStoryMetadataOnly().getAllMetadata()
    if chapters:
        metadata['zchapters'] = []
        for i, chap in enumerate(adapter.get_chapters()):
            metadata['zchapters'].append((i + 1, chap))
    return metadata
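# Usage sketch for fetch_metadata() above; the story URL is a hypothetical
# placeholder. With chapters=True the returned dict also carries
# (index, chapter) tuples under the 'zchapters' key.
def _demo_fetch_metadata():
    meta = fetch_metadata('https://example.com/story/12345', chapters=False)
    print(meta.get('title'), meta.get('author'))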
def __init__(self, url):
    if isinstance(url, Target):
        url = url.url
    self.url = url
    configuration = Configuration(["test1.com"], "HTML", lightweight=True)
    try:
        adapter = adapters.getAdapter(configuration, url)
    except UnknownSite:
        raise NotAValidTarget(url)
    self.abbrev = adapter.story.getMetadata("siteabbrev")
    if self.abbrev is None:
        self.abbrev = "unknown"
    self.id = adapter.story.getMetadata("storyId")
    if self.id is None:
        self.id = self._id_from_url(url)
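# Usage sketch for the constructor above, assuming it belongs to the Target
# class it references. The URL is a hypothetical placeholder; a site no
# adapter recognizes raises NotAValidTarget.
def _demo_target():
    try:
        target = Target('https://example.com/story/12345')
        print(target.abbrev, target.id)
    except NotAValidTarget:
        print('not a supported story URL')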
def from_url(url: HttpUrl):
    fff_works = False
    try:
        configuration = Configuration(adapters.getConfigSectionsFor(str(url)), 'epub')
        fff_works = True
    except exceptions.UnknownSite:
        pass
    generator = mapping.get(url.host)
    if generator:
        return generator
    elif fff_works:
        return fanficfare_generator
    raise Exception("no generator available for %s" % url)
def get_urls_from_page(url, configuration=None, normalize=False):
    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)

    data = None
    adapter = None
    try:
        adapter = adapters.getAdapter(configuration, url, anyurl=True)

        # special stuff to log into archiveofourown.org, if possible.
        # Unlike most that show the links to 'adult' stories, but protect
        # them, AO3 doesn't even show them if not logged in.  Only works
        # with saved user/pass--not going to prompt for list.
        if 'archiveofourown.org' in url:
            if adapter.getConfig("username"):
                if adapter.getConfig("is_adult"):
                    if '?' in url:
                        addurl = "&view_adult=true"
                    else:
                        addurl = "?view_adult=true"
                else:
                    addurl = ""
                # just to get an authenticity_token.
                data = adapter._fetchUrl(url + addurl)
                # login the session.
                adapter.performLogin(url, data)
                # get the list page with logged in session.

        if 'fimfiction.net' in url and adapter.getConfig("is_adult"):
            data = adapter._fetchUrl(url)
            adapter.set_adult_cookie()

        # this way it uses User-Agent or other special settings.  Only AO3
        # is doing login.
        data = adapter._fetchUrl(url, usecache=False)
    except UnknownSite:
        # no adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(), GZipProcessor())
        data = opener.open(url).read()

    # kludge because I don't see it on enough sites to be worth generalizing yet.
    restrictsearch = None
    if 'scarvesandcoffee.net' in url:
        restrictsearch = ('div', {'id': 'mainpage'})

    return get_urls_from_html(data, url, configuration, normalize, restrictsearch)
def getNormalStoryURLSite(url):
    # print("getNormalStoryURLSite:%s"%url)
    if not adapters.getNormalStoryURL.__dummyconfig:
        adapters.getNormalStoryURL.__dummyconfig = Configuration(["test1.com"], "EPUB", lightweight=True)
    # pulling up an adapter is pretty low over-head.  If
    # it fails, it's a bad url.
    try:
        adapter = adapters.getAdapter(adapters.getNormalStoryURL.__dummyconfig, url)
        url = adapter.url
        site = adapter.getSiteDomain()
        storyid = adapter.story.getMetadata('storyId')
        del adapter
        return (url, site, storyid)
    except:
        return None
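# Usage sketch for getNormalStoryURLSite() above; the URL is a hypothetical
# placeholder. A bad or unsupported URL yields None rather than an exception.
def _demo_normal_story_url_site():
    result = getNormalStoryURLSite('https://example.com/story/12345')
    if result:
        url, site, storyid = result
        print(site, storyid)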
def get_fff_config(url, fileform="epub", personalini=None):
    if not personalini:
        personalini = get_fff_personalini()
    sections = ['unknown']
    try:
        sections = adapters.getConfigSectionsFor(url)
    except Exception as e:
        logger.debug("Failed trying to get ini config for url(%s): %s, using section %s instead" % (url, e, sections))
    configuration = Configuration(sections, fileform)
    configuration.readfp(StringIO(ensure_text(get_resources("plugin-defaults.ini"))))
    configuration.readfp(StringIO(ensure_text(personalini)))
    return configuration
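# Usage sketch for get_fff_config() above. It assumes the calibre plugin
# environment where get_resources() and get_fff_personalini() are available;
# the URL is a hypothetical placeholder.
def _demo_get_fff_config():
    configuration = get_fff_config('https://example.com/story/12345', fileform='mobi')
    print(configuration.getConfig('include_images'))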
def get_urls_from_html(data, url=None, configuration=None, normalize=False, restrictsearch=None):
    urls = collections.OrderedDict()

    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)

    soup = BeautifulSoup(data, "html5lib")
    if restrictsearch:
        soup = soup.find(*restrictsearch)
        # logger.debug("restrict search:%s"%soup)

    for a in soup.findAll('a'):
        if a.has_attr('href'):
            # logger.debug("a['href']:%s"%a['href'])
            href = form_url(url, a['href'])
            # logger.debug("1 urlhref:%s"%href)
            # this (should) catch normal story links, some javascript
            # 'are you old enough' links, and 'Report This' links.
            if 'story.php' in a['href']:
                # logger.debug("trying:%s"%a['href'])
                m = re.search(r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)", a['href'])
                if m is not None:
                    href = form_url(a['href'] if '//' in a['href'] else url, m.group('sid'))
            try:
                href = href.replace('&index=1', '')
                # logger.debug("2 urlhref:%s"%href)
                adapter = adapters.getAdapter(configuration, href)
                # logger.debug("found adapter")
                if adapter.story.getMetadata('storyUrl') not in urls:
                    urls[adapter.story.getMetadata('storyUrl')] = [href]
                else:
                    urls[adapter.story.getMetadata('storyUrl')].append(href)
            except Exception as e:
                # logger.debug(e)
                pass

    # Simply return the longest URL with the assumption that it contains the
    # most user readable metadata, if not normalized
    return urls.keys() if normalize else [max(value, key=len) for key, value in urls.items()]
def get_configuration(url,
                      passed_defaultsini,
                      passed_personalini,
                      options,
                      chaptercount=None,
                      output_filename=None):
    try:
        configuration = Configuration(adapters.getConfigSectionsFor(url), options.format)
    except exceptions.UnknownSite as e:
        if options.list or options.normalize or options.downloadlist:
            # list for page doesn't have to be a supported site.
            configuration = Configuration(['unknown'], options.format)
        else:
            raise

    conflist = []
    homepath = join(expanduser('~'), '.fanficdownloader')
    ## also look for .fanficfare now, give higher priority than old dir.
    homepath2 = join(expanduser('~'), '.fanficfare')
    xdgpath = os.environ.get('XDG_CONFIG_HOME', join(expanduser('~'), '.config'))
    xdgpath = join(xdgpath, 'fanficfare')

    if passed_defaultsini:
        # new StringIO each time rather than pass StringIO and rewind
        # for case of list download.  Just makes more sense to me.
        configuration.readfp(StringIO(unicode(passed_defaultsini)))
    else:
        # don't need to check existance for our selves.
        conflist.append(join(dirname(__file__), 'defaults.ini'))
        conflist.append(join(homepath, 'defaults.ini'))
        conflist.append(join(homepath2, 'defaults.ini'))
        conflist.append(join(xdgpath, 'defaults.ini'))
        conflist.append('defaults.ini')

    if passed_personalini:
        # new StringIO each time rather than pass StringIO and rewind
        # for case of list download.  Just makes more sense to me.
        configuration.readfp(StringIO(unicode(passed_personalini)))

    conflist.append(join(homepath, 'personal.ini'))
    conflist.append(join(homepath2, 'personal.ini'))
    conflist.append(join(xdgpath, 'personal.ini'))
    conflist.append('personal.ini')

    if options.configfile:
        conflist.extend(options.configfile)

    configuration.read(conflist)

    try:
        configuration.add_section('overrides')
    except configparser.DuplicateSectionError:
        # generally already exists in defaults.ini
        pass

    if options.force:
        configuration.set('overrides', 'always_overwrite', 'true')

    if options.update and chaptercount and output_filename:
        configuration.set('overrides', 'output_filename', output_filename)

    if options.update and not options.updatecover:
        configuration.set('overrides', 'never_make_cover', 'true')

    # images only for epub, even if the user mistakenly turned it
    # on else where.
    if options.format not in ('epub', 'html'):
        configuration.set('overrides', 'include_images', 'false')

    if options.options:
        for opt in options.options:
            (var, val) = opt.split('=')
            configuration.set('overrides', var, val)

    if options.progressbar:
        configuration.set('overrides', 'progressbar', 'true')

    ## do page cache and cookie load after reading INI files because
    ## settings (like use_basic_cache) matter.

    ## only need browser cache if one of the URLs needs it, and it
    ## isn't saved or dependent on options.save_cache.  This needs to
    ## be above basic_cache to avoid loading more than once anyway.
    if configuration.getConfig('use_browser_cache'):
        if not hasattr(options, 'browser_cache'):
            configuration.get_fetcher()  # force browser cache read.
            options.browser_cache = configuration.get_browser_cache()
        else:
            configuration.set_browser_cache(options.browser_cache)

    ## Share basic_cache between multiple downloads.
    if not hasattr(options, 'basic_cache'):
        options.basic_cache = configuration.get_basic_cache()
        if options.save_cache:
            try:
                options.basic_cache.load_cache(global_cache)
            except Exception as e:
                logger.warning("Didn't load --save-cache %s\nContinue without loading BasicCache" % e)
            options.basic_cache.set_autosave(True, filename=global_cache)
    else:
        configuration.set_basic_cache(options.basic_cache)
    # logger.debug(options.basic_cache.basic_cache.keys())

    ## All CLI downloads are sequential and share one cookiejar,
    ## loaded the first time through here.
    if not hasattr(options, 'cookiejar'):
        options.cookiejar = configuration.get_cookiejar()
        if options.save_cache:
            try:
                options.cookiejar.load_cookiejar(global_cookies)
            except Exception as e:
                logger.warning("Didn't load --save-cache %s\nContinue without loading cookies" % e)
            options.cookiejar.set_autosave(True, filename=global_cookies)
    else:
        configuration.set_cookiejar(options.cookiejar)

    return configuration
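# Sketch of a call site for get_configuration() above, assuming an
# optparse/argparse 'options' object exposing the attributes the function
# reads (format, list, normalize, downloadlist, configfile, force, update,
# updatecover, options, progressbar, save_cache, ...). The URL is a
# hypothetical placeholder; None for both ini arguments makes it read the
# defaults.ini/personal.ini files from the usual search paths.
def _demo_get_configuration(options):
    url = 'https://example.com/story/12345'
    configuration = get_configuration(url, None, None, options)
    return configuration.getConfig('output_filename')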
def setup_method(self):
    self.url = 'https://chireads.com/category/translatedtales/some-story/'
    self.configuration = Configuration(["chireads.com"], "EPUB", lightweight=True)
    self.chireads = chiread(self.configuration, self.url)
class Html2EpubConverter(object):
    """
    The HTML to EPUB converter.

    @param path: path to dir containing story and metadata
    @type path: L{str}
    @param include_images: if nonzero, include images in epub
    @type include_images: L{bool}

    @cvar IGNORE_METADATA_KEYS: tuple of metadata keys to ignore
    @type IGNORE_METADATA_KEYS: L{tuple} of L{str}
    @cvar CHAPTER_NAME_LINK_REGEX: regex to use to identify hrefs of chapter links
    @type CHAPTER_NAME_LINK_REGEX: compiled regex
    """
    IGNORE_METADATA_KEYS = ("output_filename", "zchapters")
    CHAPTER_NAME_REGEX = re.compile("^section[0-9]+$")
    CHAPTER_NAME_LINK_REGEX = re.compile("#section[0-9]+")

    def __init__(self, path, include_images=True):
        self.path = path
        self.include_images = include_images
        self.soup = None
        self.story = None
        self.config = None

    def get_soup(self):
        """
        Return the content soup, generating it if necessary.

        @return: the soup, possibly cached
        @rtype: L{bs4.BeautifulSoup}
        """
        if self.soup is None:
            sp = os.path.join(self.path, "story.html")
            with open(sp, "r") as fin:
                content = fin.read()
            self.soup = bs4.BeautifulSoup(content, "html.parser")
        return self.soup

    def get_story(self):
        """
        Return the fanficfare story, instancing it if necessary.

        @return: the story, possibly cached
        @rtype: L{fanficfare.story.Story}
        """
        if self.story is None:
            conf = self.get_config()
            self.story = Story(conf)
        return self.story

    def get_config(self):
        """
        Return the fanficfare configuration, creating it if necessary.

        @return: the configuration, possibly cached
        @rtype: L{fanficfare.configurable.Configuration}
        """
        if self.config is None:
            self.config = Configuration(["zimhtml"], "epub")
            try:
                self.config.add_section('overrides')
            except configparser.DuplicateSectionError:
                # generally already exists in defaults.ini
                pass
            if self.include_images:
                self.config.set("overrides", "include_images", "true")
        return self.config

    def parse_metadata(self):
        """
        Read and set the metadata of the story.
        """
        story = self.get_story()
        mp = os.path.join(self.path, "metadata.json")
        with open(mp, "r") as fin:
            content = json.load(fin)
        for key in content.keys():
            # check if key is blacklisted
            if key in self.IGNORE_METADATA_KEYS:
                continue
            value = content[key]
            # parse values if necessary
            if key == "dateCreated":
                value = datetime.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
            elif key in ("datePublished", "dateUpdated"):
                for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"):
                    try:
                        value = datetime.datetime.strptime(value, fmt)
                        break
                    except ValueError:
                        continue
                else:
                    # invalid format
                    value = datetime.datetime(1, 1, 1)
            story.setMetadata(key, value)

    def add_images(self):
        """
        Add the images to the story.
        """
        soup = self.get_soup()
        story = self.get_story()
        all_images = soup.find_all("img")
        for imgtag in all_images:
            is_cover = (imgtag.get("alt") == "cover")
            img_url = imgtag["src"]
            # fanficfare does not like 'ffdl-' present in the URL, replace it with a placeholder
            sub_img_url = img_url.replace("ffdl-", FFDL_PLACEHOLDER)
            newsrc, imgurl = story.addImgUrl(
                "file://{}/story.html".format(self.path),
                sub_img_url,
                self.fetch_image,
                cover=is_cover,
            )
            # rewrite image tag
            imgtag["src"] = newsrc
            if not imgtag.has_attr("longdesc"):
                imgtag["longdesc"] = imgurl

    def fetch_image(self, url, **kwargs):
        """
        This method is called by the story to fetch the images.

        @param url: url to fetch
        @type url: L{str}
        """
        assert url.startswith("file://")
        url = url.replace("file://", "")
        # fanficfare does not like 'ffdl-' present in the URL, we replaced it with a placeholder earlier
        url = url.replace(FFDL_PLACEHOLDER, "ffdl-")
        with open(url, "rb") as fin:
            data = fin.read()
        return data

    def parse_chapter_contents(self):
        """
        Parse the chapter contents.
        """
        soup = self.get_soup()
        story = self.get_story()
        # chapter_title_matches = soup.find_all("a", href=self.CHAPTER_NAME_LINK_REGEX)
        # all_chapter_names = [e.text for e in chapter_title_matches]
        # all_storytexts = soup.find_all(id="storytextp")
        first_title_tag = soup.find("a", {"name": self.CHAPTER_NAME_REGEX})
        all_chapter_names = [first_title_tag.text]
        all_storytexts = []
        for tag in first_title_tag.next_siblings:
            if tag.name == "a":
                all_chapter_names.append(tag.text)
            elif tag.name == "div":
                all_storytexts.append(str(tag))
            # else:
            #     raise Exception("Unknown HTML tag on content-root-level of story: '{}'!".format(tag))
        for title, html in zip(all_chapter_names, all_storytexts):
            story.addChapter({
                "title": title,
                "html": html,
            })

    def parse(self):
        """
        Parse the whole input story.
        """
        self.parse_metadata()
        self.add_images()  # <-- must be before parse_chapter_contents()
        self.parse_chapter_contents()

    def write(self, path=None):
        """
        Write the fanfic to an epub.

        @param path: path to write to
        @type path: L{str}
        """
        if path is None:
            path = os.path.join(self.path, "story.epub")
        config = self.get_config()
        logging.getLogger("fanficfare").setLevel(logging.WARNING)
        writer = writers.getWriter("epub", config, self)
        writer.writeStory(
            outfilename=path,
            metaonly=False,
            forceOverwrite=True,
        )

    def getStory(self):
        """
        Return the story. Part of the fake adapter api.

        @return: the story
        @rtype: L{fanficfare.story.Story}
        """
        return self.get_story()

    def getStoryMetadataOnly(self):
        """
        Return the story. Part of the fake adapter api.

        @return: the story
        @rtype: L{fanficfare.story.Story}
        """
        # We implement no difference from getStory()
        return self.getStory()
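# Typical usage sketch for Html2EpubConverter above. 'story_dir' is a
# hypothetical directory containing the story.html and metadata.json files
# the class expects; parse() must run before write().
def _demo_html2epub(story_dir):
    conv = Html2EpubConverter(story_dir, include_images=True)
    conv.parse()
    conv.write()  # writes story.epub next to the inputs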