def get_urls_from_text(data, configuration=None, normalize=False):
    """Scan free text for story URLs and return them grouped by story.

    data          -- text (str/unicode) to scan for http(s) URLs.
    configuration -- optional Configuration; a lightweight dummy one is
                     built when called outside calibre.
    normalize     -- when True, return the normalized storyUrl keys;
                     otherwise return the longest raw URL found for each
                     story (assumed to carry the most readable metadata).

    URLs that no adapter accepts are silently skipped.
    """
    urls = collections.OrderedDict()
    # Python 2: coerce to unicode; fall back to an explicit utf8 decode.
    try:
        data = unicode(data)
    except UnicodeDecodeError:
        data = data.decode('utf8')
    ## for when called outside calibre.
    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)
    for href in re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', data):
        # this (should) catch normal story links, some javascript
        # 'are you old enough' links, and 'Report This' links.
        if 'story.php' in href:
            m = re.search(r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)", href)
            if m != None:
                href = form_url(href, m.group('sid'))
        try:
            href = href.replace('&index=1', '')
            adapter = adapters.getAdapter(configuration, href)
            if adapter.story.getMetadata('storyUrl') not in urls:
                urls[adapter.story.getMetadata('storyUrl')] = [href]
            else:
                urls[adapter.story.getMetadata('storyUrl')].append(href)
        except Exception:
            # was a bare "except:"; narrowed so KeyboardInterrupt/SystemExit
            # are no longer swallowed.  Unrecognized URLs are still skipped.
            pass
    # Simply return the longest URL with the assumption that it contains the
    # most user readable metadata, if not normalized
    return urls.keys() if normalize else [max(value, key=len) for key, value in urls.items()]
def fetch_metadata(url: str, chapters=True) -> dict:
    """Fetch and return the story metadata dict for *url*.

    Sets ``is_adult`` unconditionally so adult-gated stories resolve.
    When *chapters* is true, a ``'zchapters'`` key is added holding a
    list of (1-based index, chapter) tuples from ``adapter.get_chapters()``.

    Note: the return annotation previously said ``bytes``, but the
    function has always returned the metadata dict.
    """
    configuration = Configuration(adapters.getConfigSectionsFor(url), 'epub')
    adapter = adapters.getAdapter(configuration, url)
    adapter.is_adult = True
    metadata = adapter.getStoryMetadataOnly().getAllMetadata()
    if chapters:
        # enumerate(..., 1) replaces the manual i+1 append loop.
        metadata['zchapters'] = [(i, chap) for i, chap in enumerate(adapter.get_chapters(), 1)]
    return metadata
def __init__(self, url):
    """Build a target from *url* (or copy another Target's url).

    Resolves a site adapter to pull the site abbreviation and story id;
    raises NotAValidTarget when no adapter recognizes the URL.
    """
    # Another Target may be passed in place of a plain URL string.
    url = url.url if isinstance(url, Target) else url
    self.url = url
    cfg = Configuration(["test1.com"], "HTML", lightweight=True)
    try:
        story = adapters.getAdapter(cfg, url).story
    except UnknownSite:
        raise NotAValidTarget(url)
    abbrev = story.getMetadata("siteabbrev")
    self.abbrev = "unknown" if abbrev is None else abbrev
    story_id = story.getMetadata("storyId")
    self.id = self._id_from_url(url) if story_id is None else story_id
def get_urls_from_page(url, configuration=None, normalize=False):
    """Fetch *url* (a listing/index page) and return the story URLs on it.

    Tries a site adapter first (so logins / adult flags / User-Agent are
    honored); falls back to a plain urllib2 fetch for unknown sites.
    Delegates the actual link extraction to get_urls_from_html().
    """
    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)
    data = None
    adapter = None
    try:
        adapter = adapters.getAdapter(configuration, url, anyurl=True)
        # special stuff to log into archiveofourown.org, if possible.
        # Unlike most that show the links to 'adult' stories, but protect
        # them, AO3 doesn't even show them if not logged in.  Only works
        # with saved user/pass--not going to prompt for list.
        if 'archiveofourown.org' in url:
            if adapter.getConfig("username"):
                if adapter.getConfig("is_adult"):
                    if '?' in url:
                        addurl = "&view_adult=true"
                    else:
                        addurl = "?view_adult=true"
                else:
                    addurl = ""
                # just to get an authenticity_token.
                data = adapter._fetchUrl(url + addurl)
                # login the session.
                adapter.performLogin(url, data)
                # get the list page with logged in session.
        if 'fimfiction.net' in url and adapter.getConfig("is_adult"):
            data = adapter._fetchUrl(url)
            adapter.set_adult_cookie()
        # this way it uses User-Agent or other special settings.  Only AO3
        # is doing login.
        data = adapter._fetchUrl(url, usecache=False)
    except UnknownSite:
        # no adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(), GZipProcessor())
        data = opener.open(url).read()
    # kludge because I don't see it on enough sites to be worth generalizing yet.
    restrictsearch = None
    if 'scarvesandcoffee.net' in url:
        restrictsearch = ('div', {'id': 'mainpage'})
    return get_urls_from_html(data, url, configuration, normalize, restrictsearch)
def getNormalStoryURLSite(url):
    """Return (normalized url, site domain, story id) for *url*,
    or None when no adapter accepts it.

    Caches a lightweight dummy Configuration on the
    adapters.getNormalStoryURL function object so repeated calls
    don't rebuild it.
    """
    # print("getNormalStoryURLSite:%s"%url)
    # BUG FIX: the guard read "gerNormalStoryURL" (typo), which raised
    # AttributeError on every call before the cache was populated.
    if not adapters.getNormalStoryURL.__dummyconfig:
        adapters.getNormalStoryURL.__dummyconfig = Configuration(
            ["test1.com"], "EPUB", lightweight=True)
    # pulling up an adapter is pretty low over-head.  If
    # it fails, it's a bad url.
    try:
        adapter = adapters.getAdapter(
            adapters.getNormalStoryURL.__dummyconfig, url)
        url = adapter.url
        site = adapter.getSiteDomain()
        storyid = adapter.story.getMetadata('storyId')
        del adapter
        return (url, site, storyid)
    except Exception:
        # was a bare "except:"; any adapter failure means "bad url".
        return None
def get_urls_from_html(data, url=None, configuration=None, normalize=False, restrictsearch=None):
    """Collect story URLs from the anchors in an HTML page.

    data           -- HTML to parse (handed to BeautifulSoup/html5lib).
    url            -- base URL used to absolutize relative hrefs.
    configuration  -- optional Configuration; a dummy one is built if absent.
    normalize      -- kept for interface parity with the callers.
    restrictsearch -- optional (name, attrs) pair; when given, only links
                      inside that element are considered.

    Links are grouped in *urls* keyed by the adapter-normalized storyUrl.
    (The visible portion of this function ends after the collection loop.)
    """
    urls = collections.OrderedDict()
    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)
    soup = BeautifulSoup(data, "html5lib")
    if restrictsearch:
        soup = soup.find(*restrictsearch)
        #logger.debug("restrict search:%s"%soup)
    for a in soup.findAll('a'):
        if a.has_attr('href'):
            href = form_url(url, a['href'])
            # this (should) catch normal story links, some javascript
            # 'are you old enough' links, and 'Report This' links.
            if 'story.php' in a['href']:
                m = re.search(r"(?P<sid>(view)?story\.php\?(sid|psid|no|story|stid)=\d+)", a['href'])
                if m != None:
                    href = form_url(a['href'] if '//' in a['href'] else url,
                                    m.group('sid'))
            try:
                href = href.replace('&index=1', '')
                adapter = adapters.getAdapter(configuration, href)
                if adapter.story.getMetadata('storyUrl') not in urls:
                    urls[adapter.story.getMetadata('storyUrl')] = [href]
                else:
                    urls[adapter.story.getMetadata('storyUrl')].append(href)
            except Exception:
                # was Py2-only "except Exception, e" with e unused; this
                # form is valid in both Python 2.6+ and 3.  Links no
                # adapter accepts are skipped on purpose.
                pass
# NOTE(review): fragment of a larger CLI download function -- its `def`
# and the rest of its try/except are outside the visible source.
if options.force:
    # -f/--force: always overwrite the existing output file.
    configuration.set('overrides', 'always_overwrite', 'true')
if options.options:
    # -o var=val pairs become [overrides] settings.
    for opt in options.options:
        (var, val) = opt.split('=')
        configuration.set('overrides', var, val)
if options.list or options.normalize:
    # List mode: just print the story URLs found on the page.
    retlist = get_urls_from_page(url, configuration, normalize=options.normalize)
    return '\n'.join(retlist)
try:
    adapter = adapters.getAdapter(configuration, url)
    #adapter.setChaptersRange(options.begin, options.end)

    # three tries, that's enough if both user/pass & is_adult needed,
    # or a couple tries of one or the other
    for x in range(0, 2):
        try:
            # NOTE(review): 'XXX in two-*' prints look like leftover
            # debug output -- candidates for removal.
            print('XXX in two-1')
            adapter.getStoryMetadataOnly()
            print('XXX in two-2')
        except exceptions.FailedToLogin, f:
            if f.passwdonly:
                print 'Story requires a password.'
            else:
                print 'Login Failed, Need Username/Password.'
                sys.stdout.write('Username: ')
                adapter.username = sys.stdin.readline().strip()
def post(self):
    """Task-queue worker (/fdowntask): fetch the story, render it with
    the requested writer, and store the output as DownloadData chunks.

    Reads the parameters queued by the /fdown handler: id, user (email
    address -- a users.User object can't cross the queue), format, url,
    login, password, is_adult, email.  Any failure is recorded on the
    Download record and the task exits normally.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    fileId = self.request.get('id')
    # User object can't pass, just email address
    user = users.User(self.request.get('user'))
    format = self.request.get('format')
    url = self.request.get('url')
    login = self.request.get('login')
    password = self.request.get('password')
    is_adult = self.request.get('is_adult')
    email = self.request.get('email')
    # NOTE(review): reconstructed -- this line arrived corrupted
    # ('"for user: "******"ID: "', a syntax error); restored the
    # obvious string concatenation.
    logging.info("Downloading: " + url + " for user: " + user.email() + " ID: " + fileId)
    adapter = None
    writerClass = None
    # use existing record if available.
    # fileId should have record from /fdown.
    download = getDownloadMeta(id=fileId, url=url, user=user, format=format, new=True)
    for chunk in download.data_chunks:
        chunk.delete()
    download.put()
    logging.info('Creating adapter...')
    try:
        configuration = self.getUserConfig(user, url, format)
        adapter = adapters.getAdapter(configuration, url)
        adapter.setChaptersRange(download.ch_begin, download.ch_end)
        logging.info('Created an adapter: %s' % adapter)
        if login or password:
            adapter.username = login
            adapter.password = password
        adapter.is_adult = is_adult
        # adapter.getStory() is what does all the heavy lifting.
        # adapter.getStoryMetadataOnly() only fetches enough to
        # get metadata.  writer.writeStory() will call
        # adapter.getStory(), too.
        writer = writers.getWriter(format, configuration, adapter)
        download.name = writer.getOutputFileName()
        #logging.debug('output_filename:'+writer.getConfig('output_filename'))
        logging.debug('getOutputFileName:' + writer.getOutputFileName())
        download.title = adapter.getStory().getMetadata('title')
        download.author = adapter.getStory().getMetadata('author')
        download.url = adapter.getStory().getMetadata('storyUrl')
        download.put()
        allmeta = adapter.getStory().getAllMetadata(removeallentities=True, doreplacements=False)
        outbuffer = StringIO()
        writer.writeStory(outbuffer)
        data = outbuffer.getvalue()
        outbuffer.close()
        # Release the big objects as soon as possible to limit peak memory.
        del outbuffer
        del writer
        del adapter
        # (A long commented-out email-attachment block was removed here.)
        # epubs are all already compressed.  Each chunk is compressed
        # individually to avoid having to hold the whole in memory just
        # for the compress/uncompress.
        if format != 'epub':
            def compress(chunk_bytes):
                return zlib.compress(chunk_bytes)
        else:
            def compress(chunk_bytes):
                return chunk_bytes
        # delete existing chunks first
        for chunk in download.data_chunks:
            chunk.delete()
        # Slice the rendered output into 1,000,000-byte chunks.
        index = 0
        while len(data) > 0:
            # logging.info("len(data): %s" % len(data))
            DownloadData(download=download,
                         index=index,
                         blob=compress(data[:1000000])).put()
            index += 1
            data = data[1000000:]
        download.completed = True
        download.put()
        # Update (or create) the cached per-story stats record.
        smetal = SavedMeta.all().filter('url =', allmeta['storyUrl']).fetch(1)
        if smetal and smetal[0]:
            smeta = smetal[0]
            smeta.count += 1
        else:
            smeta = SavedMeta()
            smeta.count = 1
        smeta.url = allmeta['storyUrl']
        smeta.title = allmeta['title']
        smeta.author = allmeta['author']
        smeta.meta = allmeta
        smeta.date = datetime.datetime.now()
        smeta.put()
        logging.info("Download finished OK")
        del data
    except Exception as e:
        # Record the failure so the UI can show it; don't retry the task.
        logging.exception(e)
        download.failure = unicode(e)
        download.put()
        return
    return
def post(self):
    """Handle the /fdown form POST: validate the request, pre-flight the
    story metadata (to catch bad URLs/logins early), then enqueue the
    real download as a /fdowntask task.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    user = users.get_current_user()
    if not user:
        # Not logged in; bounce through the login flow back to this URI.
        self.redirect(users.create_login_url(self.request.uri))
        return
    format = self.request.get('format')
    url = self.request.get('url')
    if not url or url.strip() == "":
        self.redirect('/')
        return
    # Allow chapter range with URL.
    # like test1.com?sid=5[4-6] or [4,6]
    url, ch_begin, ch_end = adapters.get_url_chapter_range(url)
    logging.info("Queuing Download: %s" % url)
    login = self.request.get('login')
    password = self.request.get('password')
    is_adult = self.request.get('is_adult') == "on"
    email = self.request.get('email')
    # use existing record if available.  Fetched/Created before
    # the adapter can normalize the URL in case we need to record
    # an exception.
    download = getDownloadMeta(url=url, user=user, format=format, new=True)
    adapter = None
    try:
        try:
            configuration = self.getUserConfig(user, url, format)
        except exceptions.UnknownSite:
            self.redirect("/?error=custom&errtext=%s" % urllib.quote("Unsupported site in URL (%s). See 'Support sites' list below." % url, ''))
            return
        except Exception as e:
            # limited due to Location header length limit.
            self.redirect("/?error=custom&errtext=%s" % urllib.quote("There's an error in your User Configuration: " + unicode(e), '')[:2048])
            return
        adapter = adapters.getAdapter(configuration, url)
        adapter.setChaptersRange(ch_begin, ch_end)
        # NOTE(review): "adaper" typo is in the log string itself.
        logging.info('Created an adaper: %s' % adapter)
        if login or password:
            adapter.username = login
            adapter.password = password
        adapter.is_adult = is_adult
        ## This scrapes the metadata, which will be
        ## duplicated in the queue task, but it
        ## detects bad URLs, bad login, bad story, etc
        ## without waiting for the queue.  So I think
        ## it's worth the double up.  Could maybe save
        ## it all in the download object someday.
        story = adapter.getStoryMetadataOnly()
        ## Fetch again using normalized story URL.  The one
        ## fetched/created above, if different, will not be saved.
        download = getDownloadMeta(url=story.getMetadata('storyUrl'),
                                   user=user, format=format, new=True)
        download.title = story.getMetadata('title')
        download.author = story.getMetadata('author')
        download.url = story.getMetadata('storyUrl')
        download.ch_begin = ch_begin
        download.ch_end = ch_end
        download.put()
        taskqueue.add(url='/fdowntask',
                      queue_name="download",
                      params={'id': unicode(download.key()),
                              'format': format,
                              'url': download.url,
                              'login': login,
                              'password': password,
                              'user': user.email(),
                              'email': email,
                              'is_adult': is_adult})
        logging.info("enqueued download key: " + unicode(download.key()))
    except (exceptions.FailedToLogin, exceptions.AdultCheckRequired), e:
        # Record the failure, then re-present the login/adult-check form.
        download.failure = unicode(e)
        download.put()
        logging.info(unicode(e))
        is_login = (isinstance(e, exceptions.FailedToLogin))
        is_passwdonly = is_login and e.passwdonly
        template_values = dict(nickname=user.nickname(), url=url, format=format,
                               site=adapter.getConfigSection(), fic=download,
                               is_login=is_login, is_passwdonly=is_passwdonly)
        # thewriterscoffeeshop.com can do adult check *and* user required.
        if isinstance(e, exceptions.AdultCheckRequired):
            template_values['login'] = login
            template_values['password'] = password
        path = os.path.join(os.path.dirname(__file__), 'login.html')
        self.response.out.write(template.render(path, template_values))
    return
def post(self):
    """Handle the download form POST: parse an optional [begin-end]
    chapter range off the URL, pre-flight metadata, then enqueue the
    real download as a /fdowntask task.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    user = users.get_current_user()
    if not user:
        self.redirect(users.create_login_url(self.request.uri))
        return
    format = self.request.get('format')
    url = self.request.get('url')
    if not url or url.strip() == "":
        self.redirect('/')
        return
    # Allow chapter range with URL.
    # test1.com?sid=5[4-6]
    mc = re.match(r"^(?P<url>.*?)(?:\[(?P<begin>\d+)?(?P<comma>[,-])?(?P<end>\d+)?\])?$", url)
    #print("url:(%s) begin:(%s) end:(%s)"%(mc.group('url'),mc.group('begin'),mc.group('end')))
    url = mc.group('url')
    ch_begin = mc.group('begin')
    ch_end = mc.group('end')
    # "[4]" (begin with no comma/dash) means just that one chapter.
    if ch_begin and not mc.group('comma'):
        ch_end = ch_begin
    logging.info("Queuing Download: %s" % url)
    login = self.request.get('login')
    password = self.request.get('password')
    is_adult = self.request.get('is_adult') == "on"
    # use existing record if available.  Fetched/Created before
    # the adapter can normalize the URL in case we need to record
    # an exception.
    download = getDownloadMeta(url=url, user=user, format=format, new=True)
    adapter = None
    try:
        try:
            configuration = self.getUserConfig(user, url, format)
        except exceptions.UnknownSite:
            self.redirect("/?error=custom&errtext=%s" % urllib.quote("Unsupported site in URL (%s). See 'Support sites' list below." % url, ''))
            return
        except Exception, e:
            # limited due to Location header length limit.
            self.redirect("/?error=custom&errtext=%s" % urllib.quote("There's an error in your User Configuration: " + unicode(e), '')[:2048])
            return
        adapter = adapters.getAdapter(configuration, url)
        adapter.setChaptersRange(ch_begin, ch_end)
        # NOTE(review): "adaper" typo is in the log string itself.
        logging.info('Created an adaper: %s' % adapter)
        if login or password:
            adapter.username = login
            adapter.password = password
        adapter.is_adult = is_adult
        ## This scrapes the metadata, which will be
        ## duplicated in the queue task, but it
        ## detects bad URLs, bad login, bad story, etc
        ## without waiting for the queue.  So I think
        ## it's worth the double up.  Could maybe save
        ## it all in the download object someday.
        story = adapter.getStoryMetadataOnly()
        ## Fetch again using normalized story URL.  The one
        ## fetched/created above, if different, will not be saved.
        download = getDownloadMeta(url=story.getMetadata('storyUrl'),
                                   user=user, format=format, new=True)
        download.title = story.getMetadata('title')
        download.author = story.getMetadata('author')
        download.url = story.getMetadata('storyUrl')
        download.ch_begin = ch_begin
        download.ch_end = ch_end
        download.put()
        taskqueue.add(url='/fdowntask',
                      queue_name="download",
                      params={'id': unicode(download.key()),
                              'format': format,
                              'url': download.url,
                              'login': login,
                              'password': password,
                              'user': user.email(),
                              'is_adult': is_adult})
        logging.info("enqueued download key: " + unicode(download.key()))
        # NOTE(review): this chunk ends inside the outer try; its matching
        # except handler is outside the visible source.
def post(self):
    """Handle the download form POST (no chapter-range support in this
    variant): pre-flight metadata, then enqueue a /fdowntask task.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    user = users.get_current_user()
    if not user:
        self.redirect(users.create_login_url(self.request.uri))
        return
    format = self.request.get('format')
    url = self.request.get('url')
    if not url or url.strip() == "":
        self.redirect('/')
        return
    logging.info("Queuing Download: %s" % url)
    login = self.request.get('login')
    password = self.request.get('password')
    is_adult = self.request.get('is_adult') == "on"
    # use existing record if available.  Fetched/Created before
    # the adapter can normalize the URL in case we need to record
    # an exception.
    download = getDownloadMeta(url=url, user=user, format=format, new=True)
    adapter = None
    try:
        try:
            configuration = self.getUserConfig(user, url, format)
        except Exception, e:
            self.redirect("/?error=custom&errtext=%s" % urlEscape("There's an error in your User Configuration: " + str(e)))
            return
        adapter = adapters.getAdapter(configuration, url)
        # NOTE(review): "adaper" typo is in the log string itself.
        logging.info('Created an adaper: %s' % adapter)
        if login or password:
            adapter.username = login
            adapter.password = password
        adapter.is_adult = is_adult
        ## This scrapes the metadata, which will be
        ## duplicated in the queue task, but it
        ## detects bad URLs, bad login, bad story, etc
        ## without waiting for the queue.  So I think
        ## it's worth the double up.  Could maybe save
        ## it all in the download object someday.
        story = adapter.getStoryMetadataOnly()
        ## Fetch again using normalized story URL.  The one
        ## fetched/created above, if different, will not be saved.
        download = getDownloadMeta(url=story.getMetadata('storyUrl'),
                                   user=user, format=format, new=True)
        download.title = story.getMetadata('title')
        download.author = story.getMetadata('author')
        download.url = story.getMetadata('storyUrl')
        download.put()
        taskqueue.add(url='/fdowntask',
                      queue_name="download",
                      params={'id': str(download.key()),
                              'format': format,
                              'url': download.url,
                              'login': login,
                              'password': password,
                              'user': user.email(),
                              'is_adult': is_adult})
        logging.info("enqueued download key: " + str(download.key()))
        # NOTE(review): this chunk ends inside the outer try; its matching
        # except handler is outside the visible source.
def get_fff_adapter(url, fileform="epub", personalini=None):
    """Build and return a FanFicFare adapter for *url*.

    The configuration comes from get_fff_config() for the given output
    format and optional personal.ini content.
    """
    config = get_fff_config(url, fileform, personalini)
    return adapters.getAdapter(config, url)
def post(self):
    """Handle the /fdown form POST (email-capable variant): validate the
    request, pre-flight the story metadata, then enqueue the real
    download as a /fdowntask task.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    user = users.get_current_user()
    if not user:
        # Not logged in; bounce through the login flow back to this URI.
        self.redirect(users.create_login_url(self.request.uri))
        return
    format = self.request.get('format')
    url = self.request.get('url')
    if not url or url.strip() == "":
        self.redirect('/')
        return
    # Allow chapter range with URL.
    # like test1.com?sid=5[4-6] or [4,6]
    url, ch_begin, ch_end = adapters.get_url_chapter_range(url)
    logging.info("Queuing Download: %s" % url)
    login = self.request.get('login')
    password = self.request.get('password')
    is_adult = self.request.get('is_adult') == "on"
    email = self.request.get('email')
    # use existing record if available.  Fetched/Created before
    # the adapter can normalize the URL in case we need to record
    # an exception.
    download = getDownloadMeta(url=url, user=user, format=format, new=True)
    adapter = None
    try:
        try:
            configuration = self.getUserConfig(user, url, format)
        except exceptions.UnknownSite:
            self.redirect("/?error=custom&errtext=%s" % urllib.quote(
                "Unsupported site in URL (%s). See 'Support sites' list below." % url, ''))
            return
        except Exception as e:
            # limited due to Location header length limit.
            self.redirect("/?error=custom&errtext=%s" % urllib.quote(
                "There's an error in your User Configuration: " + unicode(e), '')[:2048])
            return
        adapter = adapters.getAdapter(configuration, url)
        adapter.setChaptersRange(ch_begin, ch_end)
        # NOTE(review): "adaper" typo is in the log string itself.
        logging.info('Created an adaper: %s' % adapter)
        if login or password:
            adapter.username = login
            adapter.password = password
        adapter.is_adult = is_adult
        ## This scrapes the metadata, which will be
        ## duplicated in the queue task, but it
        ## detects bad URLs, bad login, bad story, etc
        ## without waiting for the queue.  So I think
        ## it's worth the double up.  Could maybe save
        ## it all in the download object someday.
        story = adapter.getStoryMetadataOnly()
        ## Fetch again using normalized story URL.  The one
        ## fetched/created above, if different, will not be saved.
        download = getDownloadMeta(url=story.getMetadata('storyUrl'),
                                   user=user, format=format, new=True)
        download.title = story.getMetadata('title')
        download.author = story.getMetadata('author')
        download.url = story.getMetadata('storyUrl')
        download.ch_begin = ch_begin
        download.ch_end = ch_end
        download.put()
        taskqueue.add(url='/fdowntask',
                      queue_name="download",
                      params={
                          'id': unicode(download.key()),
                          'format': format,
                          'url': download.url,
                          'login': login,
                          'password': password,
                          'user': user.email(),
                          'email': email,
                          'is_adult': is_adult
                      })
        logging.info("enqueued download key: " + unicode(download.key()))
    except (exceptions.FailedToLogin, exceptions.AdultCheckRequired), e:
        # Record the failure, then re-present the login/adult-check form.
        download.failure = unicode(e)
        download.put()
        logging.info(unicode(e))
        is_login = (isinstance(e, exceptions.FailedToLogin))
        is_passwdonly = is_login and e.passwdonly
        template_values = dict(nickname=user.nickname(),
                               url=url,
                               format=format,
                               site=adapter.getConfigSection(),
                               fic=download,
                               is_login=is_login,
                               is_passwdonly=is_passwdonly)
        # thewriterscoffeeshop.com can do adult check *and* user required.
        if isinstance(e, exceptions.AdultCheckRequired):
            template_values['login'] = login
            template_values['password'] = password
        path = os.path.join(os.path.dirname(__file__), 'login.html')
        self.response.out.write(template.render(path, template_values))
    return
def post(self):
    """Handle the download form POST: parse an optional [begin-end]
    chapter range off the URL, pre-flight metadata, then enqueue the
    real download as a /fdowntask task.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    user = users.get_current_user()
    if not user:
        self.redirect(users.create_login_url(self.request.uri))
        return
    format = self.request.get('format')
    url = self.request.get('url')
    if not url or url.strip() == "":
        self.redirect('/')
        return
    # Allow chapter range with URL.
    # test1.com?sid=5[4-6]
    mc = re.match(
        r"^(?P<url>.*?)(?:\[(?P<begin>\d+)?(?P<comma>[,-])?(?P<end>\d+)?\])?$",
        url)
    #print("url:(%s) begin:(%s) end:(%s)"%(mc.group('url'),mc.group('begin'),mc.group('end')))
    url = mc.group('url')
    ch_begin = mc.group('begin')
    ch_end = mc.group('end')
    # "[4]" (begin with no comma/dash) means just that one chapter.
    if ch_begin and not mc.group('comma'):
        ch_end = ch_begin
    logging.info("Queuing Download: %s" % url)
    login = self.request.get('login')
    password = self.request.get('password')
    is_adult = self.request.get('is_adult') == "on"
    # use existing record if available.  Fetched/Created before
    # the adapter can normalize the URL in case we need to record
    # an exception.
    download = getDownloadMeta(url=url, user=user, format=format, new=True)
    adapter = None
    try:
        try:
            configuration = self.getUserConfig(user, url, format)
        except Exception, e:
            self.redirect(
                "/?error=custom&errtext=%s" %
                urlEscape("There's an error in your User Configuration: " + unicode(e)))
            return
        adapter = adapters.getAdapter(configuration, url)
        adapter.setChaptersRange(ch_begin, ch_end)
        # NOTE(review): "adaper" typo is in the log string itself.
        logging.info('Created an adaper: %s' % adapter)
        if login or password:
            adapter.username = login
            adapter.password = password
        adapter.is_adult = is_adult
        ## This scrapes the metadata, which will be
        ## duplicated in the queue task, but it
        ## detects bad URLs, bad login, bad story, etc
        ## without waiting for the queue.  So I think
        ## it's worth the double up.  Could maybe save
        ## it all in the download object someday.
        story = adapter.getStoryMetadataOnly()
        ## Fetch again using normalized story URL.  The one
        ## fetched/created above, if different, will not be saved.
        download = getDownloadMeta(url=story.getMetadata('storyUrl'),
                                   user=user, format=format, new=True)
        download.title = story.getMetadata('title')
        download.author = story.getMetadata('author')
        download.url = story.getMetadata('storyUrl')
        download.ch_begin = ch_begin
        download.ch_end = ch_end
        download.put()
        taskqueue.add(url='/fdowntask',
                      queue_name="download",
                      params={
                          'id': unicode(download.key()),
                          'format': format,
                          'url': download.url,
                          'login': login,
                          'password': password,
                          'user': user.email(),
                          'is_adult': is_adult
                      })
        logging.info("enqueued download key: " + unicode(download.key()))
        # NOTE(review): this chunk ends inside the outer try; its matching
        # except handler is outside the visible source.
def post(self):
    """Handle the download form POST (double-quoted style variant):
    parse an optional chapter range, pre-flight metadata, then enqueue
    the real download as a /fdowntask task.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    user = users.get_current_user()
    if not user:
        self.redirect(users.create_login_url(self.request.uri))
        return
    format = self.request.get("format")
    url = self.request.get("url")
    if not url or url.strip() == "":
        self.redirect("/")
        return
    # Allow chapter range with URL.
    # test1.com?sid=5[4-6]
    mc = re.match(r"^(?P<url>.*?)(?:\[(?P<begin>\d+)?(?P<comma>[,-])?(?P<end>\d+)?\])?$", url)
    # print("url:(%s) begin:(%s) end:(%s)"%(mc.group('url'),mc.group('begin'),mc.group('end')))
    url = mc.group("url")
    ch_begin = mc.group("begin")
    ch_end = mc.group("end")
    # "[4]" (begin with no comma/dash) means just that one chapter.
    if ch_begin and not mc.group("comma"):
        ch_end = ch_begin
    logging.info("Queuing Download: %s" % url)
    login = self.request.get("login")
    password = self.request.get("password")
    is_adult = self.request.get("is_adult") == "on"
    # use existing record if available.  Fetched/Created before
    # the adapter can normalize the URL in case we need to record
    # an exception.
    download = getDownloadMeta(url=url, user=user, format=format, new=True)
    adapter = None
    try:
        try:
            configuration = self.getUserConfig(user, url, format)
        except Exception, e:
            self.redirect(
                "/?error=custom&errtext=%s"
                % urlEscape("There's an error in your User Configuration: " + str(e))
            )
            return
        adapter = adapters.getAdapter(configuration, url)
        adapter.setChaptersRange(ch_begin, ch_end)
        # NOTE(review): "adaper" typo is in the log string itself.
        logging.info("Created an adaper: %s" % adapter)
        if login or password:
            adapter.username = login
            adapter.password = password
        adapter.is_adult = is_adult
        ## This scrapes the metadata, which will be
        ## duplicated in the queue task, but it
        ## detects bad URLs, bad login, bad story, etc
        ## without waiting for the queue.  So I think
        ## it's worth the double up.  Could maybe save
        ## it all in the download object someday.
        story = adapter.getStoryMetadataOnly()
        ## Fetch again using normalized story URL.  The one
        ## fetched/created above, if different, will not be saved.
        download = getDownloadMeta(url=story.getMetadata("storyUrl"),
                                   user=user, format=format, new=True)
        download.title = story.getMetadata("title")
        download.author = story.getMetadata("author")
        download.url = story.getMetadata("storyUrl")
        download.ch_begin = ch_begin
        download.ch_end = ch_end
        download.put()
        taskqueue.add(
            url="/fdowntask",
            queue_name="download",
            params={
                "id": str(download.key()),
                "format": format,
                "url": download.url,
                "login": login,
                "password": password,
                "user": user.email(),
                "is_adult": is_adult,
            },
        )
        logging.info("enqueued download key: " + str(download.key()))
        # NOTE(review): this chunk ends inside the outer try; its matching
        # except handler is outside the visible source.
def post(self):
    """Task-queue worker (/fdowntask): fetch the story, render it, and
    store the output as DownloadData chunks.

    Fixes in this revision:
    - the compress helper was named ``c`` and then shadowed by
      ``for c in download.data_chunks`` before ``blob=c(...)`` was
      called, so chunking crashed whenever old chunks existed; the
      helper is now ``compress`` and the loop variable ``chunk``.
    - restored the scrub-corrupted logging concatenation and the
      comment that had been split mid-sentence.
    - ``except Exception, e`` -> ``except Exception as e``.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    fileId = self.request.get('id')
    # User object can't pass, just email address
    user = users.User(self.request.get('user'))
    format = self.request.get('format')
    url = self.request.get('url')
    login = self.request.get('login')
    password = self.request.get('password')
    is_adult = self.request.get('is_adult')
    # NOTE(review): reconstructed -- this line arrived corrupted
    # ('"for user: "******"ID: "').
    logging.info("Downloading: " + url + " for user: " + user.email() + " ID: " + fileId)
    adapter = None
    writerClass = None
    # use existing record if available.
    # fileId should have record from /fdown.
    download = getDownloadMeta(id=fileId, url=url, user=user, format=format, new=True)
    for chunk in download.data_chunks:
        chunk.delete()
    download.put()
    logging.info('Creating adapter...')
    try:
        configuration = self.getUserConfig(user, url, format)
        adapter = adapters.getAdapter(configuration, url)
        adapter.setChaptersRange(download.ch_begin, download.ch_end)
        logging.info('Created an adapter: %s' % adapter)
        if login or password:
            adapter.username = login
            adapter.password = password
        adapter.is_adult = is_adult
        # adapter.getStory() is what does all the heavy lifting.
        # adapter.getStoryMetadataOnly() only fetches enough to
        # get metadata.  writer.writeStory() will call
        # adapter.getStory(), too.
        writer = writers.getWriter(format, configuration, adapter)
        download.name = writer.getOutputFileName()
        #logging.debug('output_filename:'+writer.getConfig('output_filename'))
        logging.debug('getOutputFileName:' + writer.getOutputFileName())
        download.title = adapter.getStory().getMetadata('title')
        download.author = adapter.getStory().getMetadata('author')
        download.url = adapter.getStory().getMetadata('storyUrl')
        download.put()
        allmeta = adapter.getStory().getAllMetadata(removeallentities=True, doreplacements=False)
        outbuffer = StringIO()
        writer.writeStory(outbuffer)
        data = outbuffer.getvalue()
        outbuffer.close()
        # Release the big objects as soon as possible to limit peak memory.
        del outbuffer
        del writer
        del adapter
        # epubs are all already compressed.  Each chunk is compressed
        # individually to avoid having to hold the whole in memory just
        # for the compress/uncompress.
        if format != 'epub':
            def compress(chunk_bytes):
                return zlib.compress(chunk_bytes)
        else:
            def compress(chunk_bytes):
                return chunk_bytes
        # delete existing chunks first
        for chunk in download.data_chunks:
            chunk.delete()
        # Slice the rendered output into 1,000,000-byte chunks.
        index = 0
        while len(data) > 0:
            DownloadData(download=download,
                         index=index,
                         blob=compress(data[:1000000])).put()
            index += 1
            data = data[1000000:]
        download.completed = True
        download.put()
        # Update (or create) the cached per-story stats record.
        smetal = SavedMeta.all().filter('url =', allmeta['storyUrl']).fetch(1)
        if smetal and smetal[0]:
            smeta = smetal[0]
            smeta.count += 1
        else:
            smeta = SavedMeta()
            smeta.count = 1
        smeta.url = allmeta['storyUrl']
        smeta.title = allmeta['title']
        smeta.author = allmeta['author']
        smeta.meta = allmeta
        smeta.date = datetime.datetime.now()
        smeta.put()
        logging.info("Download finished OK")
        del data
    except Exception as e:
        # Record the failure so the UI can show it; don't retry the task.
        logging.exception(e)
        download.failure = unicode(e)
        download.put()
        return
def post(self):
    """Task-queue worker (/fdowntask, double-quoted style variant):
    fetch the story, render it, and store the output as DownloadData
    chunks.

    Fixes in this revision (same defects as the sibling handler):
    - compress helper ``c`` was shadowed by ``for c in
      download.data_chunks`` before being called; renamed to
      ``compress`` / ``chunk``.
    - restored the scrub-corrupted logging concatenation and the
      comment split mid-sentence.
    - ``except Exception, e`` -> ``except Exception as e``.
    """
    logging.getLogger().setLevel(logging.DEBUG)
    fileId = self.request.get("id")
    # User object can't pass, just email address
    user = users.User(self.request.get("user"))
    format = self.request.get("format")
    url = self.request.get("url")
    login = self.request.get("login")
    password = self.request.get("password")
    is_adult = self.request.get("is_adult")
    # NOTE(review): reconstructed -- this line arrived corrupted
    # ('"for user: "******"ID: "').
    logging.info("Downloading: " + url + " for user: " + user.email() + " ID: " + fileId)
    adapter = None
    writerClass = None
    # use existing record if available.
    # fileId should have record from /fdown.
    download = getDownloadMeta(id=fileId, url=url, user=user, format=format, new=True)
    for chunk in download.data_chunks:
        chunk.delete()
    download.put()
    logging.info("Creating adapter...")
    try:
        configuration = self.getUserConfig(user, url, format)
        adapter = adapters.getAdapter(configuration, url)
        adapter.setChaptersRange(download.ch_begin, download.ch_end)
        logging.info("Created an adapter: %s" % adapter)
        if login or password:
            adapter.username = login
            adapter.password = password
        adapter.is_adult = is_adult
        # adapter.getStory() is what does all the heavy lifting.
        # adapter.getStoryMetadataOnly() only fetches enough to
        # get metadata.  writer.writeStory() will call
        # adapter.getStory(), too.
        writer = writers.getWriter(format, configuration, adapter)
        download.name = writer.getOutputFileName()
        # logging.debug('output_filename:'+writer.getConfig('output_filename'))
        logging.debug("getOutputFileName:" + writer.getOutputFileName())
        download.title = adapter.getStory().getMetadata("title")
        download.author = adapter.getStory().getMetadata("author")
        download.url = adapter.getStory().getMetadata("storyUrl")
        download.put()
        allmeta = adapter.getStory().getAllMetadata(removeallentities=True, doreplacements=False)
        outbuffer = StringIO()
        writer.writeStory(outbuffer)
        data = outbuffer.getvalue()
        outbuffer.close()
        # Release the big objects as soon as possible to limit peak memory.
        del outbuffer
        del writer
        del adapter
        # epubs are all already compressed.  Each chunk is compressed
        # individually to avoid having to hold the whole in memory just
        # for the compress/uncompress.
        if format != "epub":
            def compress(chunk_bytes):
                return zlib.compress(chunk_bytes)
        else:
            def compress(chunk_bytes):
                return chunk_bytes
        # delete existing chunks first
        for chunk in download.data_chunks:
            chunk.delete()
        # Slice the rendered output into 1,000,000-byte chunks.
        index = 0
        while len(data) > 0:
            DownloadData(download=download,
                         index=index,
                         blob=compress(data[:1000000])).put()
            index += 1
            data = data[1000000:]
        download.completed = True
        download.put()
        # Update (or create) the cached per-story stats record.
        smetal = SavedMeta.all().filter("url =", allmeta["storyUrl"]).fetch(1)
        if smetal and smetal[0]:
            smeta = smetal[0]
            smeta.count += 1
        else:
            smeta = SavedMeta()
            smeta.count = 1
        smeta.url = allmeta["storyUrl"]
        smeta.title = allmeta["title"]
        smeta.author = allmeta["author"]
        smeta.meta = allmeta
        smeta.date = datetime.datetime.now()
        smeta.put()
        logging.info("Download finished OK")
        del data
    except Exception as e:
        # Record the failure so the UI can show it; don't retry the task.
        logging.exception(e)
        download.failure = unicode(e)
        download.put()
        return
def do_download(arg, options, passed_defaultsini, passed_personalini, warn=print, fail=print):
    """Download (or update) one story given a URL or an existing epub path.

    arg -- story URL, or path of an epub to update (with options.update).
    options -- parsed CLI options namespace.
    passed_defaultsini / passed_personalini -- ini text for configuration.
    warn / fail -- callables for non-fatal / fatal messages (default print).
    """
    # Attempt to update an existing epub.
    chaptercount = None
    output_filename = None

    if options.unnew:
        # remove mark_new_chapters marks
        reset_orig_chapters_epub(arg, arg)
        return

    if options.update:
        try:
            url, chaptercount = get_dcsource_chaptercount(arg)
            if not url:
                fail('No story URL found in epub to update.')
                return
            print('Updating %s, URL: %s' % (arg, url))
            output_filename = arg
        except Exception:
            # if there's an error reading the update file, maybe it's a URL?
            # we'll look for an existing outputfile down below.
            url = arg
    else:
        url = arg

    configuration = get_configuration(url, passed_defaultsini,
                                      passed_personalini, options,
                                      chaptercount, output_filename)
    try:
        # Allow chapter range with URL.
        # like test1.com?sid=5[4-6] or [4,6]
        # Overrides CLI options if present.
        url, ch_begin, ch_end = adapters.get_url_chapter_range(url)
        adapter = adapters.getAdapter(configuration, url)
        # url[begin-end] overrides CLI option if present.
        if ch_begin or ch_end:
            adapter.setChaptersRange(ch_begin, ch_end)
        else:
            adapter.setChaptersRange(options.begin, options.end)

        # check for updating from URL (vs from file)
        update_story = options.update
        if update_story and not chaptercount:
            try:
                writer = writers.getWriter('epub', configuration, adapter)
                output_filename = writer.getOutputFileName()
                noturl, chaptercount = get_dcsource_chaptercount(output_filename)
                print('Updating %s, URL: %s' % (output_filename, url))
            except Exception as e:
                warn("Failed to read epub for update: (%s) Continuing with update=false" % e)
                update_story = False

        # Check for include_images without no_image_processing.
        # In absence of PIL, give warning.
        if adapter.getConfig('include_images') and not adapter.getConfig(
                'no_image_processing'):
            try:
                from calibre.utils.magick import Image
            except ImportError:
                try:
                    ## Pillow is a more current fork of PIL library
                    from PIL import Image
                except ImportError:
                    try:
                        import Image
                    except ImportError:
                        print(
                            "You have include_images enabled, but Python Image Library(PIL) isn't found.\nImages will be included full size in original format.\nContinue? (y/n)?"
                        )
                        if options.interactive:
                            if not sys.stdin.readline().strip().lower(
                            ).startswith('y'):
                                return
                        else:
                            # for non-interactive, default the response
                            # to yes and continue processing
                            print('y')

        # three tries, that's enough if both user/pass & is_adult needed,
        # or a couple tries of one or the other
        for x in range(0, 2):
            try:
                adapter.getStoryMetadataOnly()
            except exceptions.FailedToLogin as f:
                if not options.interactive:
                    print(
                        'Login Failed on non-interactive process. Set username and password in personal.ini.'
                    )
                    return
                if f.passwdonly:
                    print('Story requires a password.')
                else:
                    print('Login Failed, Need Username/Password.')
                    sys.stdout.write('Username: ')
                    adapter.username = sys.stdin.readline().strip()
                # NOTE(review): this prompt sequence was scrubbed in the
                # transcription; reconstructed with getpass so the password
                # is not echoed -- confirm against upstream.
                import getpass
                adapter.password = getpass.getpass(prompt='Password: ')
                # NOTE(review): never log credentials in clear text; the
                # original debug line is kept only as a comment.
                # print('Login: `%s`, Password: `%s`' % (adapter.username, adapter.password))
            except exceptions.AdultCheckRequired:
                if options.interactive:
                    print(
                        'Please confirm you are an adult in your locale: (y/n)?'
                    )
                    if sys.stdin.readline().strip().lower().startswith('y'):
                        adapter.is_adult = True
                else:
                    print(
                        'Adult check required on non-interactive process. Set is_adult:true in personal.ini or pass -o "is_adult=true" to the command.'
                    )
                    return

        if update_story and not options.force:
            # BUG FIX: removed a dead store that fetched numChapters via
            # getMetadata() and was immediately overwritten.
            # getChapterCount() returns an int already adjusted for the
            # start-end range.
            urlchaptercount = adapter.getStoryMetadataOnly().getChapterCount()
            if chaptercount == urlchaptercount and not options.metaonly and not options.updatealways:
                print('%s already contains %d chapters.'
                      % (output_filename, chaptercount))
            elif chaptercount > urlchaptercount:
                warn('%s contains %d chapters, more than source: %d.'
                     % (output_filename, chaptercount, urlchaptercount))
            elif chaptercount == 0:
                warn(
                    "%s doesn't contain any recognizable chapters, probably from a different source. Not updating."
                    % output_filename)
            else:
                # update now handled by pre-populating the old
                # images and chapters in the adapter rather than
                # merging epubs.
                (url, chaptercount,
                 adapter.oldchapters,
                 adapter.oldimgs,
                 adapter.oldcover,
                 adapter.calibrebookmark,
                 adapter.logfile,
                 adapter.oldchaptersmap,
                 adapter.oldchaptersdata) = (get_update_data(output_filename))[0:9]

                print('Do update - epub(%d) vs url(%d)'
                      % (chaptercount, urlchaptercount))

                # BUG FIX: `not update_story and ...` could never be true
                # inside this update-only branch; restored the intended
                # grouping so the hook fires when chapter counts differ.
                if not (update_story and chaptercount == urlchaptercount) \
                        and adapter.getConfig('do_update_hook'):
                    chaptercount = adapter.hookForUpdates(chaptercount)

                # NOTE(review): the pre_process_cmd guard appears to have
                # been lost in transcription (without it, call() would run
                # an empty command); restored to match the non-update
                # branch below -- confirm against upstream.
                if not options.metaonly and adapter.getConfig('pre_process_cmd'):
                    if adapter.getConfig('pre_process_safepattern'):
                        metadata = adapter.story.get_filename_safe_metadata(
                            pattern=adapter.getConfig('pre_process_safepattern'))
                    else:
                        metadata = adapter.story.getAllMetadata()
                    call(string.Template(
                        adapter.getConfig('pre_process_cmd')).substitute(metadata),
                        shell=True)

                output_filename = write_story(configuration, adapter, 'epub',
                                              nooutput=options.nooutput)
        else:
            if not options.metaonly and adapter.getConfig('pre_process_cmd'):
                if adapter.getConfig('pre_process_safepattern'):
                    metadata = adapter.story.get_filename_safe_metadata(
                        pattern=adapter.getConfig('pre_process_safepattern'))
                else:
                    metadata = adapter.story.getAllMetadata()
                call(string.Template(
                    adapter.getConfig('pre_process_cmd')).substitute(metadata),
                    shell=True)

            output_filename = write_story(configuration, adapter,
                                          options.format,
                                          metaonly=options.metaonly,
                                          nooutput=options.nooutput)

            if options.metaonly and not options.jsonmeta:
                metadata = adapter.getStoryMetadataOnly().getAllMetadata()
                metadata['output_filename'] = output_filename
                if not options.nometachapters:
                    metadata['zchapters'] = []
                    for i, chap in enumerate(adapter.get_chapters()):
                        metadata['zchapters'].append((i + 1, chap))
                else:
                    # If no chapters, also suppress output_css so
                    # metadata is shorter.
                    del metadata['output_css']
                pprint.pprint(metadata)

        if not options.metaonly and adapter.getConfig('post_process_cmd'):
            if adapter.getConfig('post_process_safepattern'):
                metadata = adapter.story.get_filename_safe_metadata(
                    pattern=adapter.getConfig('post_process_safepattern'))
            else:
                metadata = adapter.story.getAllMetadata()
            metadata['output_filename'] = output_filename
            call(string.Template(
                adapter.getConfig('post_process_cmd')).substitute(metadata),
                shell=True)

        if options.jsonmeta or options.jsonmetafile:
            metadata = adapter.getStoryMetadataOnly().getAllMetadata()
            metadata['output_filename'] = output_filename
            if not options.nometachapters:
                metadata['zchapters'] = []
                for i, chap in enumerate(adapter.get_chapters()):
                    metadata['zchapters'].append((i + 1, chap))
            import json
            if options.jsonmeta:
                print(
                    json.dumps(metadata, sort_keys=True, indent=2,
                               separators=(',', ':')))
            if options.jsonmetafile:
                with open(output_filename + ".json", "w") as jsonfile:
                    json.dump(metadata, jsonfile, sort_keys=True, indent=2,
                              separators=(',', ':'))

        if adapter.story.chapter_error_count > 0:
            warn(
                "===================\n!!!! %s chapters errored downloading %s !!!!\n==================="
                % (adapter.story.chapter_error_count, url))

        del adapter
    except exceptions.InvalidStoryURL as isu:
        fail(isu)
    except exceptions.StoryDoesNotExist as dne:
        fail(dne)
    except exceptions.UnknownSite as us:
        fail(us)
    except exceptions.AccessDenied as ad:
        fail(ad)
def do_download_for_worker(book, options, merge, notification=lambda x, y: x):
    '''
    Child job, to download story when run as a worker job.

    book -- dict describing one story (url, collision mode, credentials,
            outfile, optional epub_for_update path, ...); mutated in place
            with metadata, status and comment, then returned.
    options -- dict of job-wide settings (fileform, personal.ini, caches,
               collision prefs, ...); shared cache/cookiejar objects are
               stored back into it on first use.
    merge -- truthy when part of an anthology merge (changes UPDATE
             "already contains" handling to reuse the existing epub).
    notification -- progress callback passed through to writeStory().
    '''
    from calibre_plugins.fanficfare_plugin import FanFicFareBase
    fffbase = FanFicFareBase(options['plugin_path'])
    with fffbase:  # so the sys.path was modified while loading the
                   # plug impl.
        from calibre_plugins.fanficfare_plugin.dialogs import NotGoingToDownload
        from calibre_plugins.fanficfare_plugin.prefs import (
            SAVE_YES, SAVE_YES_UNLESS_SITE, OVERWRITE, OVERWRITEALWAYS,
            UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY,
            CALIBREONLYSAVECOL)
        from calibre_plugins.fanficfare_plugin.wordcount import get_word_count
        from fanficfare import adapters, writers
        from fanficfare.epubutils import get_update_data
        from fanficfare.six import text_type as unicode
        from calibre_plugins.fanficfare_plugin.fff_util import get_fff_config

        try:
            logger.info("\n\n" + ("-" * 80) + " " + book['url'])
            ## No need to download at all.  Can happen now due to
            ## collision moving into book for CALIBREONLY changing to
            ## ADDNEW when story URL not in library.
            if book['collision'] in (CALIBREONLY, CALIBREONLYSAVECOL):
                logger.info("Skipping CALIBREONLY 'update' down inside worker")
                return book

            book['comment'] = _('Download started...')

            configuration = get_fff_config(book['url'], options['fileform'],
                                           options['personal.ini'])

            # Don't re-make the cover when only updating an existing epub
            # unless the user asked for updated covers.
            if not options[
                    'updateepubcover'] and 'epub_for_update' in book and book[
                        'collision'] in (UPDATE, UPDATEALWAYS):
                configuration.set("overrides", "never_make_cover", "true")

            # images only for epub, html, even if the user mistakenly
            # turned it on else where.
            if options['fileform'] not in ("epub", "html"):
                configuration.set("overrides", "include_images", "false")

            adapter = adapters.getAdapter(configuration, book['url'])
            adapter.is_adult = book['is_adult']
            adapter.username = book['username']
            adapter.password = book['password']
            adapter.setChaptersRange(book['begin'], book['end'])

            ## each site download job starts with a new copy of the
            ## cookiejar and basic_cache from the FG process.  They
            ## are not shared between different sites' BG downloads
            # NOTE(review): first job for a site loads the cache/cookiejar
            # from file and stores it into `options` so later jobs in this
            # worker reuse the same object -- confirm against caller.
            if configuration.getConfig('use_browser_cache'):
                if 'browser_cache' in options:
                    configuration.set_browser_cache(options['browser_cache'])
                else:
                    options['browser_cache'] = configuration.get_browser_cache()
                    if 'browser_cachefile' in options:
                        options['browser_cache'].load_cache(
                            options['browser_cachefile'])
            if 'basic_cache' in options:
                configuration.set_basic_cache(options['basic_cache'])
            else:
                options['basic_cache'] = configuration.get_basic_cache()
                options['basic_cache'].load_cache(options['basic_cachefile'])
            if 'cookiejar' in options:
                configuration.set_cookiejar(options['cookiejar'])
            else:
                options['cookiejar'] = configuration.get_cookiejar()
                options['cookiejar'].load_cookiejar(options['cookiejarfile'])

            story = adapter.getStoryMetadataOnly()
            if not story.getMetadata("series") and 'calibre_series' in book:
                adapter.setSeries(book['calibre_series'][0],
                                  book['calibre_series'][1])

            # set PI version instead of default.
            if 'version' in options:
                story.setMetadata('version', options['version'])

            # Copy story metadata into the book dict for calibre's columns.
            book['title'] = story.getMetadata("title", removeallentities=True)
            book['author_sort'] = book['author'] = story.getList(
                "author", removeallentities=True)
            book['publisher'] = story.getMetadata("publisher")
            book['url'] = story.getMetadata("storyUrl", removeallentities=True)
            book['tags'] = story.getSubjectTags(removeallentities=True)
            book['comments'] = story.get_sanitized_description()
            book['series'] = story.getMetadata("series", removeallentities=True)

            if story.getMetadataRaw('datePublished'):
                book['pubdate'] = story.getMetadataRaw(
                    'datePublished').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateUpdated'):
                book['updatedate'] = story.getMetadataRaw(
                    'dateUpdated').replace(tzinfo=local_tz)
            if story.getMetadataRaw('dateCreated'):
                book['timestamp'] = story.getMetadataRaw(
                    'dateCreated').replace(tzinfo=local_tz)
            else:
                # need *something* there for calibre.
                book['timestamp'] = datetime.now().replace(tzinfo=local_tz)

            writer = writers.getWriter(options['fileform'], configuration,
                                       adapter)
            outfile = book['outfile']

            ## checks were done earlier, it's new or not dup or newer--just write it.
            if book['collision'] in (ADDNEW, SKIP, OVERWRITE, OVERWRITEALWAYS) or \
                    ('epub_for_update' not in book and
                     book['collision'] in (UPDATE, UPDATEALWAYS)):

                # preserve logfile even on overwrite.
                if 'epub_for_update' in book:
                    adapter.logfile = get_update_data(
                        book['epub_for_update'])[6]
                    # change the existing entries id to notid so
                    # write_epub writes a whole new set to indicate overwrite.
                    if adapter.logfile:
                        adapter.logfile = adapter.logfile.replace(
                            "span id", "span notid")

                if book['collision'] == OVERWRITE and 'fileupdated' in book:
                    lastupdated = story.getMetadataRaw('dateUpdated')
                    fileupdated = book['fileupdated']

                    # updated doesn't have time (or is midnight), use dates only.
                    # updated does have time, use full timestamps.
                    if (lastupdated.time() == time.min
                            and fileupdated.date() > lastupdated.date()) or \
                       (lastupdated.time() != time.min
                            and fileupdated > lastupdated):
                        raise NotGoingToDownload(
                            _("Not Overwriting, web site is not newer."),
                            'edit-undo.png',
                            showerror=False)

                logger.info("write to %s" % outfile)
                inject_cal_cols(book, story, configuration)
                writer.writeStory(outfilename=outfile,
                                  forceOverwrite=True,
                                  notification=notification)

                if adapter.story.chapter_error_count > 0:
                    book['comment'] = _('Download %(fileform)s completed, %(failed)s failed chapters, %(total)s total chapters.')%\
                        {'fileform':options['fileform'],
                         'failed':adapter.story.chapter_error_count,
                         'total':story.getMetadata("numChapters")}
                    book[
                        'chapter_error_count'] = adapter.story.chapter_error_count
                else:
                    book['comment'] = _('Download %(fileform)s completed, %(total)s chapters.')%\
                        {'fileform':options['fileform'],
                         'total':story.getMetadata("numChapters")}
                book['all_metadata'] = story.getAllMetadata(
                    removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            ## checks were done earlier, just update it.
            elif 'epub_for_update' in book and book['collision'] in (
                    UPDATE, UPDATEALWAYS):

                # update now handled by pre-populating the old images and
                # chapters in the adapter rather than merging epubs.
                #urlchaptercount = int(story.getMetadata('numChapters').replace(',',''))
                # returns int adjusted for start-end range.
                urlchaptercount = story.getChapterCount()
                (url, chaptercount, adapter.oldchapters, adapter.oldimgs,
                 adapter.oldcover, adapter.calibrebookmark, adapter.logfile,
                 adapter.oldchaptersmap, adapter.oldchaptersdata) = get_update_data(
                     book['epub_for_update'])[0:9]

                # dup handling from fff_plugin needed for anthology updates.
                if book['collision'] == UPDATE:
                    if chaptercount == urlchaptercount:
                        if merge:
                            book['comment'] = _(
                                "Already contains %d chapters. Reuse as is."
                            ) % chaptercount
                            book['all_metadata'] = story.getAllMetadata(
                                removeallentities=True)
                            if options['savemetacol'] != '':
                                book['savemetacol'] = story.dump_html_metadata()
                            book['outfile'] = book[
                                'epub_for_update']  # for anthology merge ops.
                            return book
                        else:  # not merge,
                            raise NotGoingToDownload(
                                _("Already contains %d chapters.") %
                                chaptercount,
                                'edit-undo.png',
                                showerror=False)
                    elif chaptercount > urlchaptercount:
                        raise NotGoingToDownload(
                            _("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update."
                              ) % (chaptercount, urlchaptercount),
                            'dialog_error.png')
                    elif chaptercount == 0:
                        raise NotGoingToDownload(
                            _("FanFicFare doesn't recognize chapters in existing epub, epub is probably from a different source. Use Overwrite to force update."
                              ), 'dialog_error.png')

                # Skip the update hook only when UPDATEALWAYS found nothing new.
                if not (book['collision'] == UPDATEALWAYS
                        and chaptercount == urlchaptercount) \
                        and adapter.getConfig("do_update_hook"):
                    chaptercount = adapter.hookForUpdates(chaptercount)

                logger.info("Do update - epub(%d) vs url(%d)" %
                            (chaptercount, urlchaptercount))
                logger.info("write to %s" % outfile)

                inject_cal_cols(book, story, configuration)
                writer.writeStory(outfilename=outfile,
                                  forceOverwrite=True,
                                  notification=notification)

                if adapter.story.chapter_error_count > 0:
                    book['comment'] = _('Update %(fileform)s completed, added %(added)s chapters, %(failed)s failed chapters, for %(total)s total.')%\
                        {'fileform':options['fileform'],
                         'failed':adapter.story.chapter_error_count,
                         'added':(urlchaptercount-chaptercount),
                         'total':urlchaptercount}
                    book[
                        'chapter_error_count'] = adapter.story.chapter_error_count
                else:
                    book['comment'] = _('Update %(fileform)s completed, added %(added)s chapters for %(total)s total.')%\
                        {'fileform':options['fileform'],'added':(urlchaptercount-chaptercount),'total':urlchaptercount}
                book['all_metadata'] = story.getAllMetadata(
                    removeallentities=True)
                if options['savemetacol'] != '':
                    book['savemetacol'] = story.dump_html_metadata()

            else:
                ## Shouldn't ever get here, but hey, it happened once
                ## before with prefs['collision']
                raise Exception(
                    "Impossible state reached -- Book: %s:\nOptions:%s:" %
                    (book, options))

            # Optionally count words from the written file and re-write so
            # the count lands in the output metadata.
            if options['do_wordcount'] == SAVE_YES or (
                    options['do_wordcount'] == SAVE_YES_UNLESS_SITE
                    and not story.getMetadataRaw('numWords')):
                try:
                    wordcount = get_word_count(outfile)
                    # logger.info("get_word_count:%s"%wordcount)
                    story.setMetadata('numWords', wordcount)
                    writer.writeStory(outfilename=outfile, forceOverwrite=True)
                    book['all_metadata'] = story.getAllMetadata(
                        removeallentities=True)
                    if options['savemetacol'] != '':
                        book['savemetacol'] = story.dump_html_metadata()
                except:
                    # best-effort: word count failure must not fail the job.
                    logger.error("WordCount failed")

            if options['smarten_punctuation'] and options['fileform'] == "epub" \
                    and calibre_version >= (0, 9, 39):
                # for smarten punc
                from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
                from calibre.utils.logging import Log
                from collections import namedtuple

                # do smarten_punctuation from calibre's polish feature
                data = {'smarten_punctuation': True}
                opts = ALL_OPTS.copy()
                opts.update(data)
                O = namedtuple('Options', ' '.join(six.iterkeys(ALL_OPTS)))
                opts = O(**opts)

                log = Log(level=Log.DEBUG)
                polish({outfile: outfile}, opts, log, logger.info)

        except NotGoingToDownload as d:
            # Expected skip condition: mark book bad-but-not-error.
            book['good'] = False
            book['status'] = _('Bad')
            book['showerror'] = d.showerror
            book['comment'] = unicode(d)
            book['icon'] = d.icon
        except Exception as e:
            book['good'] = False
            book['status'] = _('Error')
            book['comment'] = unicode(e)
            book['icon'] = 'dialog_error.png'
            book['status'] = _('Error')
            logger.info("Exception: %s:%s" % (book, book['comment']),
                        exc_info=True)

    return book