from datetime import date, timedelta
from time import time
from urlgrab import Cache


def setup(options):
    from episodes_pb2 import All

    # Load the episode database from its serialised protobuf file.
    db = All()
    db.ParseFromString(open(options.database, "rb").read())

    # Yesterday, as a time tuple.
    yesterday = date.fromtimestamp(time()) - timedelta(days=1)
    yesterday = yesterday.timetuple()

    # Shared HTTP cache, presenting an ordinary Firefox user agent.
    cache = Cache(debug=options.debug)
    cache.user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"

    # Expose the helpers both as module globals and as a dict for the caller.
    items = {"yesterday": yesterday, "cache": cache, "db": db}
    for x in items:
        globals()[x] = items[x]
    return items
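# Illustrative sketch, not part of the original module: one way setup() could
# be driven from the command line. The flag names are assumptions; all that
# setup() actually needs is an object with .database and .debug attributes.
if __name__ == "__main__":
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("--database", dest="database",
                      help="path to the serialised episodes_pb2.All file")
    parser.add_option("--debug", dest="debug", action="store_true", default=False)
    (options, args) = parser.parse_args()

    items = setup(options)
    print "episode database: %d bytes" % items["db"].ByteSize()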
from common import *
from sys import argv
from urlgrab import Cache
from re import compile, DOTALL, MULTILINE

cache = Cache()
url = argv[1]

# Patterns for pulling a Literotica story apart: title, body, "Next" link,
# chapter numbering, the author's submissions page and per-chapter links.
titlePattern = compile("<h1>([^<]+)</h1>")
contentPattern = compile("<div class=\"b-story-body-x x-r15\">(.+?)</div><div class=\"b-story-stats-block\">", DOTALL | MULTILINE)
nextPattern = compile("\"([^\"]+)\">Next</a>")
chapterPattern = compile("(.*?) (?:Ch\.|Pt\.) (\d+)")
memberPattern = compile("<a href=\"(https://www.literotica.com/stories/memberpage\.php\?uid=\d+&page=submissions)\">([^<]+)</a>")
chapterLinkPattern = compile("href=\"(https://www.literotica.com/s/[^\"]+)\">([^<]+)</a>")

# Fetch the starting page and keep a raw dump for debugging.
page = cache.get(url, max_age=-1)
data = page.read()
open("dump", "wb").write(data.encode("utf-8"))

# The <h1> is the story title; strip any "Ch. n" / "Pt. n" suffix to get the
# series title.
title = titlePattern.findall(data)
title = title[0]
chapter = chapterPattern.match(title)
if chapter is not None:
    title = chapter.groups()[0]
currentChapter = 1
print title
toc = tocStart(title)
from sys import argv
from urlgrab import Cache
from codecs import open
import re
from common import *
from urlparse import urljoin

cache = Cache()
url = argv[1]

# Pull the numeric work id out of the URL and fetch the work's chapter index.
id = re.search("/works/(\d+)", url)
id = id.groups()[0]
navigate = "http://archiveofourown.org/works/%s/navigate" % id
print navigate
data = cache.get(navigate).read()
data = data.decode("utf-8")

# Title and author come from the "Chapter Index for <title> by <author>" heading.
info = re.search("<h2 class=\"heading\">Chapter Index for <a href=\"/works/\d+\">([^<]+)</a> by <a href=\"[^\"]+\" rel=\"author\">([^<]+)</a></h2>", data)
(title, author) = info.groups()

# Patterns for the individual chapter pages: heading, summary, notes and body.
titlePattern = re.compile("<h2 class=\"title heading\">\s+(.*?)\s+</h2>")
summary = re.compile("<div[^>]+?class=\"summary module\"[^>]*?>(.+?)</div>", re.DOTALL | re.MULTILINE)
notes = re.compile("<div.+?class=\"notes module\"[^>]*>(.+?)</div>", re.DOTALL | re.MULTILINE)
mainContent = re.compile("<h3 class=\"landmark heading\" id=\"work\">Chapter Text</h3>(.*?)<!--/main-->", re.DOTALL | re.MULTILINE)

# Chapter index entries look like <li><a href="/works/ID/chapters/ID">N. Title</a>;
# turn them into a dict keyed by chapter number.
volumePattern = re.compile("<li><a href=\"(/works/\d+/chapters/\d+)\">(\d+)\. ([^<]+)</a>")
volumes = sorted(volumePattern.findall(data))
print volumes
volumes = dict([(int(x[1]), (x[0], x[2])) for x in volumes])
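# Illustrative sketch, not part of the original script: one way the volumes
# mapping and the patterns above could be combined to pull down each chapter's
# text. Only the cache, the patterns and the volumes dict come from the code
# above; the loop itself is an assumption.
for number in sorted(volumes):
    (path, chapterTitle) = volumes[number]
    chapterData = cache.get(urljoin(navigate, path)).read().decode("utf-8")
    body = mainContent.search(chapterData)
    if body is not None:
        print "chapter %d: %s (%d characters)" % (number, chapterTitle, len(body.group(1)))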
# -*- coding: utf-8 -*-
from urlgrab import Cache
from google.protobuf import text_format
from blog_pb2 import All
from re import compile, DOTALL, MULTILINE
from os.path import exists, join
from codecs import open
from urlparse import urljoin
from optparse import OptionParser
from common import generatePage, tocStart, tocEnd, makeMobi

c = Cache()
c.user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"

# The series definitions live in series.txt as text-format protobuf.
db = All()
text_format.Merge(open("series.txt", "rb", "utf-8").read(), db)

stripTags = compile("<[^>]+>")
stripAnchorTags = compile("(?:<a[^>]+>)|(?:</a>)")

# Kindle doesn't like various characters, so let's rewrite some of them...
wrong = {
    u"“": u"\"",
    u"’": u"'",
    u"â€": u"\"",
    u"‘": u"'",
    u"—": u" - ",
    u"…": u"-",
    u"": u"",
    u'“': u"\"",
    u'”': u"\"",
    u'–': u"-",
from common import *
from re import compile, DOTALL, MULTILINE
from urlgrab import Cache
from urlparse import urljoin

# Patterns for The Register's BOFH archive: episode links on the index pages,
# the "Earlier Stories" pager, and the pieces of each article page.
linkPattern = compile("<h3><a href=\"(/[^\"]+)\">(.+?)</a></h3>")
earlierPattern = compile("<a href='([^\']+)'>.+?Earlier Stories.+?</a>", DOTALL | MULTILINE)
titlePattern = compile("<h2>(.+?)</h2>")
subtitlePattern = compile("<p class=\"standfirst\">(.+?)</p>")
contentPattern = compile("<strong class=\"trailer\">.+?</p>(.+?)(?:(?:<p>(?:(?:<i>)|(?:<small>)|(?:<font size=\"-2\">)|(?:<br>\n))?BOFH .+? Simon Travaglia)|(?:<ul class=\"noindent\">)|(?:<ul>.+?<li><a href=\"http://www.theregister.co.uk/content/30/index.html\">BOFH: The whole shebang</a></li>)|(?:</form>))", DOTALL | MULTILINE)
adPattern = compile("(<div id=ad-mu1-spot>.+?</div>)", MULTILINE | DOTALL)
episodePattern = compile("<strong class=\"trailer\">Episode \d+")

url = "http://www.theregister.co.uk/data_centre/bofh/"
pages = [url]
cache = Cache()

# Follow the "Earlier Stories" links backwards through the archive, building
# up the list of index pages oldest-first.
while True:
    print url
    data = cache.get(url).read()
    links = linkPattern.findall(data)
    if links == []:
        break
    pages.insert(0, url)
    earlier = earlierPattern.findall(data)
    url = urljoin(url, earlier[0])

# Titles to skip when walking the index pages.
skipTitles = ["Salmon Days is Go!"]
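# Illustrative sketch, not part of the original script: one way the collected
# index pages could be walked oldest-first to fetch individual episodes. Only
# the patterns, cache, pages and skipTitles come from the code above; the loop
# itself is an assumption.
for page in pages:
    listing = cache.get(page).read()
    for (href, linkTitle) in linkPattern.findall(listing):
        if linkTitle in skipTitles:
            continue
        episode = cache.get(urljoin(page, href)).read()
        if episodePattern.search(episode) is None:
            continue  # not a numbered BOFH episode
        title = titlePattern.findall(episode)
        content = contentPattern.findall(episode)
        if title and content:
            print title[0], len(content[0])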
from sys import argv
from urlgrab import Cache
from codecs import open
import re

cache = Cache()
url = argv[1]
data = cache.get(url).read()
open("dump", "wb", "utf-8").write(data)

# Pull the story title, author and numeric story id out of the chapter page.
title = re.search("<title>(.+?) Chapter \d+", data)
title = title.groups()
author = re.search("By:</span> <a[^>]+?href='/u/\d+/[^']+'>([^<]+)</a>", data)
author = author.groups()[0]
id = re.search("/s/(\d+)", url)
id = id.groups()[0]

# Print a text-format series definition pointing at the mobile site.
print """series {
  name: "%s"
  description: "%s"
  author: "%s"
  startPage: "http://m.fanfiction.net/s/%s/1"
  titlePattern: "<img src='/[^']+/balloon.png' class='mt icons'>[\d,]+</a></span>(.+?)<br>"
  contentPattern: "id='storycontent' >(.+?)</div></div>.*?<hr size=1"
  nextPattern: "<a href='(/s/\d+/\d+/)'>Next »</a>"
}""" % (title[0].replace(" ", ""), title[0], author, id)
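# Illustrative sketch, not part of the original script: the block printed above
# is Protocol Buffers text format, so it can be parsed back with the same
# text_format.Merge call the series builder uses. That the output belongs in
# series.txt, and that blog_pb2.All exposes a repeated "series" field, are
# assumptions inferred from the template above.
from google.protobuf import text_format
from blog_pb2 import All

check = All()
text_format.Merge(open("series.txt", "rb", "utf-8").read(), check)
for entry in check.series:
    print entry.name, entry.startPage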
#!/usr/bin/python
from urlgrab import Cache
from BeautifulSoup import MinimalSoup as BeautifulSoup
from re import compile
from os.path import exists, getsize, dirname, join
from urllib import urlretrieve, urlencode, quote
from sys import argv
import demjson
import zlib

folder = dirname(argv[0])
cache = Cache(debug=False)
pages = []

# Walk the first ten index pages of the Zero Punctuation archive and pull the
# short numeric identifier out of each episode link.
for index in range(1, 11):
    index = cache.get("http://www.escapistmagazine.com/videos/view/zero-punctuation?page=%d" % index, max_age=60*60*2).read()
    index = index.replace("''>", "'>")  # work around doubled quotes that confuse the parser
    index = BeautifulSoup(index)
    for link in index.findAll("a"):
        if not link.has_key("href"):
            continue
        if link["href"].find("http://www.escapistmagazine.com/videos/view/zero-punctuation/") != -1:
            short_href = link["href"]
            slash = short_href.rfind("/")
            if short_href[slash:].find("-") != -1:
                short_href = short_href[slash+1:slash+short_href[slash:].find("-")]
            else: