from datetime import date, timedelta
from time import time
from urlgrab import Cache


def setup(options):
    from episodes_pb2 import All

    # Load the episode database from its serialised protobuf file.
    db = All()
    db.ParseFromString(open(options.database, "rb").read())

    # Yesterday, as a time tuple.
    yesterday = date.fromtimestamp(time()) - timedelta(days=1)
    yesterday = yesterday.timetuple()

    # Shared HTTP cache, presenting an ordinary Firefox user agent.
    cache = Cache(debug=options.debug)
    cache.user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"

    # Expose the helpers both as module globals and as a dict for the caller.
    items = {"yesterday": yesterday, "cache": cache, "db": db}
    for x in items:
        globals()[x] = items[x]
    return items
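# Illustrative sketch, not part of the original module: one way setup() could
# be driven from the command line. The flag names are assumptions; all that
# setup() actually needs is an object with .database and .debug attributes.
if __name__ == "__main__":
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("--database", dest="database",
                      help="path to the serialised episodes_pb2.All file")
    parser.add_option("--debug", dest="debug", action="store_true", default=False)
    (options, args) = parser.parse_args()

    items = setup(options)
    print "episode database: %d bytes" % items["db"].ByteSize()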
from common import *
from sys import argv
from urlgrab import Cache
from re import compile, DOTALL, MULTILINE

cache = Cache()
url = argv[1]

# Patterns for pulling a Literotica story apart: title, body, "Next" link,
# chapter numbering, the author's submissions page and per-chapter links.
titlePattern = compile("<h1>([^<]+)</h1>")
contentPattern = compile("<div class=\"b-story-body-x x-r15\">(.+?)</div><div class=\"b-story-stats-block\">", DOTALL | MULTILINE)
nextPattern = compile("\"([^\"]+)\">Next</a>")
chapterPattern = compile("(.*?) (?:Ch\.|Pt\.) (\d+)")
memberPattern = compile("<a href=\"(https://www.literotica.com/stories/memberpage\.php\?uid=\d+&page=submissions)\">([^<]+)</a>")
chapterLinkPattern = compile("href=\"(https://www.literotica.com/s/[^\"]+)\">([^<]+)</a>")

# Fetch the starting page and keep a raw dump for debugging.
page = cache.get(url, max_age=-1)
data = page.read()
open("dump", "wb").write(data.encode("utf-8"))

# The <h1> is the story title; strip any "Ch. n" / "Pt. n" suffix to get the
# series title.
title = titlePattern.findall(data)
title = title[0]
chapter = chapterPattern.match(title)
if chapter is not None:
    title = chapter.groups()[0]
currentChapter = 1
print title
toc = tocStart(title)
from sys import argv
from urlgrab import Cache
from codecs import open
import re
from common import *
from urlparse import urljoin

cache = Cache()
url = argv[1]

# Pull the numeric work id out of the URL and fetch the work's chapter index.
id = re.search("/works/(\d+)", url)
id = id.groups()[0]
navigate = "http://archiveofourown.org/works/%s/navigate" % id
print navigate
data = cache.get(navigate).read()
data = data.decode("utf-8")

# Title and author come from the "Chapter Index for <title> by <author>" heading.
info = re.search("<h2 class=\"heading\">Chapter Index for <a href=\"/works/\d+\">([^<]+)</a> by <a href=\"[^\"]+\" rel=\"author\">([^<]+)</a></h2>", data)
(title, author) = info.groups()

# Patterns for the individual chapter pages: heading, summary, notes and body.
titlePattern = re.compile("<h2 class=\"title heading\">\s+(.*?)\s+</h2>")
summary = re.compile("<div[^>]+?class=\"summary module\"[^>]*?>(.+?)</div>", re.DOTALL | re.MULTILINE)
notes = re.compile("<div.+?class=\"notes module\"[^>]*>(.+?)</div>", re.DOTALL | re.MULTILINE)
mainContent = re.compile("<h3 class=\"landmark heading\" id=\"work\">Chapter Text</h3>(.*?)<!--/main-->", re.DOTALL | re.MULTILINE)

# Chapter index entries look like <li><a href="/works/ID/chapters/ID">N. Title</a>;
# turn them into a dict keyed by chapter number.
volumePattern = re.compile("<li><a href=\"(/works/\d+/chapters/\d+)\">(\d+)\. ([^<]+)</a>")
volumes = sorted(volumePattern.findall(data))
print volumes
volumes = dict([(int(x[1]), (x[0], x[2])) for x in volumes])
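# Illustrative sketch, not part of the original script: one way the volumes
# mapping and the patterns above could be combined to pull down each chapter's
# text. Only the cache, the patterns and the volumes dict come from the code
# above; the loop itself is an assumption.
for number in sorted(volumes):
    (path, chapterTitle) = volumes[number]
    chapterData = cache.get(urljoin(navigate, path)).read().decode("utf-8")
    body = mainContent.search(chapterData)
    if body is not None:
        print "chapter %d: %s (%d characters)" % (number, chapterTitle, len(body.group(1)))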
# -*- coding: utf-8 -*-
from urlgrab import Cache
from google.protobuf import text_format
from blog_pb2 import All
from re import compile, DOTALL, MULTILINE
from os.path import exists, join
from codecs import open
from urlparse import urljoin
from optparse import OptionParser
from common import generatePage, tocStart, tocEnd, makeMobi

c = Cache()
c.user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"

# The series definitions live in series.txt as text-format protobuf.
db = All()
text_format.Merge(open("series.txt", "rb", "utf-8").read(), db)

stripTags = compile("<[^>]+>")
stripAnchorTags = compile("(?:<a[^>]+>)|(?:</a>)")

# Kindle doesn't like various characters, so let's rewrite some of them...
wrong = {
    u"“": u"\"",
    u"’": u"'",
    u"â€": u"\"",
    u"‘": u"'",
    u"—": u" - ",
    u"…": u"-",
    u"": u"",
    u'“': u"\"",
    u'”': u"\"",
    u'–': u"-",
from common import *
from re import compile, DOTALL, MULTILINE
from urlgrab import Cache
from urlparse import urljoin

# Patterns for The Register's BOFH archive: episode links on the index pages,
# the "Earlier Stories" pager, and the pieces of each article page.
linkPattern = compile("<h3><a href=\"(/[^\"]+)\">(.+?)</a></h3>")
earlierPattern = compile("<a href='([^\']+)'>.+?Earlier Stories.+?</a>", DOTALL | MULTILINE)
titlePattern = compile("<h2>(.+?)</h2>")
subtitlePattern = compile("<p class=\"standfirst\">(.+?)</p>")
contentPattern = compile("<strong class=\"trailer\">.+?</p>(.+?)(?:(?:<p>(?:(?:<i>)|(?:<small>)|(?:<font size=\"-2\">)|(?:<br>\n))?BOFH .+? Simon Travaglia)|(?:<ul class=\"noindent\">)|(?:<ul>.+?<li><a href=\"http://www.theregister.co.uk/content/30/index.html\">BOFH: The whole shebang</a></li>)|(?:</form>))", DOTALL | MULTILINE)
adPattern = compile("(<div id=ad-mu1-spot>.+?</div>)", MULTILINE | DOTALL)
episodePattern = compile("<strong class=\"trailer\">Episode \d+")

url = "http://www.theregister.co.uk/data_centre/bofh/"
pages = [url]
cache = Cache()

# Follow the "Earlier Stories" links backwards through the archive, building
# up the list of index pages oldest-first.
while True:
    print url
    data = cache.get(url).read()
    links = linkPattern.findall(data)
    if links == []:
        break
    pages.insert(0, url)
    earlier = earlierPattern.findall(data)
    url = urljoin(url, earlier[0])

# Titles to skip when walking the index pages.
skipTitles = ["Salmon Days is Go!"]
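# Illustrative sketch, not part of the original script: one way the collected
# index pages could be walked oldest-first to fetch individual episodes. Only
# the patterns, cache, pages and skipTitles come from the code above; the loop
# itself is an assumption.
for page in pages:
    listing = cache.get(page).read()
    for (href, linkTitle) in linkPattern.findall(listing):
        if linkTitle in skipTitles:
            continue
        episode = cache.get(urljoin(page, href)).read()
        if episodePattern.search(episode) is None:
            continue  # not a numbered BOFH episode
        title = titlePattern.findall(episode)
        content = contentPattern.findall(episode)
        if title and content:
            print title[0], len(content[0])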
from sys import argv
from urlgrab import Cache
from codecs import open
import re

cache = Cache()
url = argv[1]
data = cache.get(url).read()
open("dump", "wb", "utf-8").write(data)

# Pull the story title, author and numeric story id out of the chapter page.
title = re.search("<title>(.+?) Chapter \d+", data)
title = title.groups()
author = re.search("By:</span> <a[^>]+?href='/u/\d+/[^']+'>([^<]+)</a>", data)
author = author.groups()[0]
id = re.search("/s/(\d+)", url)
id = id.groups()[0]

# Print a text-format series definition pointing at the mobile site.
print """series {
  name: "%s"
  description: "%s"
  author: "%s"
  startPage: "http://m.fanfiction.net/s/%s/1"
  titlePattern: "<img src='/[^']+/balloon.png' class='mt icons'>[\d,]+</a></span>(.+?)<br>"
  contentPattern: "id='storycontent' >(.+?)</div></div>.*?<hr size=1"
  nextPattern: "<a href='(/s/\d+/\d+/)'>Next »</a>"
}""" % (title[0].replace(" ", ""), title[0], author, id)
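# Illustrative sketch, not part of the original script: the block printed above
# is Protocol Buffers text format, so it can be parsed back with the same
# text_format.Merge call the series builder uses. That the output belongs in
# series.txt, and that blog_pb2.All exposes a repeated "series" field, are
# assumptions inferred from the template above.
from google.protobuf import text_format
from blog_pb2 import All

check = All()
text_format.Merge(open("series.txt", "rb", "utf-8").read(), check)
for entry in check.series:
    print entry.name, entry.startPage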
#!/usr/bin/python
from urlgrab import Cache
from BeautifulSoup import MinimalSoup as BeautifulSoup
from re import compile
from os.path import exists, getsize, dirname, join
from urllib import urlretrieve, urlencode, quote
from sys import argv
import demjson
import zlib

folder = dirname(argv[0])
cache = Cache(debug=False)
pages = []

# Walk the first ten index pages of the Zero Punctuation archive and pull the
# short numeric identifier out of each episode link.
for index in range(1, 11):
    index = cache.get("http://www.escapistmagazine.com/videos/view/zero-punctuation?page=%d" % index, max_age=60*60*2).read()
    index = index.replace("''>", "'>")  # work around doubled quotes that confuse the parser
    index = BeautifulSoup(index)
    for link in index.findAll("a"):
        if not link.has_key("href"):
            continue
        if link["href"].find("http://www.escapistmagazine.com/videos/view/zero-punctuation/") != -1:
            short_href = link["href"]
            slash = short_href.rfind("/")
            if short_href[slash:].find("-") != -1:
                short_href = short_href[slash+1:slash+short_href[slash:].find("-")]
            else: