def get_code(num):
    """Download snippet page ``num`` and return the text of its first TEXTAREA.

    Relies on the module-level names ``upen`` (URL opener) and ``re``.

    Args:
        num: Integer substituted into the ``code%d.html`` page name.

    Returns:
        Everything between the first ``<TEXTAREA ...>`` tag and the next
        ``</TEXTAREA>`` in the downloaded page.

    Raises:
        IndexError: If the page contains no non-empty TEXTAREA element.
    """
    # Parenthesised single-argument print emits the same text on Python 2.
    print('get page...')
    url = 'http://www.c.happycodings.com/code_snippets/code%d.html' % num
    page = upen(url)
    try:
        text = page.read()
    finally:
        # Release the connection even when the read fails.
        page.close()
    print('got')
    # re.S lets ``.`` span newlines, so multi-line snippets are captured.
    code = re.findall('<TEXTAREA[^>]*>(.+?)</TEXTAREA>', text, re.S)
    if not code:
        # Same exception type as the bare ``code[0]`` lookup would raise,
        # but with a message that names the page at fault.
        raise IndexError('no <TEXTAREA> block found in %s' % url)
    return code[0]
def get_code(num):
    """Fetch snippet page ``num`` and return the body of its first TEXTAREA.

    Uses the module-level names ``upen`` (URL opener) and ``re``.

    Args:
        num: Integer substituted into the ``code%d.html`` page name.

    Returns:
        The text found between the first ``<TEXTAREA ...>`` tag and the
        following ``</TEXTAREA>`` in the downloaded page.

    Raises:
        IndexError: If the page contains no non-empty TEXTAREA element.
    """
    # Parenthesised single-argument print emits the same text on Python 2.
    print("get page...")
    url = "http://www.c.happycodings.com/code_snippets/code%d.html" % num
    page = upen(url)
    try:
        text = page.read()
    finally:
        # Release the connection even when the read fails.
        page.close()
    print("got")
    # re.S lets ``.`` span newlines, so multi-line snippets are captured.
    code = re.findall("<TEXTAREA[^>]*>(.+?)</TEXTAREA>", text, re.S)
    if not code:
        # Same exception type as the bare ``code[0]`` lookup would raise,
        # but with a message that names the page at fault.
        raise IndexError("no <TEXTAREA> block found in %s" % url)
    return code[0]
# Download every chapter page linked from the index at `base`, strip the
# markup, and save each chapter as plain text under ../prog/gatsby/.
from urllib import urlopen as upen
import re
from htmlentitydefs import entitydefs, codepoint2name

base = 'http://etext.library.adelaide.edu.au/f/fitzgerald/f_scott/gatsby/'


def _fetch(url):
    """Return the full response body for `url`, always closing the handle."""
    page = upen(url)
    try:
        return page.read()
    finally:
        page.close()


def conv(x):
    """Replace one numeric character reference matched by ``&#\\d+;``.

    Args:
        x: Regex match object whose matched text looks like ``&#8220;``.

    Returns:
        The ``entitydefs`` replacement text for the code point, or the
        original reference unchanged when the code point has no named entity.
    """
    ref = x.group()
    # Drop the leading '&#' and the trailing ';' to get the code point.
    name = codepoint2name.get(int(ref[2:-1]))
    if name is None:
        # No named entity for this code point: keep the reference as-is
        # rather than aborting the whole download with a KeyError.
        return ref
    # NOTE(review): per the htmlentitydefs docs, entitydefs holds Latin-1
    # replacement text, so names outside Latin-1 appear to map back to a
    # numeric reference -- confirm that leaving those untouched is intended.
    return entitydefs[name]


html = _fetch(base)
# Chapter numbers come from the index links, e.g. <a href="chapter1.html">.
chapters = re.findall(r'\<a href="chapter(\d+)\.html"\>Chapter', html)
print(chapters)

reg = '<div[^>]*>(.+?)</div>'
for i in chapters:
    text = _fetch(base + 'chapter' + i + '.html')
    # The chapter body is taken from the second <div> on the page.
    main = re.findall(reg, text, re.S)[1]
    # Strip tags, then protect blank-line paragraph breaks with a marker so
    # single newlines can be joined into spaces before restoring the breaks.
    main = re.sub('<.+?>', '', main).replace('\n\n', '<br><br>')
    main = main.replace('\n', ' ').replace('<br><br>', '\n')
    main = re.sub(r'&#\d+;', conv, main)
    out = open('../prog/gatsby/chapter%s.txt' % i, 'w')
    try:
        out.write(main)
    finally:
        # Close explicitly so the chapter is flushed to disk deterministically.
        out.close()
#!/usr/bin/env python
# CGI script: when a `url` form field is supplied, fetch that URL and relay
# its body with an image/gif content type; otherwise answer with a short
# HTML greeting.
import cgi
import sys
from urllib import urlopen as upen

form = cgi.FieldStorage()
# getfirst() returns None when the field is absent and copes with the field
# being supplied more than once (form['url'] would then be a list).
url = form.getfirst('url')

if url is None:
    print('Content-type:text/html\n')
    print('hi')
elif not (url.lower().startswith('http://') or
          url.lower().startswith('https://')):
    # SECURITY: urlopen() also opens local paths and file: URLs, so an
    # unrestricted `url` would let any client read files from this host.
    print('Status: 400 Bad Request')
    print('Content-type:text/plain\n')
    print('only http and https URLs are accepted')
else:
    # Fetch before emitting any header so a failed download does not leave
    # a half-written response behind.
    remote = upen(url)
    try:
        body = remote.read()
    finally:
        remote.close()
    # The trailing '\n' plus print's own newline ends the header section.
    print('Content-type:image/gif\n')
    sys.stdout.write(body)
#!/usr/bin/env python
# CGI script: when a `url` form field is supplied, fetch that URL and relay
# its body with an image/gif content type; otherwise answer with a short
# HTML greeting.
import cgi
import sys
from urllib import urlopen as upen

form = cgi.FieldStorage()
# getfirst() returns None when the field is absent and copes with the field
# being supplied more than once (form['url'] would then be a list).
url = form.getfirst('url')

if url is None:
    print('Content-type:text/html\n')
    print('hi')
elif not (url.lower().startswith('http://') or
          url.lower().startswith('https://')):
    # SECURITY: urlopen() also opens local paths and file: URLs, so an
    # unrestricted `url` would let any client read files from this host.
    print('Status: 400 Bad Request')
    print('Content-type:text/plain\n')
    print('only http and https URLs are accepted')
else:
    # Fetch before emitting any header so a failed download does not leave
    # a half-written response behind.
    remote = upen(url)
    try:
        body = remote.read()
    finally:
        remote.close()
    # The trailing '\n' plus print's own newline ends the header section.
    print('Content-type:image/gif\n')
    sys.stdout.write(body)
# Download every chapter page linked from the index at `base`, strip the
# markup, and save each chapter as plain text under ../prog/gatsby/.
from urllib import urlopen as upen
import re
from htmlentitydefs import entitydefs, codepoint2name

base = 'http://etext.library.adelaide.edu.au/f/fitzgerald/f_scott/gatsby/'


def _fetch(url):
    """Return the full response body for `url`, always closing the handle."""
    page = upen(url)
    try:
        return page.read()
    finally:
        page.close()


def conv(x):
    """Replace one numeric character reference matched by ``&#\\d+;``.

    Args:
        x: Regex match object whose matched text looks like ``&#8220;``.

    Returns:
        The ``entitydefs`` replacement text for the code point, or the
        original reference unchanged when the code point has no named entity.
    """
    ref = x.group()
    # Drop the leading '&#' and the trailing ';' to get the code point.
    name = codepoint2name.get(int(ref[2:-1]))
    if name is None:
        # No named entity for this code point: keep the reference as-is
        # rather than aborting the whole download with a KeyError.
        return ref
    # NOTE(review): per the htmlentitydefs docs, entitydefs holds Latin-1
    # replacement text, so names outside Latin-1 appear to map back to a
    # numeric reference -- confirm that leaving those untouched is intended.
    return entitydefs[name]


html = _fetch(base)
# Chapter numbers come from the index links, e.g. <a href="chapter1.html">.
chapters = re.findall(r'\<a href="chapter(\d+)\.html"\>Chapter', html)
print(chapters)

reg = '<div[^>]*>(.+?)</div>'
for i in chapters:
    text = _fetch(base + 'chapter' + i + '.html')
    # The chapter body is taken from the second <div> on the page.
    main = re.findall(reg, text, re.S)[1]
    # Strip tags, then protect blank-line paragraph breaks with a marker so
    # single newlines can be joined into spaces before restoring the breaks.
    main = re.sub('<.+?>', '', main).replace('\n\n', '<br><br>')
    main = main.replace('\n', ' ').replace('<br><br>', '\n')
    main = re.sub(r'&#\d+;', conv, main)
    out = open('../prog/gatsby/chapter%s.txt' % i, 'w')
    try:
        out.write(main)
    finally:
        # Close explicitly so the chapter is flushed to disk deterministically.
        out.close()