def export(self):
    """
    Export the latest pages of every pattern as an RSS 2.0 feed,
    one let<pid>.xml file per pattern under config.RSSDIR.
    """
    logger.debug("Begin RSS Export:")
    db = CrawlDB()
    rep = Pattern()
    for pat in db.getPatterns():
        pid = pat["pid"]
        pattern = pat["pattern"]
        description = pat["name"]
        # at most the ten newest pages of this pattern go into the feed
        items = []
        for page in db.getPages("where pid=%d limit 10" % pid):
            items.append(self.rssitem % (page["url"], page["title"], "",
                                         pattern, "", page["url"],
                                         rep.sub(page["content"])))
        itemout = "\n".join(items)
        output = self.rssframe % (pattern, "http://hjbbs.com/bbs",
                                  description, "Learning English Tool",
                                  itemout)
        logger.debug("LET %d:\n%s\n" % (pid, output))
        # write this pattern's feed out to <RSSDIR>/let<pid>.xml
        fp = open("%slet%d.xml" % (config.RSSDIR, pid), "w")
        fp.write(output.encode('utf8'))
        fp.close()
    logger.debug("End RSS Export.")
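# The sketch below is an assumption, not part of the original source: it shows
# stand-in templates with the placeholder arity export() relies on. rssitem is
# filled with 7 values (url, title, "", pattern, "", url, content) and rssframe
# with 5 (title, link, description, generator, items); the real strings are
# attributes of the exporting class defined elsewhere and may lay the fields
# out differently.
_RSSITEM_SKETCH = (
    '<item>\n'
    '  <guid>%s</guid>\n'
    '  <title>%s%s</title>\n'
    '  <category>%s</category>%s\n'
    '  <link>%s</link>\n'
    '  <description><![CDATA[%s]]></description>\n'
    '</item>')
_RSSFRAME_SKETCH = (
    '<?xml version="1.0" encoding="utf-8"?>\n'
    '<rss version="2.0">\n'
    '<channel>\n'
    '  <title>%s</title>\n'
    '  <link>%s</link>\n'
    '  <description>%s</description>\n'
    '  <generator>%s</generator>\n'
    '%s\n'
    '</channel>\n'
    '</rss>')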
class CrawlPages:
    """
    Crawl hjbbs.com: log in with a verify code, then parse the index
    page and collect links matching the configured patterns.
    """

    def __init__(self):
        """
        Open the crawl database and compile the title regexp.
        """
        self.db = CrawlDB()
        self.pat = Pattern()
        # self.patterns = config.PATTERNS
        self.titlere = re.compile(config.TITLEPATTERN)
        self.patterns = self.db.getPatterns()  # unicode patterns

    def parseTitles(self):
        """
        Fetch the index page first, then search the page content and
        figure out all links to be retrieved.
        @return list of dict(url, pid) to be retrieved
        """
        logger.info("root url: " + config.ROOTURL)
        sock = urllib2.urlopen(config.ROOTURL)
        lines = sock.readlines()
        sock.close()
        if config.DEBUG:
            pdb.set_trace()
        logger.info("Index Content: %s" % ("\n".join(lines)).decode("gbk"))
        prelines = []
        for line in lines:
            if len(line) > 10:  # trick: skip short lines to avoid useless matches
                for pat in self.patterns:
                    if line.find(pat["pattern"].encode("gbk")) != -1:
                        prelines.append({"line": line, "pid": pat["pid"]})
        logger.info("matched lines: %d" % len(prelines))
        prelinks = []
        for line in prelines:
            mline = self.titlere.search(line["line"])
            if mline:
                # build the absolute URL and skip ones already in the database
                newurl = "http://www.hjbbs.com/" + mline.group(1)
                if config.DEBUG:
                    pdb.set_trace()
                if not self.db.chkUrl(newurl):
                    prelinks.append({"url": newurl, "pid": line["pid"]})
        logger.info("links to be crawled: %d" % len(prelinks))
        return prelinks

    def loginHjbbs(self):
        """
        Log in to hjbbs and keep the session cookie.
        Call this function before crawling any other pages.
        @return True if the login request succeeded, False otherwise
        """
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        # fetch the verify-code image and OCR it into a string
        tmpfile = "code.bmp"
        vcodebmp = urllib2.urlopen('http://hjbbs.com/GetCode.asp').read()
        vcodefile = open(tmpfile, 'wb')
        vcodefile.write(vcodebmp)
        vcodefile.close()
        vcodenum = getCode(tmpfile)
        postdata = urllib.urlencode({
            'username': config.USERNAME,
            'password': config.PASSWORD,
            'comeurl': 'http://hjbbs.com/index.asp',
            'userhidden': '3',
            'submit': '登录',  # "log in" -- the form button label the site expects
            'CookieDate': 3,
            'SecurityKey': vcodenum})
        postheaders = {
            "User-Agent": "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "http://hjbbs.com/login.asp",
            "Connection": "keep-alive",
            "Keep-Alive": 115}
        req = urllib2.Request(
            url="http://hjbbs.com/login.asp?action=chk",
            data=postdata,
            headers=postheaders)
        try:
            res = urllib2.urlopen(req)
        except HTTPError, e:
            # HTTPError has no reliable .reason on old Pythons; log the whole error
            logger.error("loginHjbbs http failed: %s" % str(e))
            return False
        except URLError, e:
            logger.error("loginHjbbs url failed: %s" % e.reason)
            return False
        # transport-level success; a stricter check could inspect res for a
        # login-success marker in the response body
        return True
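# A minimal driver sketch (an assumption, not part of the original file):
# install the login cookie first, as loginHjbbs()'s docstring requires, then
# list the links parseTitles() finds; crawling each link happens elsewhere.
if __name__ == "__main__":
    crawler = CrawlPages()
    if crawler.loginHjbbs():
        for link in crawler.parseTitles():
            logger.info("to crawl: %s (pid %d)" % (link["url"], link["pid"]))
    else:
        logger.error("login failed, nothing crawled")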