def crawl(self, url, fun, deep=5):
    # Breadth-first crawl: pages[i] holds the pages discovered at depth i.
    page = TylCrawlerPage(url=url)
    fetcher = TylCrawlerFetcher()
    pages = []
    pages.append([page])
    try:
        for i in range(deep):
            pn = []
            for p in pages[i]:
                fetcher.fetch(p)
                p.fetcher = fetcher
                links = p.getLinks(self.host)
                fun(p)  # hand the fetched page to the caller's callback
                if hasattr(self, "sleepSec"):
                    time.sleep(self.sleepSec)  # requires "import time" at module level
                if (i + 1) == deep:
                    continue  # deepest level reached: fetch it, but do not expand its links
                for link in links:
                    link = self.fixUrl(link)
                    if self.urlCrawled(link):
                        continue
                    self.crawledList.append(link)
                    pchild = TylCrawlerPage(url=link)
                    pchild.setReferer(p.url)
                    pchild.cookieJar = p.cookieJar  # share cookies with the parent page
                    pchild.level = i + 1
                    pn.append(pchild)
            pages.append(pn)
    except ValueError as e:
        print e
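
For reference, here is a minimal sketch of how crawl() might be driven. The crawler class name, its module, and the host= constructor argument below are assumptions made for illustration; only the crawl(url, fun, deep) signature and the per-page callback come from the method above.

# Hypothetical driver: "crawler" module, TylCrawler class and the host= argument
# are assumed names; crawl(url, fun, deep) matches the method shown above.
from crawler import TylCrawler

def on_page(page):
    # Callback passed as "fun": invoked once for every fetched TylCrawlerPage.
    print page.url, page.code

c = TylCrawler(host="www.okbuy.com")  # assumed constructor
c.sleepSec = 1                        # optional throttle honored by crawl()
c.crawl("http://www.okbuy.com/", on_page, deep=2)
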
def fetch(self, page):
    # Download page.url with urllib2 and store status code, body and response
    # headers on the page object; requires "import urllib2" and "import cookielib".
    page.fetched = True
    if not hasattr(page, "url"):
        return None
    try:
        req = urllib2.Request(page.url)
        for x in page.headers:
            req.add_header(x, page.headers[x])
        if page.cookieJar is None:
            page.cookieJar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(page.cookieJar))
        r = opener.open(req, timeout=self.timeout)
        response = r.read()
        page.code = r.getcode()
        page.content = response
        for header in r.info().headers:
            # split on the first colon only so values containing ":" (e.g. dates) survive
            pair = header.split(":", 1)
            headerKey = pair[0].strip()
            headerValue = pair[1].strip()
            page.responseHeaders[headerKey] = headerValue
    except Exception as e:
        print e

if __name__ == "__main__":
    from page import TylCrawlerPage

    fetcher = TylFetcher()
    page = TylCrawlerPage(url="http://www.okbuy.com/")
    fetcher.fetch(page)
    print page.code
    print page.getLinks()