] print "initUrls", initUrls def getCurFileName(): filename = os.path.basename(__file__) return filename[0:filename.find(".")] crawler = Crawler(initUrls, initDir, headers, getCurFileName()) print "crawler初始化成功" def fn1(url): r = requests.get(url, headers=headers, timeout=100).text maxCount = BeautifulSoup(r, 'lxml').find('div', class_="page").find_all('a')[-2].text # print maxCount page_urls = [url + "/" + str(i) for i in range(1, int(maxCount) + 1)] return page_urls def fn2(url): r = requests.get(url, headers=headers, timeout=100).text return BeautifulSoup(r, 'lxml').find('div', class_="content").find('a').img['src'] crawler.then(fn1).then(fn2)
}

# The site has at most 208 pages.
initUrls = ["http://jandan.net/ooxx/page-{num}#comments".format(num=num) for num in range(207, 208)]
print "initUrls", initUrls

def getCurFileName():
    filename = os.path.basename(__file__)
    return filename[0:filename.find(".")]

crawler = Crawler(initUrls, initDir, headers, getCurFileName())
print "crawler initialized successfully"

def fn1(url):
    # Collect every image link on one listing page.
    arr = []
    r = requests.get(url, headers=headers, timeout=100).text
    for father in BeautifulSoup(r, 'lxml').find_all('div', class_="row"):
        link = father.find("div", class_="text").img['src']
        arr.append('http:' + link)
    return arr

# def fn2(url):
#     r = requests.get(url, headers=headers, timeout=100).text
#     return BeautifulSoup(r, 'lxml').find('div', class_="content").find('a').img['src']

crawler.then(fn1)  # .then(fn2)
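The Crawler class used by both scripts is defined elsewhere in the project and is not shown here. As a rough sketch of the chaining contract they rely on (each then callback maps one URL either to a list of further URLs or to a single final URL), a minimal implementation might look like the following. The class name Crawler and the argument order (initUrls, initDir, headers, name) come from the calls above, but the eager evaluation in then and the download step are assumptions for illustration, not the project's actual code.

# Hypothetical sketch only; the real Crawler lives elsewhere in the project.
import os
import requests


class Crawler(object):
    def __init__(self, initUrls, initDir, headers, name):
        self.urls = initUrls                      # current frontier of URLs
        self.dir = os.path.join(initDir, name)    # assumed: one folder per script
        self.headers = headers

    def then(self, fn):
        # Apply fn to every URL in the frontier; fn may return a single URL
        # (string) or a list of URLs, which becomes the new frontier.
        nextUrls = []
        for url in self.urls:
            result = fn(url)
            if isinstance(result, list):
                nextUrls.extend(result)
            else:
                nextUrls.append(result)
        self.urls = nextUrls
        return self    # return self so calls can be chained: then(fn1).then(fn2)

    def download(self):
        # Assumed final step: fetch every URL left in the frontier and save it.
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        for url in self.urls:
            data = requests.get(url, headers=self.headers, timeout=100).content
            with open(os.path.join(self.dir, os.path.basename(url)), 'wb') as f:
                f.write(data)

Under this reading, the first script chains two steps (gallery page to per-page URLs, then page to image URL), while the second script needs only one step because the listing page already exposes the image links directly.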