# Build a basic request header.
basicHeaders = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

# Build an XML (AJAX) request header.
# BUG FIX: the original chained assignment
#     xmlHeaders = basicHeaders['x-requested-with'] = "XMLHttpRequest"
# bound xmlHeaders to the bare string "XMLHttpRequest" (not a header dict)
# AND mutated basicHeaders, so the plain requester would also send the AJAX
# marker. Copy the base headers first, then add the marker to the copy only.
xmlHeaders = dict(basicHeaders)
xmlHeaders['x-requested-with'] = "XMLHttpRequest"

# One requester for normal page loads, one for XHR endpoints; both share
# the proxy configuration supplied elsewhere in this module.
requester = Requester.Requester(header=basicHeaders, useProxyPool=useProxyPool,
                                useProxyFun=useProxyFun, ipPool=ipPool)
xmlRequester = Requester.Requester(header=xmlHeaders, useProxyPool=useProxyPool,
                                   useProxyFun=useProxyFun, ipPool=ipPool)

pool = threadpool.ThreadPool(threadNum)

# Seed list - a range of Dribbble shot IDs to crawl.
r = range(4345000, 4345003)
# Build one URL per shot ID (renamed from `id`, which shadowed the builtin).
taskList = ['https://dribbble.com/shots/%d' % shotId for shotId in r]
print(e) print("数据库链接失败!检查~") exit() basicHeaders = { 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', } requester = Requester.Requester(basicHeaders, useProxy, ipPool) # 爬取二级类目列表 def crawlSecondaryUrlList(rootUrl, rootCategory): print("开始解析二级类目列表...") rootHtml = requester.sendNewRequest( rootUrl, limit=3).response.content.decode("utf-8") soup = BeautifulSoup(rootHtml, "lxml") soup.prettify() leftNavigation = soup.find('div', attrs={'role': 'region'}) if leftNavigation: secondaryUrlInfoReg = re.compile('<li><a href="(.*?)">(.*?)</a></li>') secondaryUrlInfoRegSearch = re.findall(secondaryUrlInfoReg,
def __init__(self, mediaDirectory):
    """Initialize the crawler.

    Remembers the directory media files are saved into, and wires up
    the HTTP requester plus the selfie database handle.
    """
    self.mediaDirectory = mediaDirectory
    # The requester is pointed at the same media directory we just stored.
    self.requester = Requester(mediaDirectory)
    # Database wrapper for persisting crawled records.
    self.db = SelfieDB()