# Assumed module-level imports: random, redis, ProxyPool, UserAgents.
def __init__(self):
    self.start_url = "https://www.zhipin.com/"
    # city name -> URL path segment on zhipin.com
    self.cities_code = {
        "深圳": "c101280600-p100109/h_101280600/",
        "上海": "c101020100-p100109/h_101020100/",
        "北京": "c101010100/h_101010100/",
        "南京": "c101190100/h_101190100/",
        "杭州": "c101210100/h_101210100/"
    }  # Beijing, Nanjing, Hangzhou
    self.headers = {
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Cache-Control': "no-cache",
        'Connection': "keep-alive",
        'Cookie': "sid=sem_pz_bdpc_dasou_title; JSESSIONID=; __g=sem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1539076339; __c=1539076344; __l=r=https%3A%2F%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&l=%2Fwww.zhipin.com%2Fjob_detail%2F%3Fka%3Dheader-job&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_ti; lastCity=101190100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101190100%2Fh_101190100%2F%3Fquery%3Dpython%26page%3D2; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1539130799; __a=1223038.1539076337.1539076337.1539076344.24.2.23.24",
        'Host': "www.zhipin.com",
        'Pragma': "no-cache",
        'Upgrade-Insecure-Requests': "1",
        "User-Agent": random.choice(UserAgents.agents),
        # Spoofed request headers; cross-site request forgery (CSRF) could also be tried.
        # Referer tells the server which page the request came from.
        'Referer': 'https://www.zhipin.com/c101280600-p100109/h_101280600/?query=python&page=1&ka=page-1'
    }
    pool = redis.ConnectionPool(host='localhost', port=6379)
    self.conn = redis.Redis(connection_pool=pool)
    self.proxy_pool = ProxyPool.Proxy_Pool()
    self.proxies = []
    self.ip = []
    if not self.proxy_pool.Is_Empty():
        self.ip, self.proxies = self.proxy_pool.pop_all()
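# Not part of the original class: a minimal sketch of how start_url and
# cities_code could be combined into a list-page URL of the form seen in the
# Referer header above. The helper name build_list_url and the query/page
# parameters are assumptions for illustration only.
def build_list_url(self, city, query, page=1):
    # e.g. build_list_url("深圳", "python", 1) ->
    # https://www.zhipin.com/c101280600-p100109/h_101280600/?query=python&page=1
    return self.start_url + self.cities_code[city] + "?query=" + query + "&page=" + str(page)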
# Assumed module-level imports: random, ProxyPool, UserAgents.
def __init__(self, jid, lid):
    self.jid = jid  # jid parameter of the JSON request
    self.lid = lid  # lid parameter of the JSON request
    # e.g. https://www.zhipin.com/view/job/card.json?jid=2339af182b9be5111XB70t2_GVQ~&lid=17qKeuLoGkf.search
    self.url = ("https://www.zhipin.com/view/job/card.json?jid=" + str(self.jid) +
                "&lid=" + str(self.lid))
    self.headers = {
        "Host": "www.zhipin.com",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": random.choice(UserAgents.agents),
        # "Referer": "https://www.zhipin.com/c101190100/h_101190100/?query=python&page=2&ka=page-2",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "sid=sem_pz_bdpc_dasou_title; JSESSIONID=; __c=1539227402; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3DUTF-8%26wd%3Dboss%25E7%259B%25B4%25E8%2581%2598&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1539076339,1539152693,1539227402; lastCity=101190100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101190100%2Fh_101190100%2F%3Fquery%3Dpython%26page%3D2%26ka%3Dpage-2; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1539249307; __a=1223038.1539076337.1539076344.1539227402.57.3.21.21"
    }
    proxy_pool = ProxyPool.Proxy_Pool()
    self.proxies = []  # list of proxy addresses
    if not proxy_pool.Is_Empty():
        ip, self.proxies = proxy_pool.pop_all()
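# Not part of the original snippet: a minimal sketch of how the card.json URL
# built above might be requested and decoded. Assumes requests is imported;
# the method name fetch_card, the timeout, and the way a proxy is picked from
# self.proxies are assumptions for illustration only.
def fetch_card(self):
    # assumes each entry in self.proxies is a requests-style dict like {"http": "ip:port"}
    proxy = random.choice(self.proxies) if self.proxies else None
    try:
        r = requests.get(self.url, headers=self.headers, proxies=proxy, timeout=10)
        r.raise_for_status()
        return r.json()  # the card.json endpoint returns a JSON payload
    except Exception:
        return None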
# Assumed module-level imports: requests, ProxyPool, ProxyGetter and a MongoDB handle `db`.
def getHTMLText(self, code="utf-8"):
    if not self.parseURL():
        return
    if self.cache:
        self.html = self.cache[self.url]
    if not self.html:
        p_p = ProxyPool.Proxy_Pool()
        proxy = db.proxy
        tag = True
        while tag:
            proxies = proxy.find_one()
            if proxies is None:
                # proxy collection is empty: refill it, then look again
                ProxyGetter.get_ip()
                continue
            one_p = str(proxies['类型'])   # scheme, e.g. http/https
            two_p = str(proxies['IP'])
            three_p = str(proxies['PORT'])
            flag = p_p.test_connection(one_p, two_p, three_p)
            if flag == False:
                # drop proxies that fail the connection test
                p_p.del_record(proxies['IP'])
            else:
                tag = False
        proxy_ip = {
            str(proxies['类型']): str(proxies['IP']) + ":" + str(proxies['PORT'])
        }
        try:
            ua = {
                'user-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
            }
            r = requests.get(self.url, headers=ua, proxies=proxy_ip)
            r.raise_for_status()
            r.encoding = code
            self.html = r.text
            self.cache[self.url] = self.html
            # p_p.clean_nonworking()
        except:
            # p_p.clean_nonworking()
            pass
# Assumes threading, SpiderSchedule, ProxyPool and the Spider / downloaderware classes are imported.
sp = Spider()
sp.updateDatabase()  # refresh the database
# start_url = input("Enter the home page of the target site, e.g. http://www.xiaohuar.com/ :")
# if parseURL(start_url) == False:
#     exit(1)
start_url = "http://www.xiaohuar.com/"
# start_url = u"http://www.baidu.com/"
sp.main(start_url, start_url)

spider_schedule = SpiderSchedule.SpiderSchedule()
downloads = downloaderware()
threads = []
p_p = ProxyPool.Proxy_Pool()
proxy_counter = 1    # counter for proxy-pool refreshes
threads_counter = 0  # counter for started downloader threads

t0 = threading.Thread(target=spider_schedule.SpiderSchedule, args=(start_url,))
t0.start()
threads.append(t0)

while threads and threads_counter <= 50:  # the crawl is still active
    # iterate over a copy so that finished threads can be removed safely
    for thread in threads[:]:
        if not thread.is_alive():
            # remove the stopped threads
            threads.remove(thread)
    t2 = threading.Thread(target=downloads.downloaderware, args=(start_url,))
    t2.start()