Example #1
    def __init__(self):
        self.start_url = "https://www.zhipin.com/"
        self.cities_code = {"深圳": "c101280600-p100109/h_101280600/", "上海": "c101020100-p100109/h_101020100/", "北京": "c101010100/h_101010100/",
                            "南京": "c101190100/h_101190100/", "杭州": "c101210100/h_101210100/"}  # 北京,南京,杭州
        self.headers = {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'Accept-Encoding': "gzip, deflate, br",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Cache-Control': "no-cache",
            'Connection': "keep-alive",
            'Cookie': "sid=sem_pz_bdpc_dasou_title; JSESSIONID=""; __g=sem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1539076339; __c=1539076344; __l=r=https%3A%2F%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&l=%2Fwww.zhipin.com%2Fjob_detail%2F%3Fka%3Dheader-job&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_ti; lastCity=101190100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101190100%2Fh_101190100%2F%3Fquery%3Dpython%26page%3D2; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1539130799; __a=1223038.1539076337.1539076337.1539076344.24.2.23.24",
            'Host': "www.zhipin.com",
            'Pragma': "no-cache",
            'Upgrade-Insecure-Requests': "1",
            "User-Agent": random.choice(UserAgents.agents),
            # Forged request headers; one could also try cross-site request forgery (CSRF). Referer tells the server where the request came from.
            'Referer': 'https://www.zhipin.com/c101280600-p100109/h_101280600/?query=python&page=1&ka=page-1'
        }
        
        pool = redis.ConnectionPool(host='localhost', port=6379)
        self.conn = redis.Redis(connection_pool=pool)
        self.proxy_pool = ProxyPool.Proxy_Pool()
        self.proxies = []  # proxy addresses popped from the pool
        self.ip = []       # bare IPs corresponding to self.proxies
        if not self.proxy_pool.Is_Empty():
            self.ip, self.proxies = self.proxy_pool.pop_all()
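This example (and Example #2 below) assumes a project-local ProxyPool module whose Proxy_Pool class exposes Is_Empty() and pop_all(). A minimal sketch of what that interface might look like, backed by the same local Redis instance; the class body, the Redis key name, and the stored "ip:port" format are assumptions reconstructed from the calls above:

import redis

class Proxy_Pool:
    """Hypothetical minimal proxy pool matching the calls used above."""

    def __init__(self, host='localhost', port=6379, key='proxies'):
        self.key = key  # assumed: a Redis set of "ip:port" strings
        self.conn = redis.Redis(connection_pool=redis.ConnectionPool(host=host, port=port))

    def Is_Empty(self):
        # The pool is empty when the backing set has no members.
        return self.conn.scard(self.key) == 0

    def pop_all(self):
        # Drain the set and return (ips, proxies) in the order the caller unpacks.
        proxies = [p.decode() for p in self.conn.smembers(self.key)]
        self.conn.delete(self.key)
        ips = [p.split(':')[0] for p in proxies]
        return ips, proxies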
Example #2
    def __init__(self, jid, lid):
        self.jid = jid  # JSON request parameter
        self.lid = lid  # JSON request parameter
        self.url = ("https://www.zhipin.com/view/job/card.json?jid="
                    + str(self.jid) + "&lid=" + str(self.lid))
        # e.g. https://www.zhipin.com/view/job/card.json?jid=2339af182b9be5111XB70t2_GVQ~&lid=17qKeuLoGkf.search
        self.headers = {
            "Host": "www.zhipin.com",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": random.choice(UserAgents.agents),
            # "Referer": "https://www.zhipin.com/c101190100/h_101190100/?query=python&page=2&ka=page-2"
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "sid=sem_pz_bdpc_dasou_title; JSESSIONID=; __c=1539227402; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3DUTF-8%26wd%3Dboss%25E7%259B%25B4%25E8%2581%2598&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1539076339,1539152693,1539227402; lastCity=101190100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101190100%2Fh_101190100%2F%3Fquery%3Dpython%26page%3D2%26ka%3Dpage-2; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1539249307; __a=1223038.1539076337.1539076344.1539227402.57.3.21.21"
        }
        proxy_pool = ProxyPool.Proxy_Pool()
        self.proxies = []  # list of proxy addresses
        if not proxy_pool.Is_Empty():
            ip, self.proxies = proxy_pool.pop_all()  # the bare IPs are unused here
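Only __init__ appears above; a hedged sketch of how such a fetcher might issue the card.json request. The get_card helper is an assumption, and the format of the popped proxy addresses is unknown, so the proxies mapping below is illustrative only:

import requests

def get_card(card, timeout=10):
    """Fetch and decode the card.json payload for an instance of the class above
    (hypothetical helper; not part of the original snippet)."""
    # Use the first popped proxy if any are available, otherwise go direct.
    proxy = {"https": card.proxies[0]} if card.proxies else None
    r = requests.get(card.url, headers=card.headers, proxies=proxy, timeout=timeout)
    r.raise_for_status()
    return r.json()  # the endpoint returns JSON; its exact structure is assumed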
Example #3
    def getHTMLText(self, code="utf-8"):
        if not self.parseURL():
            return
        if self.cache:
            self.html = self.cache[self.url]
            if not self.html:
                p_p = ProxyPool.Proxy_Pool()
                proxy = db.proxy  # proxy collection (db looks like a pymongo database)
                tag = True
                while tag:
                    proxies = proxy.find_one()
                    if proxies is None:
                        ProxyGetter.get_ip()  # pool exhausted: fetch fresh proxies, then retry
                        continue
                    one_p = str(proxies['类型'])   # '类型' = scheme, e.g. "http"/"https"
                    two_p = str(proxies['IP'])
                    three_p = str(proxies['PORT'])
                    if p_p.test_connection(one_p, two_p, three_p):
                        tag = False
                    else:
                        p_p.del_record(proxies['IP'])  # drop the dead proxy and try another
                proxy_ip = {one_p: two_p + ":" + three_p}
                try:
                    ua = {
                        'user-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
                    }
                    r = requests.get(self.url, headers=ua, proxies=proxy_ip)
                    r.raise_for_status()
                    r.encoding = code
                    self.html = r.text
                    self.cache[self.url] = self.html
                except requests.RequestException:
                    pass  # leave self.html unset on failure
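This snippet expects self.cache to behave like a dict whose lookup yields None for an uncached URL (a plain dict would raise KeyError at self.cache[self.url]). A minimal sketch of a cache with that behavior, inferred from the access pattern above; the class itself is hypothetical:

class SimpleCache:
    """Hypothetical dict-like cache: missing URLs yield None instead of KeyError."""

    def __init__(self):
        self._store = {}

    def __getitem__(self, url):
        return self._store.get(url)  # None signals "not cached yet"

    def __setitem__(self, url, html):
        self._store[url] = html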
Example #4
sp = Spider()
sp.updateDatabase()  # refresh the database
#start_url = input("Enter the target site's home page URL, e.g. http://www.xiaohuar.com/ : ")
#if parseURL(start_url) == False:
#    exit(1)
start_url = "http://www.xiaohuar.com/"

#start_url = u"http://www.baidu.com/"

sp.main(start_url, start_url)
spider_schedule = SpiderSchedule.SpiderSchedule()

downloads = downloaderware()
threads = []

p_p = ProxyPool.Proxy_Pool()
proxy_counter = 1    # proxy-pool refresh counter
threads_counter = 0  # counts downloader threads spawned
t0 = threading.Thread(target=spider_schedule.SpiderSchedule, args=(start_url,))
t0.start()
threads.append(t0)
while threads and threads_counter <= 50:
    # The crawl is still active: reap finished threads first.
    for thread in threads[:]:  # iterate over a copy so removal is safe
        if not thread.is_alive():
            threads.remove(thread)

    # Spawn one downloader per pass and count it, so the <= 50 cap can terminate the loop.
    t2 = threading.Thread(target=downloads.downloaderware, args=(start_url,))
    t2.start()
    threads.append(t2)
    threads_counter += 1
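The script ends without waiting for its workers, so the main thread may exit while downloads are still in flight. A minimal shutdown sketch; this join loop is an assumption, not part of the original:

# Wait for every remaining worker before the process exits.
for thread in threads:
    thread.join()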