def __init__(self, jid, lid):
    self.jid = jid  # JSON request parameter
    self.lid = lid  # JSON request parameter
    # e.g. https://www.zhipin.com/view/job/card.json?jid=2339af182b9be5111XB70t2_GVQ~&lid=17qKeuLoGkf.search
    self.url = "https://www.zhipin.com/view/job/card.json?jid=" + str(
        self.jid) + "&lid=" + str(self.lid)
    self.headers = {
        "Host": "www.zhipin.com",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": random.choice(UserAgents.agents),
        # "Referer": "https://www.zhipin.com/c101190100/h_101190100/?query=python&page=2&ka=page-2",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "sid=sem_pz_bdpc_dasou_title; JSESSIONID=" "; __c=1539227402; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3DUTF-8%26wd%3Dboss%25E7%259B%25B4%25E8%2581%2598&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1539076339,1539152693,1539227402; lastCity=101190100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101190100%2Fh_101190100%2F%3Fquery%3Dpython%26page%3D2%26ka%3Dpage-2; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1539249307; __a=1223038.1539076337.1539076344.1539227402.57.3.21.21"
    }
    proxy_pool = ProxyPool.Proxy_Pool()
    self.proxies = []  # list of proxy IPs
    if not proxy_pool.Is_Empty():
        ip, self.proxies = proxy_pool.pop_all()
def __init__(self):
    self._start = 0
    self._end = 0
    self._keyword = ''
    self._timeout = 0
    self._recon = 0
    self._filename = ''
    self._urls = []
    self._re_urls = []
    self._pool = ProxyPool.ProxyPool()
    self._searchURL = ''
    self._punc = ',.!?:;~\'\",。!?:;、~…⋯()<>「」[]【】<>〈〉《》()﹙﹚『』«»“”’{}\\[\\]'  # the '[]' needs to be the last one
    self._stop_words = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄧㄨㄩㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦ \
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ \
ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ \
12345678901234567890ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!?©@#$%^&*.…⋯→‧•◎※■+・ˇˋˊ˙ \
()_+=-\\[]/,.;:`~|{}<>\'\"\n\t\r\xa0,。、!?「」[]【】<>〈〉《》():;«»*˙●/_—『』×@#$%︿&-=〜~≡|│║★☆Ⓡ➠†§– \
♥❤“”’ ̄▽😊😆😋😏😅😀😍😎📍👍🚫🐍💟🎉⊙◢◤˚゚・。`↑↓﹙﹚▲▼◆◈▣✥▒👉►⓪①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬〝〞▌☀ღ▶➦ⓞ☎▋♡▂▃▄▅▆▊▩⇓✽�🕘㊣╳'
    self._sw_no_punc = re.sub('([{}])'.format(self._punc), '', self._stop_words)
    self._sw_dict = {w: True for w in self._stop_words}
    self._sw_no_punc_dict = {w: True for w in self._sw_no_punc}
    self._websites = {'Unknown': -1, 'Pixnet': 0, 'Hares': 1}
    self._title_tags = ['title', 'entry-title']                       # {0: 'Pixnet', 1: 'Hares'}
    self._content_tags = ['article-content-inner', 'entry-content']  # {0: 'Pixnet', 1: 'Hares'}
    self._result = ''
def __init__(self):
    self.start_url = "https://www.zhipin.com/"
    self.cities_code = {"深圳": "c101280600-p100109/h_101280600/",
                        "上海": "c101020100-p100109/h_101020100/",
                        "北京": "c101010100/h_101010100/",
                        "南京": "c101190100/h_101190100/",
                        "杭州": "c101210100/h_101210100/"}  # Beijing, Nanjing, Hangzhou
    self.headers = {
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate, br",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Cache-Control': "no-cache",
        'Connection': "keep-alive",
        'Cookie': "sid=sem_pz_bdpc_dasou_title; JSESSIONID=" "; __g=sem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1539076339; __c=1539076344; __l=r=https%3A%2F%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&l=%2Fwww.zhipin.com%2Fjob_detail%2F%3Fka%3Dheader-job&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_ti; lastCity=101190100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101190100%2Fh_101190100%2F%3Fquery%3Dpython%26page%3D2; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1539130799; __a=1223038.1539076337.1539076337.1539076344.24.2.23.24",
        'Host': "www.zhipin.com",
        'Pragma': "no-cache",
        'Upgrade-Insecure-Requests': "1",
        "User-Agent": random.choice(UserAgents.agents),
        # Request forgery: Referer indicates where the request came from; cross-site request forgery (CSRF) could also be tried here
        'Referer': 'https://www.zhipin.com/c101280600-p100109/h_101280600/?query=python&page=1&ka=page-1'
    }
    pool = redis.ConnectionPool(host='localhost', port=6379)
    self.conn = redis.Redis(connection_pool=pool)
    self.proxy_pool = ProxyPool.Proxy_Pool()
    self.proxies = []
    self.ip = []
    if not self.proxy_pool.Is_Empty():
        self.ip, self.proxies = self.proxy_pool.pop_all()
async def __process_fourth_html(self):
    '''
    foreign website
    :return:
    '''
    page_count = 6
    urls = [
        self.__proxy_urls[3].format(_i + 1) for _i in range(page_count)
    ]
    tasks = [self.__get_html(url) for url in urls]
    done = await asyncio.gather(*tasks)
    import base64
    for html in done:
        if not html:
            continue
        # proxies on this site are embedded as base64 strings inside Proxy('...') calls
        proxies = re.findall(r"Proxy\('(.*?)'\)", html)
        for proxy in proxies:
            temp = base64.b64decode(proxy).decode()
            ip = re.findall(r'\d+\.\d+\.\d+\.\d+', temp)[0]
            port = re.findall(r':\d+', temp)[0][1:]  # drop the leading ':'
            new = ProxyPool.proxy(ip, port)
            if new not in self.__pool:
                self.__pool.append(new)
                print(f'process_fourth_html got {new.get_string_address()}')
async def __process_third_html(self):
    page_count = 2
    urls = [
        self.__proxy_urls[2].format(_i + 1) for _i in range(page_count)
    ]
    tasks = [self.__get_html(url) for url in urls]
    done = await asyncio.gather(*tasks)
    for html in done:
        if html:
            soup = BeautifulSoup(html, features='html5lib')
            res = soup.find_all('td')
            for _i, item in enumerate(res):
                str_ = str(item.string)
                temp = re.findall(r'\d+\.\d+\.\d+\.\d+', str_)
                if temp:
                    ip = temp[0]
                    port = str(res[_i + 1].string).strip()
                    new = ProxyPool.proxy(ip, port)
                    if new not in self.__pool:
                        self.__pool.append(new)
                        print(f'process_third_html got {new.get_string_address()}')
async def __process_first_html(self):
    # tasks = [self.__get_html(self.__proxy_urls[0].format(_i)) for _i in range(1, 4)]
    page_count = 4
    done = []
    for _i in range(1, page_count):
        url = self.__proxy_urls[0].format(_i)
        html = await self.__get_html(url)
        done.append(html)
        time.sleep(1.148)  # throttle requests to this site
    # done = await asyncio.gather(*tasks)
    for html in done:
        if html:
            soup = BeautifulSoup(html, features="html5lib")
            ips = soup.find_all('td', {'data-title': 'IP'})
            ports = soup.find_all('td', {'data-title': 'PORT'})
            proxies = zip(ips, ports)
            # print(len(ips))
            # os._exit(-1)
            for item in proxies:
                proxy = ProxyPool.proxy(item[0].string, item[1].string)
                if proxy not in self.__pool:
                    self.__pool.append(proxy)
                    print(f'process_first_html got {proxy.get_string_address()}')
def __init__(self, exit_pipe, count_pipe, data_pipe):
    self.pool = redis.ConnectionPool(host=REDIS_HOST, port=REDIS_PORT)
    self.r_con = redis.Redis(connection_pool=self.pool)
    self.ua = UserAgent()
    self.exit_flag = False  # exit signal
    self.exit_pipe = exit_pipe
    self.count_pipe = count_pipe
    self.data_pipe = data_pipe
    self.crawl_threads = []  # crawler threads
    self.data_format = [
        "uid", "user_name", "user_sign", "gender", "level", "birthday",
        "coins", "vip", "favorite_list", "favorite_sum", "follow", "fans",
        "live_title", "audio", "video", "album", "article", "play_count",
        "read_count", "time"
    ]
    self.pages_count = 0
    self.items_count = 0
    self.redis = ProxyPool.ProxyAPI()
    self.run()
def makeSiteThreads():
    import pkgutil
    import sites

    # load proxy file
    ProxyPool.restore()

    sites_modules = []
    for importer, modname, ispkg in pkgutil.iter_modules(sites.__path__):
        sites_modules.append(importer.find_module(modname).load_module(modname))

    site_threads = []
    for m in sites_modules:
        t = siteThread(m.__name__, m.crawl)
        t.daemon = True
        site_threads.append(t)
    return site_threads
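A minimal usage sketch, not part of the original sources: it assumes makeSiteThreads() is called from a top-level script and simply starts and joins the returned per-site crawler threads.

if __name__ == '__main__':
    # hypothetical driver script: one thread per site module
    threads = makeSiteThreads()
    for t in threads:
        t.start()   # each thread runs its site module's crawl()
    for t in threads:
        t.join()    # wait for all site crawlers to finish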
def set_proxy():
    proxy = ProxyPool.get_proxy_ip()
    settings = {
        "httpProxy": proxy,
        "sslProxy": proxy
    }
    print('proxy_ip', proxy)
    proxy = Proxy(settings)
    cap = DesiredCapabilities.CHROME.copy()
    cap['platform'] = "WINDOWS"
    cap['version'] = "10"
    proxy.add_to_capabilities(cap)
    return cap
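For reference, a minimal usage sketch, not taken from the original code: it assumes Selenium 3.x (where webdriver.Chrome still accepts desired_capabilities) and a chromedriver binary on PATH.

from selenium import webdriver

cap = set_proxy()                                     # capabilities with the proxy attached
driver = webdriver.Chrome(desired_capabilities=cap)   # hypothetical driver start-up
driver.get("https://www.zhipin.com/")                 # requests now go through the proxy
driver.quit()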
def __universal_soup(self, flag: str, html):
    res = html
    soup = BeautifulSoup(res, features='html5lib')
    ipandport = soup.find_all('td')
    for i, ip in enumerate(ipandport):
        temp = re.findall(r'\d+\.\d+\.\d+\.\d+', str(ip.string))
        if temp:
            # the <td> that follows the IP holds the port
            port = str(ipandport[i + 1].string).strip()
            new = ProxyPool.proxy(temp[0], port)
            if new not in self.__pool:
                self.__pool.append(new)
                print(f'process_{flag}_html got {new.get_string_address()}')
def getHTMLText(self, code="utf-8"):
    if not self.parseURL():
        return
    if self.cache:
        self.html = self.cache[self.url]
    if not self.html:
        p_p = ProxyPool.Proxy_Pool()
        proxy = db.proxy
        tag = True
        while tag:
            proxies = proxy.find_one()
            if proxies is None:
                # no proxies stored: fetch fresh IPs, then retry
                ProxyGetter.get_ip()
                continue
            one_p = str(proxies['类型'])  # '类型' = proxy type, e.g. http / https
            two_p = str(proxies['IP'])
            three_p = str(proxies['PORT'])
            # print(one_p)
            # print(type(one_p))
            flag = p_p.test_connection(one_p, two_p, three_p)
            if flag == False:
                p_p.del_record(proxies['IP'])
                # proxies = proxy.find_one()
            else:
                tag = False
        proxy_ip = {
            str(proxies['类型']): str(proxies['IP']) + ":" + str(proxies['PORT'])
        }
        try:
            ua = {
                'user-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
            }
            r = requests.get(self.url, headers=ua, proxies=proxy_ip)
            r.raise_for_status()
            r.encoding = code
            self.html = r.text
            self.cache[self.url] = self.html
            # p_p.clean_nonworking()
        except:
            # p_p.clean_nonworking()
            pass
async def __process_fifth_html(self):
    '''
    forbidden website
    :return:
    '''
    proxy_count = 20
    url = self.__proxy_urls[4].format(proxy_count)
    html = await self.__get_html(url)
    print(html)
    items = re.findall(r'\d+\.\d+\.\d+\.\d+:\d{1,5}', html)
    for item in items:
        ip = re.findall(r'\d+\.\d+\.\d+\.\d+', item)[0]
        port = item.replace(ip, "")[1:]
        new = ProxyPool.proxy(ip, port)
        if new not in self.__pool:
            self.__pool.append(new)
            print(f'process_fifth_html got {new.get_string_address()}')
async def __process_second_html(self):
    urls = self.__proxy_urls[1]
    tasks = [self.__get_html(url) for url in urls]
    done = await asyncio.gather(*tasks)
    for res in done:
        if res:
            soup = BeautifulSoup(res, features='html5lib')
            ipandport = soup.find_all('td')
            for i, ip in enumerate(ipandport):
                temp = re.findall(r'\d+\.\d+\.\d+\.\d+', str(ip.string))
                if temp:
                    port = str(ipandport[i + 1].string).strip()
                    new = ProxyPool.proxy(temp[0], port)
                    if new not in self.__pool:
                        self.__pool.append(new)
                        print(f'process_second_html got {new.get_string_address()}')
async def __process_sixth_html(self):
    page_count = 5
    urls = [
        self.__proxy_urls[5].format(_i + 1) for _i in range(page_count)
    ]
    tasks = [self.__get_html(url) for url in urls]
    done = await asyncio.gather(*tasks)
    for html in done:
        if html:
            soup = BeautifulSoup(html, features='html5lib')
            ipandport = soup.find_all('td')
            for i, ip in enumerate(ipandport):
                temp = re.findall(r'\d+\.\d+\.\d+\.\d+', str(ip.string))
                if temp:
                    port = str(ipandport[i + 1].string).strip()
                    new = ProxyPool.proxy(temp[0], port)
                    if new not in self.__pool:
                        self.__pool.append(new)
                        print(f'process_sixth_html got {new.get_string_address()}')
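__process_second_html and __process_sixth_html repeat the same td-scanning logic that __universal_soup already implements. A minimal sketch of how one of them could delegate to that helper instead; this is a hypothetical refactor, assuming __get_html and __universal_soup behave as shown above and are not changed.

async def __process_sixth_html(self):
    # hypothetical refactor: fetch each page, then hand the HTML to the shared parser
    page_count = 5
    urls = [self.__proxy_urls[5].format(_i + 1) for _i in range(page_count)]
    done = await asyncio.gather(*(self.__get_html(url) for url in urls))
    for html in done:
        if html:
            self.__universal_soup('sixth', html)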
import sys
sys.path.append('..')
sys.path.append('.')

import ProxyPool

if __name__ == '__main__':
    pool = ProxyPool.proxy_pool()
    pool.start_work()
        while temp:
            temp = bytes.decode(temp)
            info.append(temp)
            temp = self.__redis.rpop('decrease')
        feedback['decrease'] = info
        return feedback


if __name__ == '__main__':
    test = database()
    proxy = 'http://119.254.94.114:34422'
    # test.proxy_feedback(proxy, True)
    a = ProxyPool.proxy('192.141.32.2', '2367')
    b = ProxyPool.proxy('111.23.214.123', '23')
    c = ProxyPool.proxy('111.23.214.123', '231')
    test.add_proxies([a, b, c])
    # test.delete_proxies([a, b])
    # test.add_proxies([a, b, c])
    # test.add_proxies([a, b, c])
    #
    # test.proxy_feedback(a.get_string_address(), True)
    # test.proxy_feedback(a.get_dict_address(), False)
    # test.proxy_feedback(b.get_dict_address(), True)
    #
    # print(test.get_feedback())
    # temp = test.get_all_increase()
sp = Spider()
sp.updateDatabase()  # update the database
# start_url = input("Enter the home page of the target site, e.g. http://www.xiaohuar.com/ : ")
# if parseURL(start_url) == False:
#     exit(1)
start_url = "http://www.xiaohuar.com/"
# start_url = u"http://www.baidu.com/"
sp.main(start_url, start_url)

spider_schedule = SpiderSchedule.SpiderSchedule()
downloads = downloaderware()
threads = []
p_p = ProxyPool.Proxy_Pool()
proxy_counter = 1    # counter for proxy-pool refreshes
threads_counter = 0  # thread counter

t0 = threading.Thread(target=spider_schedule.SpiderSchedule, args=(start_url,))
t0.start()
threads.append(t0)

while threads and threads_counter <= 50:  # the crawl is still active
    for thread in threads:
        if not thread.is_alive():
            # remove the stopped threads
            threads.remove(thread)
    t2 = threading.Thread(target=downloads.downloaderware, args=(start_url,))
    t2.start()
def run(self):
    print("starting proxy dump thread")
    while True:
        ProxyPool.dump()
        print("[Daemon] Dumped proxies")
        time.sleep(30)
def run(self):
    print("starting proxy pools")
    while True:
        ProxyPool.cleanNonWorking()