def crawl_job(self):
    count = 0
    current_time = datetime.now().strftime("%H:%M %d-%m-%Y")
    print('\n====== Starting Crawling at : ' + current_time + ' ==============================\n')

    count_prox = 1
    proxy = None  # fall back to no proxy when proxy rotation is disabled
    if self.num_prox > 0:
        proxy = prox.Proxies(number_of_proxies=1).getProxiesAllInOne()[0]

    for url in self.stations_url:
        # Derive the station name from the URL path
        name = url.split('stations')[-1].replace('/', '')

        # Rotate to a fresh proxy ({ ip: ..., port: ... }) every num_prox requests
        if self.num_prox > 0 and count_prox % self.num_prox == 0:
            print("Searching for new proxy")
            proxy = prox.Proxies(number_of_proxies=1).getProxiesAllInOne()[0]
        count_prox += 1

        # Crawl the station page and keep the record only if it is complete
        cs = cS.Crawl_Station(url, proxy=proxy)
        data = cs.getInfo()
        if len(data) >= 6:
            count += 1
            data['TimeCrawled'] = datetime.now().strftime("%H:%M")
            self.writeCSV(name, data)

    current_time = datetime.now().strftime("%H:%M %d-%m-%Y")
    print('\n====== Crawling Ended at ' + current_time + ' ==============================\n')
    self.writeLogFile(current_time, count)
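# --- Illustration (not part of the project): a minimal, self-contained sketch
# of the proxy-rotation pattern used in crawl_job above. fetch_proxy() and the
# demo loop are hypothetical stand-ins for
# prox.Proxies(number_of_proxies=1).getProxiesAllInOne()[0] and the
# cS.Crawl_Station fetch; only the rotation logic mirrors the method.
def _proxy_rotation_demo(urls, rotate_every=5):
    def fetch_proxy():
        # Placeholder: the real call returns a dict like {'ip': ..., 'port': ...}
        return {'ip': '127.0.0.1', 'port': '8080'}

    proxy = fetch_proxy()
    for i, url in enumerate(urls, start=1):
        if i % rotate_every == 0:
            proxy = fetch_proxy()  # swap in a fresh proxy every `rotate_every` requests
        print('would fetch', url, 'via', proxy['ip'] + ':' + proxy['port'])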
"Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644" .format(timeStamp=timeStamp, time=time1), "Referer": "https://m.lagou.com/search.html", "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36", "X-Requested-With": "XMLHttpRequest", } city = "广州" positionName = "python" # pageNo = "1" pageSize = "15" proxies = Proxies() def get_detail_url(pageNo, proxies): base_url = "https://m.lagou.com/search.json?city={city}&positionName={positionName}&pageNo={pageNo}&" \ "pageSize={pageSize}".format(city=city,positionName=positionName,pageNo=pageNo,pageSize=pageSize) res = requests.get(base_url, headers=headers, proxies=proxies) content = res.content.decode() dict1 = json.loads(content) # print(dict1) list1 = dict1['content']['data']['page']['result'] for i in list1: yield "https://m.lagou.com/jobs/{}.html".format(i['positionId']) # 职位名称 薪资 工作地点 工作年限 学历要求 企业名字 职位描述