def __init__(self, threadNum):
    """Initialise crawler state: HTTP headers, site URL, work queue and DB handle.

    Args:
        threadNum: Stored unchanged on ``self.threadNum``.
    """
    # Browser-like User-Agent sent with every request.
    userAgent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.101 Safari/537.36'
    )
    self.headers = {'User-Agent': userAgent}
    self.URL = "http://www.lagou.com/"
    # Job names are stored here as tuples.
    self.position = []
    self.q_req = Queue()
    self.threadNum = threadNum
    self.lagou_db = LGDB()
class LG:
    """Crawler that pulls job postings from lagou.com and stores them via LGDB.

    ``run`` reads the job-category names from the site's front page, puts
    them on a queue, and lets ``threadNum`` daemon worker threads fetch the
    postings for each name with ``getJobList``.
    """

    # Seconds to wait for a single HTTP response. Without a timeout a stalled
    # connection would block a worker (and therefore ``run``) indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, threadNum):
        """Set up headers, the site URL, the work queue and the DB handle.

        Args:
            threadNum: Number of worker threads ``run`` will start.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/45.0.2454.101 Safari/537.36'}
        self.URL = "http://www.lagou.com/"
        # Filled by getPosition with (name, link) tuples.
        self.position = []
        self.q_req = Queue()
        self.threadNum = threadNum
        self.lagou_db = LGDB()

    def getPosition(self):
        """Collect (name, link) pairs for every job category on the front page.

        Returns:
            ``self.position``, extended with the pairs found. If the front
            page could not be downloaded the list is returned unchanged.
        """
        pageCode = self.getPageCode(self.URL)
        if pageCode is None:
            # getPageCode has already reported the failure; there is no
            # markup to hand to PyQuery.
            return self.position
        query = PyQuery(pageCode)
        positionData = query(".menu_sub.dn .reset dd a")
        for i in range(positionData.length):
            data = positionData.eq(i)
            self.position.append((data.text(), data.attr("href")))
        print("获取职业列表成功!")
        return self.position

    def getPageCode(self, url):
        """Download ``url`` and return its body decoded as UTF-8.

        Sleeps a random 0-5 seconds first to space out requests.

        Returns:
            The decoded page text, or ``None`` if the request or the
            decoding failed (the error is printed).
        """
        time.sleep(random.randint(0, 5))
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            return response.content.decode('utf-8')
        except Exception as exc:
            # Best-effort fetch: report the cause and let the caller decide.
            print("*******连接有误********", exc)
            return None

    def getJobList(self, kd):
        """Fetch up to 30 result pages for keyword ``kd`` and store every job.

        Keywords the database reports as already recorded are skipped. Once
        all pages are stored the keyword is recorded so later runs skip it.
        Errors from the HTTP call, the JSON payload or the database propagate
        to the caller; in that case the keyword is not recorded.

        Args:
            kd: Job keyword sent as the ``kd`` form field.

        Returns:
            Always ``None``.
        """
        # Skip keywords that were fully stored by a previous run.
        if self.lagou_db.isRecordJobName(kd):
            print("数据库已记录过", kd)
            return None
        jobsId = []
        for index in range(1, 31):
            data = {'kd': kd, 'pn': index}
            time.sleep(1)  # pace the requests
            jsonData = requests.post(
                "http://www.lagou.com/jobs/positionAjax.json?",
                data=data,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            jobs = jsonData.json()["content"]["result"]
            if not jobs:
                # An empty page means there are no further results.
                break
            print("开始获取%s的数据第%d页的%d条数据" % (kd, index, len(jobs)))
            for job in jobs:
                # Use the position id as the record's primary key.
                positionId = job['positionId']
                job["_id"] = positionId
                job["companyLogo"] = "http://www.lagou.com/" + job["companyLogo"]
                jobsId.append(positionId)
                self.lagou_db.addJob(job)
        print("%s的职位录入完毕!一共%d条数据" % (kd, len(jobsId)))
        # Remember the keyword so it is not fetched again next time.
        self.lagou_db.recordToSave(kd)

    def workingThread(self):
        """Worker loop: take keywords off the queue and fetch their jobs.

        Runs forever; intended to be the target of a daemon thread. Every
        queue item is acknowledged with ``task_done`` even when fetching it
        fails, so ``q_req.join()`` in ``run`` cannot hang on a failed
        keyword, and the worker stays alive for the remaining items.
        """
        while True:
            kd = self.q_req.get()
            try:
                self.getJobList(kd)
                time.sleep(1)
            except Exception as exc:
                # Thread boundary: report and move on to the next keyword.
                print("获取%s的数据失败: %r" % (kd, exc))
            finally:
                self.q_req.task_done()

    def run(self):
        """Queue every job-category name and process them with worker threads.

        Blocks until every queued name has been processed.
        """
        # Fetch the category list first; only the names are needed here.
        for name, _link in self.getPosition():
            self.q_req.put(name)
        for _ in range(self.threadNum):
            worker = Thread(target=self.workingThread)
            worker.daemon = True  # do not keep the process alive after join()
            worker.start()
        self.q_req.join()