def craw(self):
    print('crawling job info from zhilian')
    for url in self.get_url():
        downloader = htmldownloader.HtmlDownLoader(url)
        html_cont = downloader.download()
        parser = htmlparser.HtmlParser(html_cont)
        soup = parser.get_soup()
        items = soup.find("div", id="newlist_list_content_table").find_all(
            "table", class_="newlist")
        items.remove(items[0])  # the first "newlist" table is the column header, not a job row
        for item in items:
            tmp_dict = {}
            tmp_dict['media'] = '智联'
            tmp_dict['jobname'] = item.find(
                "td", class_="zwmc").find("a").get_text().strip()
            tmp_dict['joblink'] = item.find(
                "td", class_="zwmc").find("a").get("href")
            tmp_dict['company'] = item.find(
                "td", class_="gsmc").find("a").get_text().strip()
            tmp_dict['location'] = item.find(
                "td", class_="gzdd").get_text().strip()
            tmp_dict['salary'] = item.find(
                "td", class_="zwyx").get_text().strip()
            self.data.append(tmp_dict)
        time.sleep(3)  # pause between result pages to avoid hammering the site
    print('...got %d job info items from zhilian' % len(self.data))
    print(self.data)
    return self.data
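# All craw()/get_url() methods in this section assume `import re`, `import time`,
# and the project's own htmldownloader / htmlparser modules are in scope. Those
# two helpers are not shown here; below is a minimal sketch of what they could
# look like, assuming requests and BeautifulSoup. Only the interface —
# HtmlDownLoader(url).download() returning raw bytes and
# HtmlParser(html_cont).get_soup() returning a parsed soup — is taken from the
# calls above; the headers, timeout, and parser choice are assumptions.

import requests
from bs4 import BeautifulSoup


class HtmlDownLoader(object):
    """Fetch one page; craw() constructs a fresh instance per URL."""

    def __init__(self, url):
        self.url = url

    def download(self):
        # a browser-like User-Agent is an assumption; many job boards
        # reject the default python-requests one
        headers = {'User-Agent': 'Mozilla/5.0'}
        resp = requests.get(self.url, headers=headers, timeout=10)
        return resp.content  # raw bytes, matching the html_cont.decode(...) seen below


class HtmlParser(object):
    """Wrap the downloaded bytes in a BeautifulSoup object."""

    def __init__(self, html_cont):
        self.html_cont = html_cont

    def get_soup(self):
        return BeautifulSoup(self.html_cont, 'html.parser')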
def get_url(self):
    url = 'http://www.chinahr.com/sou/?orderField=relate&keyword=Java+Python+PHP+.NET+C%23+C%2B%2B+C+Delphi+Perl+Ruby+Hadoop+Node.js+MySQL+SQLServer&city=39,416;39,417;39,421&industrys=1100&page=1'
    downloader = htmldownloader.HtmlDownLoader(url)
    html_cont = downloader.download()
    parser = htmlparser.HtmlParser(html_cont)
    soup = parser.get_soup()
    # the second-to-last pager link holds the last page number
    max_page = int(soup.find("div", class_="pageList").find_all("a")[-2].get_text())
    curr_page = 1
    urls = []
    while curr_page <= max_page:
        urls.append(re.sub(r"page=\d", "page=%d" % curr_page, url))
        curr_page += 1
    print('...got %d pages about job info from chinahr' % len(urls))
    return urls
def get_url(self):
    url = 'http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=280800%2C280200%2C280400%2C00&district=000000&funtype=2600%2C2500%2C0100&industrytype=01%2C38%2C32&issuedate=9&providesalary=99&keywordtype=2&curr_page=1&lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14&dibiaoid=0&confirmdate=9'
    downloader = htmldownloader.HtmlDownLoader(url)
    html_cont = downloader.download()
    parser = htmlparser.HtmlParser(html_cont)
    soup = parser.get_soup()
    # the pager caption mixes the total page count with text; keep only the digits
    max_page = soup.find("div", class_="dw_page").find("span", class_="td").get_text()
    max_page = int(re.sub(r"\D", "", max_page))
    curr_page = 1
    urls = []
    while curr_page <= max_page:
        urls.append(re.sub(r"curr_page=\d", "curr_page=%d" % curr_page, url))
        curr_page += 1
    print('...got %d pages about job info from 51job' % len(urls))
    return urls
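# Both get_url() variants above use the same pagination trick: scrape the last
# page number from the pager widget, then rewrite the page parameter of the
# seed URL once per page. A standalone illustration of just that rewrite step
# (the seed URL and max_page here are made up for readability):

import re

seed = 'http://example.com/search?keyword=python&page=1'
max_page = 3  # in the real code this is scraped from the pager
urls = [re.sub(r"page=\d+", "page=%d" % p, seed) for p in range(1, max_page + 1)]
print(urls)
# ['http://example.com/search?keyword=python&page=1',
#  'http://example.com/search?keyword=python&page=2',
#  'http://example.com/search?keyword=python&page=3']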
def craw(self):
    print('crawling job info from chinahr')
    for url in self.get_url():
        downloader = htmldownloader.HtmlDownLoader(url)
        html_cont = downloader.download()
        parser = htmlparser.HtmlParser(html_cont)
        soup = parser.get_soup()
        items = soup.find("div", {"class": "resultList"}).find_all("div", class_="jobList")
        for item in items:
            tmp_dict = {}
            tmp_dict['media'] = '中华英才'
            tmp_dict['jobname'] = item.find("li", class_="l1").find("span", class_="e1").find("a").get_text().strip()
            tmp_dict['joblink'] = item.find("li", class_="l1").find("span", class_="e1").find("a").get("href")
            tmp_dict['company'] = item.find("li", class_="l1").find("span", class_="e3").find("a").get_text().strip()
            # the location is rendered inside square brackets; strip them off
            tmp_dict['location'] = item.find("li", class_="l2").find("span", class_="e1").get_text().split(']')[0].replace('[', '').strip()
            tmp_dict['salary'] = item.find("li", class_="l2").find("span", class_="e2").get_text().strip()
            self.data.append(tmp_dict)
        time.sleep(3)  # pause between result pages
    print('...got %d job info items from chinahr' % len(self.data))
    print(self.data)
    return self.data
def craw(self):
    print('crawling job info from 51job')
    for url in self.get_url():
        downloader = htmldownloader.HtmlDownLoader(url)
        html_cont = downloader.download()
        parser = htmlparser.HtmlParser(html_cont)
        soup = parser.get_soup()
        items = soup.find("div", {"id": "resultList"}).find_all("div", class_="el")
        items.remove(items[0])  # the first "el" div is the column header, not a job row
        for item in items:
            tmp_dict = {}
            tmp_dict['media'] = '前程无忧'
            tmp_dict['jobname'] = item.find("p", class_="t1").find("a").get_text().strip()
            tmp_dict['joblink'] = item.find("p", class_="t1").find("a").get("href")
            tmp_dict['company'] = item.find("span", class_="t2").find("a").get_text()
            tmp_dict['location'] = item.find("span", class_="t3").get_text()
            tmp_dict['salary'] = item.find("span", class_="t4").get_text()
            self.data.append(tmp_dict)
    print('...got %d job info items from 51job' % len(self.data))
    print(self.data)
    return self.data
def craw(self):
    print('crawling job info from liepin')
    for url in self.get_url():
        downloader = htmldownloader.HtmlDownLoader(url)
        html_cont = downloader.download()
        # print(html_cont.decode('utf-8'))
        parser = htmlparser.HtmlParser(html_cont)
        soup = parser.get_soup()
        items = soup.find("div", {"class": "sojob-result"}).find_all("li")
        for item in items:
            tmp_dict = {}
            tmp_dict['media'] = '猎聘'
            # the job title node may be either a <span> or an <h3>
            tmp_dict['jobname'] = item.find("div", class_="job-info").find(["span", "h3"]).get("title")
            tmp_dict['joblink'] = item.find("div", class_="job-info").find(["span", "h3"]).find("a").get("href")
            tmp_dict['company'] = item.find("div", class_="company-info").find("p", class_="company-name").find("a").get_text().strip()
            tmp_dict['location'] = item.find("div", class_="job-info").find("p", class_="condition").find("a", class_="area").get_text().strip()
            tmp_dict['salary'] = item.find("div", class_="job-info").find("p", class_="condition").find("span", class_="text-warning").get_text().strip()
            self.data.append(tmp_dict)
        time.sleep(5)  # liepin is crawled more slowly than the other boards
    print('...got %d job info items from liepin' % len(self.data))
    print(self.data)
    return self.data
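# The liepin crawler above relies on a less common BeautifulSoup feature:
# find() accepts a list of tag names and returns the first element matching
# any of them. A standalone check with made-up markup:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><h3 title="Job A"><a href="/a">A</a></h3></div>', 'html.parser')
node = soup.find("div").find(["span", "h3"])  # matches the <h3> here
print(node.get("title"))           # Job A
print(node.find("a").get("href"))  # /a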
def craw(self):
    print('crawling job info from ganji')
    for url in self.get_url():
        downloader = htmldownloader.HtmlDownLoader(url)
        html_cont = downloader.download()
        parser = htmlparser.HtmlParser(html_cont)
        soup = parser.get_soup()
        items = soup.select("dl.list-noimg.job-list")
        for item in items:
            tmp_dict = {}
            tmp_dict['media'] = '赶集'
            tmp_dict['jobname'] = item.find("dt").find("a").get_text().strip()
            tmp_dict['joblink'] = item.find("dt").find("a").get("href")
            tmp_dict['company'] = item.find("dd", class_="company").find("a").get_text()
            tmp_dict['location'] = item.find("dd", class_="pay").get_text()
            tmp_dict['salary'] = '未知'  # "unknown": the list page exposes no salary field
            self.data.append(tmp_dict)
        time.sleep(3)  # pause between result pages
    print('...got %d job info items from ganji' % len(self.data))
    print(self.data)
    return self.data
def craw(self):
    print('crawling job info from 58')
    for url in self.get_url():
        downloader = htmldownloader.HtmlDownLoader(url)
        html_cont = downloader.download()
        parser = htmlparser.HtmlParser(html_cont)
        soup = parser.get_soup()
        items = soup.find("div", id="infolist").find_all("dl")
        for item in items:
            tmp_dict = {}
            tmp_dict['media'] = '58同城'
            tmp_dict['jobname'] = item.find("dt").find("a").get_text().strip()
            tmp_dict['joblink'] = item.find("dt").find("a").get("href")
            tmp_dict['company'] = item.find("dd", class_="w271").find("a").get_text()
            tmp_dict['location'] = item.find("dd", class_="w96").get_text()
            tmp_dict['salary'] = '未知'  # "unknown": the list page exposes no salary field
            self.data.append(tmp_dict)
        time.sleep(3)  # pause between result pages
    print('...got %d job info items from 58' % len(self.data))
    print(self.data)
    return self.data
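# Every craw() above returns self.data: a list of dicts with the keys
# media / jobname / joblink / company / location / salary. A sketch of how
# that result could be consumed downstream — the sample record is made up,
# and persisting to CSV is an assumption, not part of the original code:

import csv

jobs = [
    {'media': '智联', 'jobname': 'Python工程师', 'joblink': 'http://example.com/1',
     'company': 'ACME', 'location': '上海', 'salary': '10001-15000'},
]

# utf-8-sig keeps the Chinese fields readable when the CSV is opened in Excel
with open('jobs.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(
        f, fieldnames=['media', 'jobname', 'joblink', 'company', 'location', 'salary'])
    writer.writeheader()
    writer.writerows(jobs)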