import json
import os
import time

import requests
from bs4 import BeautifulSoup

import config


def parse_link(url, mongo_table):
    """List-page crawler: walk pages 1-30 and store each posting in MongoDB."""
    # url = 'https://www.lagou.com/zhaopin/Python/'
    for page in range(1, 31):
        link = '{}{}/?filterOption=3'.format(url, page)
        resp = requests.get(link, headers=config.head())
        if resp.status_code == 404:
            continue
        soup = BeautifulSoup(resp.text, 'lxml')
        pos_link = config.position()            # CSS selectors for the fields
        sel = list(map(soup.select, pos_link))  # one result list per selector
        # zip(*sel) transposes the per-selector lists into per-posting rows
        for position, add, release_time, money, need, company, tag, welfare in zip(*sel):
            record = {
                'position': position.get_text(),
                'add': add.get_text(),
                'release_time': release_time.get_text(),
                'money': money.get_text(),
                'need': need.get_text().split('\n')[2],
                'company': company.get_text(),
                'tag': tag.get_text().replace('\n', '-'),
                'welfare': welfare.get_text(),
            }
            save_database(record, mongo_table)
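
# A minimal sketch of the save_database helper referenced above, assuming
# mongo_table is a pymongo collection passed in by the caller; the project's
# real helper may differ.
def save_database(data, mongo_table):
    # insert_one is standard pymongo; no deduplication is attempted here
    mongo_table.insert_one(data)
    print('saved:', data['position'])
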
def parse_link(url): """内容页爬取字段""" # headers = config.random_header() headers = config.head() resp = requests.get(url, headers=headers) time.sleep(10) if resp.status_code == 404: pass else: soup = BeautifulSoup(resp.text, 'lxml') pos_link = config.position() # 链接 sel = list(map(soup.select, pos_link)) # 筛选 for position, region, release_time, money, need, company, tag, welfare, industry in zip( *sel): position = position.get_text() city = region.get_text().split('·')[0] area = region.get_text().split('·')[1] release_time = release_time.get_text() money = money.get_text() need = need.get_text().split('\n')[2] company = company.get_text() tag = tag.get_text().replace('\n', '-') welfare = welfare.get_text() industry = industry.get_text().replace('\n', '').replace(' ', '') info = [ position, city, area, release_time, money, need, company, tag, welfare, industry ] print(info) thread_lock.release() #解锁
def save_img(url, name):
    """Download a single image into ./img; url is protocol-relative."""
    url = "http:{}".format(url)
    if not os.path.isdir("img"):
        os.mkdir("img")
    img_path = os.path.join(os.getcwd(), "img")
    r = requests.get(url, headers=config.head(config.img_temp))
    with open(os.path.join(img_path, name), 'wb') as f:
        f.write(r.content)
    print(name, " done")
    time.sleep(3)  # throttle between downloads
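
# Example call, assuming a protocol-relative URL as scraped from the page
# markup (both the URL and the filename here are made-up illustrations):
# save_img('//example.com/pics/abc123.jpg', '123_sample.jpg')
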
def test(page):
    """Fetch one page of the picture-list API and download any new images."""
    t = str(int(time.time()))[:-2] + '558'  # pseudo-timestamp query parameter
    head = config.head()
    url = config.urldemo.format(str(page), t)
    r = requests.get(url, headers=head)
    j = json.loads(r.text)
    for i in j["data"]["pic_list"]:
        try:
            or_picurl = i['original_pic']
            name = (str(i['mid']) + '_' + i['sub_name'].replace(":", '')
                    + '.' + or_picurl[-3:])
        except KeyError:  # skip entries missing the expected fields
            continue
        if os.path.isfile(os.path.join(os.getcwd(), "img", name)):
            print("{} already exists, skipping".format(name))
            continue
        save_img(or_picurl, name)
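
# A minimal driver, assuming the API pages are numbered from 1; the page
# count here is an arbitrary example:
if __name__ == '__main__':
    for page in range(1, 6):
        test(page)
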
def parse_link(url): """内容页爬取字段""" # resp = requests.get(url, headers=headers, proxies=proxies) # headers = config.random_header() headers = config.head() print(headers) resp = requests.get(url, headers=headers) if resp.status_code == 404: pass else: soup = BeautifulSoup(resp.text, 'lxml') pos_link = config.position() # 链接 sel = list(map(soup.select, pos_link)) # 筛选 for position, region, release_time, money, need, company, tag, welfare, industry in zip( *sel): position = position.get_text() # region = region.get_text() city = region.get_text().split('·')[0] area = region.get_text().split('·')[1] release_time = release_time.get_text() money = money.get_text() need = need.get_text().split('\n')[2] company = company.get_text() tag = tag.get_text().replace('\n', '-') welfare = welfare.get_text() industry = industry.get_text().replace('\n', '').replace(' ', '') # print(position, type(position)) # print(city, type(city)) # print(area, type(area)) # print(release_time, type(release_time)) # print(money, type(money)) # print(need, type(need)) # print(company, type(company)) # print(tag, type(tag)) # print(welfare, type(welfare)) # print(industry, type(industry)) # print('*'*120) db.insert_position(position, city, area, release_time, money, need, company, tag, welfare, industry)