def input_city():
    """Prompt for a city, search mafengwo.cn for it and run the chosen query.

    Reads a city name from stdin, downloads the mafengwo search page for it
    and collects the ``<a>`` links inside the first ``div.lst-nub`` element.
    The destination id is the second-to-last ``/``-separated segment of
    link 1.  The user then picks a category from a numeric menu:

    * 1 -> ``get_json_page(<destination id>)``
    * 2 -> ``get_allcity(<href of link 2>)``
    * 3, 4 and anything else -> nothing happens (not implemented).

    A search page without the expected links, or a non-numeric menu choice,
    is reported with a message and the function returns instead of raising.
    """
    print("输入要查询的城市:")
    city = input()
    query = {
        "q": city,
        "t": "mdd",
        "seid": "F2D12C42-3811-4E9F-AAE7-121E9F9FE148",
    }
    url = "http://www.mafengwo.cn/search/s.php?" + urlencode(query)
    resp = req.get_html_content(url)
    bs = BeautifulSoup(resp, "lxml")

    nav_blocks = bs.find_all("div", attrs={"class": "lst-nub"})
    links = nav_blocks[0].find_all("a") if nav_blocks else []
    # Only links 1 and 2 are used below, so three entries are enough.
    if len(links) < 3:
        print("未找到该城市的相关信息:", city)
        return

    mdd_id = str(links[1]['href']).split("/")[-2]
    hotel_url = links[2]['href']

    print("选择要查询的具体内容:1、所有景点 2、酒店 3、机场+酒店 4、当地游")
    try:
        choice = int(input())
    except ValueError:
        print("输入无效,请输入 1-4 之间的数字")
        return

    if choice == 1:
        get_json_page(mdd_id)
    elif choice == 2:
        get_allcity(hotel_url)
    # Menu entries 3 and 4 have no implementation yet; they fall through.
def get_province_school(href):
    """Parse every page linked from the directory table at ``href``.

    Downloads ``href``, takes the ``<td>`` cells of the first
    ``div#dir_content_main`` element, skips the first cell, and hands the
    ``href`` of the first ``<a>`` in each remaining cell to ``parse_html``.
    """
    page = BeautifulSoup(req.get_html_content(href), "lxml")
    container = page.find_all("div", attrs={"id": "dir_content_main"})[0]
    cells = container.find_all("td")
    for cell in cells[1:]:
        parse_html(cell.find('a')['href'])
def get_page(href):
    """Crawl every result page of a paginated listing through a worker pool.

    Downloads ``href`` and reads the page count from the "尾页" link, which
    must be the last ``<a>`` inside the first ``div.pagination``.  It then
    schedules ``get_province_school(href + "&pn=<i>")`` for every ``i`` from
    1 to that count on an ``mp.Pool`` of 15 workers and blocks until all
    scheduled tasks have finished.

    A task that raises is reported with a printed message; without that the
    exception would be discarded because the async results are never fetched.
    """
    resp = req.get_html_content(href)
    bs = BeautifulSoup(resp, "lxml")
    last_link = bs.find_all("div", class_="pagination")[0].find_all('a')[-1]
    # Serialise without entity substitution: the default formatter writes
    # "&" inside attribute values as "&amp;", which the literal "&pn=" in
    # the pattern below would never match.
    last_link_html = last_link.decode(formatter=None)
    page = re.search(r"&pn=([0-9]*)\">尾页</a>", last_link_html).group(1)
    page_count = int(page)

    pool = mp.Pool(15)
    try:
        for i in range(1, page_count + 1):
            link = href + "&pn={}".format(i)
            pool.apply_async(
                get_province_school,
                args=(link, ),
                # Bind `link` as a default so each report names its own page
                # rather than whatever the loop variable holds later.
                error_callback=lambda exc, link=link: print("抓取失败:", link, exc),
            )
    finally:
        # Always stop accepting work and wait, even if scheduling failed.
        pool.close()
        pool.join()
def parse_html(href):
    """Scrape the cards on the page at ``href`` and insert them into ``gz_db``.

    For every ``div.card_title`` element this extracts:

    * ``school``  - text of the first ``<a>``, stripped, with "吧" removed;
    * ``guanzhu`` - content of ``<span class="card_menNum">`` (printed as 关注);
    * ``tiezi``   - content of ``<span class="card_infoNum">`` (printed as 帖子).

    Each card is printed and inserted as one row, committed individually so
    rows stored before a later failure are kept.  A single connection from
    ``get_connect`` serves the whole page and is closed on every exit path.
    """
    resp = req.get_html_content(href)
    bs = BeautifulSoup(resp, "lxml")
    cards = bs.find_all("div", attrs={"class": "card_title"})
    if not cards:
        return  # nothing to store, so no connection is needed

    sql = "insert into gz_db(school,guanzhu,tieshu) values(%s,%s,%s)"
    conn = get_connect()
    try:
        for card in cards:
            card_html = str(card)
            school = str(card.find("a").get_text()).strip().replace("吧", "")
            guanzhu = re.search(r'<span class="card_menNum">(.*)</span>', card_html).group(1)
            tiezi = re.search(r'<span class="card_infoNum">(.*)</span>', card_html).group(1)
            print(school, "关注:", guanzhu, "帖子:", tiezi)
            with conn.cursor() as cursor:
                cursor.execute(sql, (school, guanzhu, tiezi))
            conn.commit()
    finally:
        # The connection was previously never closed, leaking one per call.
        conn.close()
def get_province_name(href):
    """Return the heading text used as the name for the page at ``href``.

    The value is the text of the first ``<span>`` inside the second ``<h2>``
    of the downloaded document.
    """
    soup = BeautifulSoup(req.get_html_content(href), "lxml")
    headings = soup.find_all("h2")
    name_span = headings[1].find("span")
    return name_span.text
def get_province_allschool(province_school):
    """Download the tieba.baidu.com directory page for ``province_school``.

    Requests ``/f/fdir`` with ``fd="高等院校"``, ``ie="utf-8"`` and
    ``sd=province_school`` and returns whatever ``req.get_html_content``
    returns for that URL.
    """
    query = urlencode({"fd": "高等院校", "ie": "utf-8", "sd": province_school})
    return req.get_html_content("http://tieba.baidu.com/f/fdir?" + query)