def run(): cities = ["hz", "cd", "bj", "sh", "hui", "nb"] maxpage = [100, 100, 21, 100, 39, 35] for city, max_pageid in zip(cities, maxpage): house_data = pd.DataFrame(columns=("name", "price", "year", "x", "y", "house_type_num", "house_structure_area")) csv_path = "./data/newhouse_lianjia_new_" + city + ".csv" head_url = "https://" + city + ".fang.lianjia.com" for page_id in range(1, max_pageid + 1): url = head_url + "/loupan/nhs1pg" + str(page_id) try: html = get_html(url) hrefs = get_hrefs(html, url_head="https://hz.fang.lianjia.com") except Exception: print("-------------------big error-------------------") print(url) time.sleep(60) continue for href in hrefs: try: house_html = get_html(href) name = get_name(house_html) price = get_price(house_html) xy = get_xy(house_html) structure = get_structure( house_html, url_head="https://hz.fang.lianjia.com") year = get_year(house_html) data_row = { "name": name, "price": price, "year": year, "x": xy[0], "y": xy[1], "house_type_num": len(structure), "house_structure_area": structure } except Exception: print("----------------error----------------") print(href) time.sleep(10) continue print(data_row) house_data = house_data.append(data_row, ignore_index=True) house_data.to_csv(csv_path) print( city + " " + str(page_id) + ":------------------------------saved--------------------------------------" ) print( city + ":-----------------------------finished-------------------------------" ) print( "-----------------------------finished-------------------------------")
def run(): cities = ["hui", "nb"] maxpage = [100, 100] # cities = ["hz", "cd", "bj", "sh", "hui", "nb"] # maxpage = [100, 100, 100, 100, 100, 100] for city, maxpagei in zip(cities, maxpage): house_data = pd.DataFrame(columns=("name", "month_price", "house_structure", "area", "x", "y")) csv_path = "./data/rent_lianjia_" + city + ".csv" for page_id in range(1, maxpagei): url = "https://" + city + ".lianjia.com/zufang/pg" + str(page_id) try: html = get_html(url) hrefs = get_hrefs(html) except Exception: print("-------------------big error-------------------") print(url) time.sleep(60) continue for href in hrefs: try: house_html = get_html(href) name = get_name(house_html) price = get_price(house_html) xy = get_xy(house_html) structure, area = get_info(house_html) data_row = { "name": name, "month_price": price, "house_structure": structure, "area": area, "x": xy[0], "y": xy[1] } except Exception: print("----------------error----------------") print(href) time.sleep(10) continue house_data = house_data.append(data_row, ignore_index=True) print(data_row) house_data.to_csv(csv_path) print( city + " " + str(page_id) + ":------------------------------saved--------------------------------------" ) print( city + ":-----------------------------finished-------------------------------" ) print( "-----------------------------finished-------------------------------")
def get_structure(house_html, url_head):
    """Scrape the floor-plan (huxing) page linked from a new-house page.

    Follows the "h2-flow" link on the listing page, then parses each
    "huxing-item" into {"structure", "area", "total_price"}.

    Returns a list of those dicts; on any failure, returns whatever was
    collected so far (possibly an empty list).
    """
    structures = []
    soup = BeautifulSoup(house_html)
    try:
        href = url_head + soup.find(attrs={
            "class": "h2-flow"
        }).find("a").attrs["href"]
        structure_html = get_html(href)
        structure_soup = BeautifulSoup(structure_html)
        list_soup = structure_soup.find(attrs={
            "class": "main-wrap huxingtu"
        }).find_all(attrs={"class": "huxing-item"})
        for li in list_soup:
            info = li.find(attrs={"class": "info clear"})
            lis = info.find("ul").find_all("li")
            # First <li> is e.g. "户型:3室2厅"; keep the part after the colon.
            structure = lis[0].text.split(":")[-1].strip()
            # Raw string: regex deliberately allows a decimal point.
            area = re.search(r"(([0-9]|\.)+)", lis[1].text).group()
            total_price = li.find(attrs={"class": "price"}).find("i").text
            data_row = {
                "structure": structure,
                # BUG FIX: int("89.5") raises ValueError, which the outer
                # except silently swallowed, truncating the result list.
                # Parse as float first, keep the historical int type.
                "area": int(float(area)),
                "total_price": int(total_price)
            }
            structures.append(data_row)
        return structures
    except Exception:
        # Best-effort: return whatever was parsed before the failure.
        return structures
def get_info_from_house_html(href):
    """Fetch one rental listing page and parse it into a data-row dict.

    Tries the redirect-resolved URL first and falls back to fetching the
    raw href. Returns the row dict, or None when both attempts fail.
    """

    def _parse(house_html):
        # Extract all fields from one fetched page (shared by both attempts;
        # the original duplicated this sequence verbatim in each try arm).
        soup = BeautifulSoup(house_html)
        name = get_name(soup)
        price = get_price(soup)
        structure = get_structure(soup)
        area = get_area(soup)
        # NOTE: get_xy works on the raw html string, not the soup.
        xy = get_xy(house_html)
        return {
            "name": name,
            "month_price": price,
            "house_structure": structure,
            "area": area,
            "x": xy[0],
            "y": xy[1]
        }

    try:
        try:
            return _parse(get_redirect_html(href))
        except Exception:
            # Redirect resolution failed; try the plain URL.
            return _parse(get_html(href))
    except Exception:
        # Caller treats None as "skip this listing".
        return None
def get_xy(html_house):
    """Extract map coordinates for a listing from its embedded Baidu-map iframe.

    Fetches the iframe's map page and pulls the ``_vars.cityx`` /
    ``_vars.cityy`` values out of its inline script with regexes.
    """
    soup_house_html = BeautifulSoup(html_house)
    # The map-page URL is stored protocol-relative in the iframe's data-src.
    href_map = 'https:' + soup_house_html.find(id="iframeBaiduMap").attrs['data-src']
    html_map = get_html(href_map)
    # Matches e.g.: _vars.cityx = "120.12";_vars.cityy = "30.34"
    pattern = '_vars.cityx = \"([0-9]|\.)+\";_vars.cityy = \"([0-9]|\.)+\"'
    result = re.search(pattern, html_map).group()
    pattern_num = '(([0-9]|\.)+)'
    # findall on the matched snippet also hits the bare "." inside each
    # "_vars.cityx"/"_vars.cityy" token, so the matches come out as
    # ['.', <cityx>, '.', <cityy>] — index 1 is cityx, index 3 is cityy.
    xy = re.findall(pattern_num, result)
    # NOTE(review): this returns [cityy, cityx]; callers store element 0 as
    # "x" and element 1 as "y". Presumably cityy is the intended "x" here —
    # confirm the axis order against downstream consumers of the CSV.
    return [float(xy[3][0]), float(xy[1][0])]
def run(): cities = ["hz", "cd", "bj", "sh", "huizhou", "nb"] page_nums = [100, 100, 100, 100, 100, 100] for ind, city in enumerate(cities): house_data = pd.DataFrame(columns=("name", "month_price", "house_structure", "area", "x", "y")) csv_path = "./data1207/rent_fang_" + city + ".csv" if os.path.exists(csv_path): house_data = pd.read_csv(csv_path, index_col=0) url_head = "https://" + city + ".zu.fang.com" if city == "bj": url_head = "https://zu.fang.com/" for page_id in range(1, page_nums[ind] + 1): url = url_head + "/house/i3" + str(page_id) + "/" try: html = get_redirect_html(url) hrefs = get_hrefs(html, url_head) except Exception: try: html = get_html(url) hrefs = get_hrefs(html, url_head) except Exception: print( "--------------------------------big error---------------------------" ) print(url) time.sleep(60) continue thread_list = [] for href in hrefs: thread_list += [ YhcThread(get_info_from_house_html, args=(href, )) ] for t in thread_list: t.start() for t in thread_list: t.join(10) result = t.get_result() if result is not None: print(result) house_data = house_data.append(result, ignore_index=True) else: print("error....") print(t.get_args()) house_data.to_csv(csv_path) print(city + " " + str(page_id) + ":----saved to " + csv_path + "--------------") print( city + "----------------------------finished---------------------------------" ) print( "---------------------------------all finished-----------------------------------------" )
def get_data_row(href):
    """Fetch one second-hand-house page and parse it into a result dict.

    When the href carries no query string yet, the anti-crawler redirect is
    resolved first. Returns the row dict, or None on any failure.
    """
    try:
        if "?" not in href:
            href = get_redirect_url(href)
        page = get_html(href)
        name = get_name(page)
        price = get_price(page)
        structure, area, ave_price = get_info(page)
        coords = get_xy(page)
        type_str, build_year = get_type_buildyear(page)
        return {
            "name": name,
            "type": type_str,
            "build_year": build_year,
            "year": None,
            "total_price": price,
            "average_price": ave_price,
            "house_structure": structure,
            "area": area,
            "x": coords[0],
            "y": coords[1],
        }
    except Exception:
        # Caller treats None as "skip this listing".
        return None
def run(): cities = ["hz", "cd", "bj", "sh", "huizhou", "nb"] for city in cities: house_data = pd.DataFrame(columns=( "name", "type", "build_year", "year", "total_price", "average_price", "house_structure", "area", "x", "y")) csv_path = "./data1207/second_hard_house_fang_" + city + ".csv" if os.path.exists(csv_path): house_data = pd.read_csv(csv_path, index_col=0) url_head = "https://" + city + ".esf.fang.com" for page_id in range(1, 101): url = "https://" + city + ".esf.fang.com/house/i3" + str(page_id) + "/" try: url = get_redirect_url(url) html = get_html(url) hrefs = get_hrefs(html, url_head) except Exception: print("-------------------big error-------------------") print(url) time.sleep(60) continue thread_list = [] for href in hrefs: thread_list += [YhcThread(get_data_row, args=(href,))] for t in thread_list: t.start() for t in thread_list: t.join(10) result = t.get_result() if result is not None: print(result) house_data = house_data.append(result, ignore_index=True) else: print("error....") print(t.get_args()) house_data.to_csv(csv_path) print(city + " " + str(page_id) + ":---------saved to " + csv_path + "---------------------") print(city + ":--------------------finished----------------------") print("---------------------all finished-------------------------")
def get_redirect_url(url):
    """Resolve the anti-crawler redirect for a Fang.com URL.

    Fetches the page, captures the token from the inline ``t3='...'``
    assignment, and returns the original URL with the token appended as a
    query string.
    """
    page = get_html(url)
    token = re.search(r"t3='(.*?)'", page).group(1)
    return "{}?{}".format(url, token)
def get_redirect_html(href):
    """Fetch a page's HTML via its redirect-resolved (tokenized) URL."""
    return get_html(get_redirect_url(href))