def do_fetch():
    """Synchronise the ``inf_city`` table with the fang.com city index page.

    Downloads the index page, takes the ``#c02`` element, removes HTML
    comments from it and walks every ``<a>`` inside. For each link whose
    city code can be derived and whose landing page title contains the link
    text:

    * if no live row (``status<>-1``) exists for that city code on website
      '房天下', a new row is inserted;
    * otherwise ``city_name`` and/or ``url`` are updated when they differ
      from the stored values, and a timestamped change line is prepended to
      ``status_record``.

    Links that cannot be resolved, fetched, or whose title does not contain
    the city name are skipped. Returns ``None``.
    """
    index_path = "data/temp/index.html"
    city_page_path = "data/temp/temp.html"
    stamp_format = '%Y-%m-%d %H:%M:%S'

    fc.fetch_page("http://www.fang.com/SoufunFamily.htm", index_path)
    index_html = str(fc.read_page(index_path, "GBK")('#c02').html())
    # Remove HTML comments (including ones spanning several lines) before
    # the fragment is parsed again.
    index_html = re.sub(r'<!--[^>]((?:.|\n)*?)-->', '', index_html)

    for anchor in pq(index_html)('a').items():
        city_name = anchor.text()
        href = str(anchor.attr('href'))

        city_code = fc.get_city_code(href)
        if city_code is False:
            continue
        if not fc.fetch_page(href, city_page_path):
            continue

        page_title = fc.read_page(city_page_path, "GBK")('title').text()
        if city_name not in page_title:
            logging.warning(
                f"city_name:{city_name},city_code:{city_code},webtitle != city_name"
            )
            continue

        existing = conn.get_one(
            f"select * from inf_city where city_code = '{city_code}' and website = '房天下'"
            f" and status<>-1"
        )

        if existing is None:
            # Unknown city: create a fresh row.
            conn.mysql(
                f"insert into inf_city (city_code, city_name, website, webtitle, url, insert_time)"
                f" values ('{city_code}','{city_name}', '房天下', '{page_title}',"
                f" '{href}', '{time.strftime(stamp_format)}') "
            )
            continue

        # Row fields are addressed by position: 0 = id, 2 = city_name,
        # 5 = url, 9 = status_record (as used by the statements below).
        row_id = existing[0]
        known_name = existing[2]
        known_url = existing[5]
        history = existing[9]

        if city_name != known_name:
            history = (
                f"{time.strftime(stamp_format)}:city_name [{known_name}->{city_name}]\n{history}"
            )
            conn.mysql(
                f"update inf_city set city_name='{city_name}'"
                f",update_time='{time.strftime(stamp_format)}'"
                f",status_record='{history}' where id = '{row_id}'"
            )
            logging.warning(f"ID:{row_id}:city_name [{known_name}->{city_name}]")

        if href != known_url:
            # `history` already carries the name-change line when both differ.
            history = f"{time.strftime(stamp_format)}:url [{known_url}->{href}]\n{history}"
            conn.mysql(
                f"update inf_city set url='{href}'"
                f",update_time='{time.strftime(stamp_format)}'"
                f",status_record='{history}' where id = '{row_id}'"
            )
            logging.warning(f"ID:{row_id}:url [{known_url}->{href}]")
def do_fetch_housing(url, city_code, city_name):
    """Download every community-index page of one city.

    Args:
        url: Base URL of the city's community listing.
        city_code: City code used in the ``inf_city`` filter and in the
            local file layout ``data/city/<city_code>/index/``.
        city_name: Name that must appear in the listing page's ``<title>``.

    Returns:
        ``True`` when all index pages were fetched (``inf_city.status`` is
        then set to 1). ``False`` when the first page cannot be fetched, the
        title does not contain ``city_name`` (status set to 8), the pager
        element ``#houselist_B14_01`` is missing (status set to 9), the page
        count cannot be parsed, or any paginated fetch fails.
    """
    temp_path = 'data/temp/temp.html'

    # Fetch the first listing page and sanity-check it.
    if not fc.fetch_page(url, temp_path):
        return False
    page = fc.read_page(temp_path, "GBK")
    page_title = page('title').text()
    if page_title.find(city_name) == -1:
        logging.warning(
            f"city_name:{city_name},city_code:{city_code},webtitle != city_name"
        )
        # Restrict the update to this website, like the other status updates
        # in this function; previously rows of every website sharing the
        # city_code were flagged.
        conn.mysql(
            f"update inf_city set status=8 where city_code='{city_code}' and website='房天下'"
        )
        return False

    # The pager element carries the highest page number; without it the
    # city has no communities listed.
    page_split = page('#houselist_B14_01')
    pager_html = page_split.html()
    if pager_html is None or pager_html == 'None':
        logging.warning(f'没有找到小区信息: {url}')
        conn.mysql(
            f"update inf_city set status = 9 where city_code = '{city_code}' and website='房天下'"
        )
        return False

    page_split = page_split('.txt').text()
    try:
        # The pager text is stripped of the characters '共' and '页' and the
        # remainder is read as the page count.
        page_split_max = int(page_split.strip('共页'))
    except ValueError:
        # An empty or unexpected pager text is reported as a failure rather
        # than raised, matching how this function reports other failures.
        logging.warning(f'无法解析小区分页数: {page_split!r}, url={url}')
        return False

    # Fetch each paginated index page into the city's index directory.
    for num in range(1, page_split_max + 1):
        split_url = url + '__0_0_0_0_' + str(num) + '_0_0_0/'
        file_path = f'data/city/{city_code}/index/{city_code}_{str(num)}.html'
        if not fc.fetch_page(split_url, file_path):
            return False

    conn.mysql(
        f"update inf_city set status = 1 where city_code = '{city_code}' and website='房天下'"
    )
    return True
def get_detail_url(city_code, community_code):
    """Extract the detail-page URL from a saved community page.

    The page at ``data/city/<city_code>/community/<community_code>.html`` is
    read as GBK, falling back to GB2312 if that raises.

    Returns:
        * the ``href`` found under ``#xqwxqy_C01_17`` or, failing that,
          under ``div[class='floatr']``;
        * the string ``"no_detail"`` when neither link exists but the
          ``#esf_fangyuanlist`` link text is "查看全部房源";
        * ``False`` when the page cannot be read with either encoding or the
          layout is not one of the above (a warning is logged in both cases).
    """
    community_page_path = f'data/city/{city_code}/community/{community_code}.html'

    page = None
    read_error = None
    for encoding in ("GBK", "GB2312"):
        try:
            page = fc.read_page(community_page_path, encoding)
        except Exception as exc:
            read_error = exc
        else:
            break
    else:
        # Both encodings failed; report the error from the last attempt.
        logging.warning(f"读取小区页面编码异常,message={read_error}")
        return False

    detail_url = page('#xqwxqy_C01_17')('div>span>a').attr('href')
    if detail_url is not None:
        return detail_url

    # Second known page layout.
    detail_url = page.find("div[class='floatr']")('a').attr('href')
    if detail_url is not None:
        return detail_url

    if page('#esf_fangyuanlist')('div>span>a').text() == "查看全部房源":
        return "no_detail"

    logging.warning(f"检测到新的小区页面,小区ID={community_code}")
    return False
def do_main():
    """Run the matching parser over every community with ``status=1``.

    For each row returned from ``inf_community`` the saved page
    ``data/city/<city_code>/community/<community_code>.html`` is read;
    unreadable pages (``fc.read_page`` returning ``False``) are skipped.
    A warning is logged only when no parser was selected for the page.
    """
    community_list = conn.get_all("select * from inf_community where status=1")
    for community in community_list:
        # Row fields are addressed by position: 1 = city_code,
        # 2 = community_code.
        city_code = community[1]
        community_code = community[2]
        page_path = f'data/city/{city_code}/community/{community_code}.html'

        page = fc.read_page(page_path)
        if page is False:
            continue

        # NOTE(review): both selectors below test the same `page == ''`
        # condition, which looks like an unfinished placeholder for real
        # layout detection — confirm the intended checks.
        recognised = False
        if page == '':
            parser_one_main(page)
            recognised = True
        if page == '':
            parser_two_main(page)
            recognised = True

        if not recognised:
            # Previously logged for every page, even after a parser ran.
            logging.warning("未识别页面,无法选择解析器.")
def do_parser():
    """Parse saved community-index pages into the ``inf_community`` table.

    For every city with ``status = 1`` on website '房天下', each file in
    ``data/city/<city_code>/index`` is read, its ``.houseList`` section is
    extracted, and every ``div[class='list rel']`` entry is turned into a
    community record.

    Duplicate handling per entry (looked up by community code, newest
    ``fetch_time`` first):

    * no existing row -> insert;
    * same URL, fetched in the current month -> update in place, no insert;
    * same URL, fetched in another month -> insert a new row;
    * different URL -> append ``'T'`` to the code and look up again
      (at most 100 lookups), then insert under the final code.

    After all index files of a city are processed its ``inf_city.status`` is
    set to 2. Returns ``None``.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'

    city_rows = conn.get_all("select city_code from inf_city where status = 1 and website='房天下'")
    for city_row in city_rows:
        city_code = city_row[0]

        # Every paginated index file saved for this city.
        for file_name in fc.get_all_files(f'data/city/{city_code}/index'):
            file_path = f'data/city/{city_code}/index/{file_name}'
            file_full_path = Path(__file__).parent.parent.joinpath(file_path)
            if os.path.isdir(file_full_path):
                continue

            logging.info(f"读取文件:{file_name}")
            # Keep only the main listing section of the page.
            listing_html = fc.read_page(file_path, "GBK")('.houseList').html()
            if listing_html is None or listing_html == "":
                logging.warning(f"{file_name}页面没有主体部分")
                continue

            # Some pages have an empty or malformed main section (the
            # original comment cites sh_76); the site layout changes often,
            # so parsing failures are logged and the file is skipped.
            try:
                listing = pq(listing_html)
            except Exception as e:
                logging.warning(f"读取主体部分失败, message={e}")
                continue

            for community in listing.find("div[class='list rel']").items():
                community_img_url = community('dl>dt>a>img').attr('src')

                title_links = community('dl>dd>p>a')
                community_url = title_links.attr('href')
                community_code = fc.get_city_code(title_links.eq(0).attr('href'))
                if community_code is False:
                    continue
                community_name = title_links.eq(0).text()

                badges = community('dl>dd>p>span')
                community_type = badges.eq(0).text()
                level_html = badges.eq(1).html()
                if level_html is None:
                    # Rendered unquoted into the SQL below, i.e. as NULL.
                    community_level = 'Null'
                else:
                    # One point per '<i/>' and half a point per half marker.
                    community_level = (
                        level_html.count('<i/>')
                        + 0.5 * level_html.count('<i class="half"/>')
                    )

                community_location = fc.remove_html(
                    community('dl>dd>p').eq(1).html(), 1
                ).strip(' ')

                stats = community('dl>dd>ul>li')
                community_selling = (
                    fc.remove_html(stats.eq(0).html())
                    .replace('套在售', '')
                    .replace('|', '')
                )
                community_leasing = (
                    fc.remove_html(stats.eq(1).html())
                    .replace('套在租', '')
                    .replace('|', '')
                )
                community_build_time = stats.eq(2).html()
                community_price = fc.remove_html(
                    community.find("p[class='priceAverage']").html()
                )

                updated_in_place = False
                for _ in range(100):
                    existing = conn.get_one(
                        f"select * from inf_community where community_code = '{community_code}'"
                        f" and website = '房天下' and status<>-1 order by fetch_time desc limit 1"
                    )
                    if existing is None:
                        # No duplicate: leave the loop and insert.
                        break

                    # Row fields by position: 0 = id, 3 = community_name,
                    # 5 = url, 6 = fetch_time, 9 = status_record.
                    fetched_month = existing[6].strftime('%Y-%m')
                    current_month = time.strftime('%Y-%m')

                    if existing[5] != community_url:
                        # Same code but different URL: try the code with a
                        # 'T' suffix on the next lookup.
                        community_code = str(community_code) + 'T'
                        continue

                    # Same code and same URL: the same community. Update
                    # when it was fetched this month, otherwise fall through
                    # to the insert below.
                    # TODO: skip the update when nothing changed to reduce
                    # warning noise; if city_code changed, raise an alert
                    # and do not update.
                    if current_month == fetched_month:
                        record = (
                            f"{time.strftime(stamp_format)}:ID:{existing[0]}[{existing[3]}"
                            f"->{community_name}]\n{existing[9]}"
                        )
                        conn.mysql(
                            f"update inf_community set city_code='{city_code}',"
                            f"community_name='{community_name}',"
                            f"url='{community_url}',update_time='{time.strftime(stamp_format)}',"
                            f"status=0,status_record='{record}',img_url='{community_img_url}',"
                            f"type='{community_type}',level={community_level},"
                            f"location='{community_location}',selling={community_selling},"
                            f"leasing={community_leasing},build_time='{community_build_time}',"
                            f"price='{community_price}',status=0"
                            f" where community_code='{community_code}'"
                            f" and website='房天下' and status<>-1"
                        )
                        logging.warning(
                            f"ID:{existing[0]}:community_name [{existing[3]}->{community_name}]"
                        )
                        updated_in_place = True
                    break

                if updated_in_place:
                    continue

                conn.mysql(
                    f"insert into inf_community"
                    f" (city_code,community_code,community_name,website,url,fetch_time,img_url,type,level"
                    f",location,selling,leasing,build_time,price)"
                    f" values"
                    f" ('{city_code}','{community_code}','{community_name}','房天下','{community_url}'"
                    f",'{time.strftime(stamp_format)}','{community_img_url}','{community_type}',"
                    f"{community_level},'{community_location}',{community_selling},"
                    f"{community_leasing},'{community_build_time}','{community_price}')"
                )

        conn.mysql(f"update inf_city set status = 2 where city_code = '{city_code}' and website='房天下'")