Пример #1
0
 def get_build_info(self, build_info_list, co_id, comm_html, url):
     """Insert one ``Building`` record per row of ``build_info_list``.

     For every row the name, total unit count, area and price are read from
     positions 1, 3, 5 and 9. The building number is the part of the name
     before the first ``#``. The detail link and building id are extracted
     from ``comm_html`` with regular expressions, the record is inserted,
     and ``self.get_build_detail`` is called with the link.

     Args:
         build_info_list: Iterable of indexable rows (at least 10 items).
         co_id: Community id stored on each record and forwarded to
             ``get_build_detail``.
         comm_html: Community page markup searched for the building link.
         url: Only used in the error message.

     Any ``Exception`` raised while handling a row is printed and the loop
     moves on to the next row.
     """
     for row in build_info_list:
         try:
             record = Building(2)
             name = row[1]  # building name
             number = name.split('#')[0]  # building number
             total_units = row[3]  # total number of units
             area = row[5]  # area
             price = row[9]  # price
             # Copy the parsed fields onto the record.
             record.bu_name = name
             record.bu_num = number
             record.bu_all_house = total_units
             record.bu_build_size = area
             record.bu_price = price
             record.co_id = co_id  # community id
             # NOTE(review): the three searches below depend only on
             # comm_html, so they yield the same link/id for every row --
             # confirm whether a per-row lookup was intended.
             section = re.search(r'楼盘表(.*?)个楼栋信息', comm_html).group(1)
             link = re.search(r'<ahref="(.*?)">查看信息<', section).group(1)
             record.bu_id = re.search('buildingId=(.*?)$', link).group(1)  # building id
             record.insert_db()
             self.get_build_detail(link, co_id)
         except Exception as err:
             # NOTE(review): co_index is not defined in this method;
             # presumably a module-level name -- confirm it exists.
             print('楼栋错误,co_index={},url={}'.format(co_index, url), err)
Пример #2
0
    def get_build_info(self, url, co_id):
        """Fetch a building page, store the building, then store its houses.

        The page at ``url`` is downloaded and parsed. Building fields are read
        from elements with fixed ids and written to a ``Building`` record.
        House cells are then pulled from the markup that follows the first
        ``<tr id="row3">`` and one ``House`` record is inserted per house
        name found in a cell.

        Args:
            url: Building page URL. The building id is the run of digits
                after a ``?`` at the very end of the URL.
            co_id: Community id copied onto the building and every house.

        Returns:
            None. Failures are printed rather than raised: a failure on one
            house skips only that house, while any other ``Exception`` aborts
            the rest of the page. ``KeyboardInterrupt`` and ``SystemExit``
            are deliberately not caught so the crawler can be stopped.
        """
        try:
            building = Building(co_index)
            response = requests.get(url)
            html = response.text
            tree = etree.HTML(html)
            co_name = tree.xpath(
                '//*[@id="PageB_Location"]/text()')[0]  # community name
            print(co_name)
            bu_name = tree.xpath(
                '//*[@id="ItemName"]/text()')[0]  # building name
            bu_num = tree.xpath(
                '//*[@id="PageB_HouseNo"]/text()')[0]  # building number
            bu_all_house = tree.xpath(
                '//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
            # is_none is defined elsewhere; presumably it normalises an
            # empty xpath result -- confirm against its definition.
            bu_floor = self.is_none(
                tree.xpath('//*[@id="cell3-1"]/text()'))  # floor
            bu_build_size = tree.xpath(
                '//*[@id="lb_countbulidarea"]/text()')[0]  # building area
            bu_live_size = tree.xpath(
                '//*[@id="lb_buildarea"]/text()')[0]  # residential area
            bu_price = self.is_none(
                tree.xpath('//*[@id="lb_buildavg"]/text()'))  # price
            # Raw string: '\?' and '\d' are regex escapes, not str escapes.
            bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id
            building.co_id = co_id
            building.bu_name = bu_name
            building.bu_num = bu_num
            building.bu_all_house = bu_all_house
            building.bu_floor = bu_floor
            building.bu_build_size = bu_build_size
            building.bu_live_size = bu_live_size
            building.bu_price = bu_price
            building.bu_id = bu_id
            building.insert_db()
            # Everything from the first row3 marker to the end of the page.
            house_info_html = re.findall('<tr id="row3">(.*)$', html,
                                         re.S | re.M)[0]
            for cell in re.findall('(<td.*?>.*?</td>)', house_info_html,
                                   re.S | re.M):
                # Only cells containing <br>-separated fields are processed.
                if '<br>' not in cell:
                    continue
                ho_name_list = re.findall('<td.*?>(.*?)<br>', cell,
                                          re.S | re.M)
                ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>',
                                               cell, re.S | re.M)
                ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell,
                                     re.S | re.M)[0]
                for idx, raw_name in enumerate(ho_name_list):
                    try:
                        # Drop an opening <font ...> tag wrapped around the
                        # house name, if present.
                        if 'font' in raw_name:
                            ho_name = re.sub('<font.*?>', '', raw_name)
                        else:
                            ho_name = raw_name
                        house = House(8)
                        house.ho_name = ho_name
                        house.ho_true_size = ho_true_size_list[idx]
                        house.co_id = co_id
                        house.bu_id = bu_id
                        house.ho_type = ho_type
                        house.insert_db()
                    except Exception as e:
                        print(e)
        except Exception as e:
            print(e)
Пример #3
0
    def get_comm_detail(self, href, comm):
        """Fill ``comm`` from its detail page and insert its buildings.

        The page at ``self.URL_FRONT + href`` is fetched and decoded as GBK.
        Community fields are extracted with ``self.regex_common`` and written
        onto ``comm``. For every building link found on the page a
        ``Building`` record is populated (partly from the page, partly from
        ``self.get_build_detail``) and inserted. Finally ``comm`` is inserted.

        Args:
            href: Path appended to ``self.URL_FRONT`` to form the page URL.
            comm: Community record to populate; must support attribute
                assignment and ``insert_db()``.

        Returns:
            None. If the page contains no building links the method returns
            early and ``comm.insert_db()`` is NOT called. A failure while
            handling one building is printed and the loop continues.

        Raises:
            Errors from the request, the GBK decode, or parsing the id out of
            the final URL are not caught here.
        """
        comm_detail_url = self.URL_FRONT + href
        response = requests.get(url=comm_detail_url, headers=self.headers)
        # The community id is the text after the first '=' in the final URL.
        co_id = response.url
        co_id = int(co_id.split('=')[1])  # community id
        html = response.content.decode('gbk')

        co_name = self.regex_common(r'项目名称.*?<td.*?>(.*?)</td>', html)  # community name
        co_owner = self.regex_common(r'房屋所有权证号.*?<td.*?>(.*?)</td>', html)
        co_use = self.regex_common(r'用  途.*?<td.*?>(.*?)</td>', html)
        co_develops = self.regex_common(r'开 发 商.*?<td.*?>(.*?)</td>', html)
        co_address = self.regex_common(r'项目位置.*?<td.*?>(.*?)</td>', html)
        co_pre_sale = self.regex_common(r'预售证号.*?<td.*?>(.*?)</td>', html)
        co_land_use = self.regex_common(r'土地使用权证.*?<td.*?>(.*?)</td>', html)
        co_land_type = self.regex_common(r'土地权证类型.*?<td.*?>(.*?)</td>', html)
        co_handed_time = self.regex_common(r'终止日期.*?<td.*?>(.*?)</td>', html)
        co_plan_pro = self.regex_common(r'规划许可证.*?<td.*?>(.*?)</td>', html)
        co_work_pro = self.regex_common(r'施工许可证.*?<td.*?>(.*?)</td>', html)
        co_type = self.regex_common(r'项目类型.*?<td.*?>(.*?)</td>', html)  # community type
        co_size = self.regex_common(r'批准面积.*?<td.*?>(.*?)</td>', html)  # approved area
        comm.co_id = co_id
        comm.co_name = co_name
        comm.co_type = co_type
        comm.co_size = co_size
        comm.co_owner = co_owner
        comm.co_use = co_use
        comm.co_develops = co_develops
        comm.co_address = co_address
        comm.co_pre_sale = co_pre_sale
        comm.co_land_use = co_land_use
        comm.co_land_type = co_land_type
        comm.co_handed_time = co_handed_time
        comm.co_plan_pro = co_plan_pro
        comm.co_work_pro = co_work_pro
        # Collect the building links on the page.
        build_url_list = re.findall(r"<td><a href='(.*?)'", html, re.M | re.S)
        if not build_url_list:
            return
        for build_url in build_url_list:
            try:
                building = Building(self.CO_INDEX)
                # NOTE(review): the next three values are parsed from the
                # page-level html and do not depend on build_url, so every
                # building in this loop gets the same id, unit count and
                # price -- confirm whether per-row parsing was intended.
                build_id = re.search(r'<td>(\d{2,6})</td>', html,
                                     re.M | re.S).group(1)  # building id
                bu_all_house = re.search(r'<td>(\d{1,3})</td>', html,
                                         re.M | re.S).group(1)  # total units
                # Fifth numeric cell on the page; keep its leading digits.
                bu_price_demo = re.findall(r'<td>[\.\d]+</td>', html,
                                           re.M | re.S)[4]
                bu_price = re.search(r'\d+', bu_price_demo).group()
                data_dict = self.get_build_detail(build_url)
                bu_num = data_dict['bu_num']  # building number
                bu_build_size = data_dict['bu_build_size']  # building area
                co_address = data_dict['co_address']  # community address
                co_build_end_time = data_dict['co_build_end_time']  # completion time
                co_build_type = data_dict['co_build_type']  # build type
                if not co_build_end_time:
                    building.co_is_build = '1'
                # Community fields are overwritten by each building's detail.
                comm.co_address = co_address
                comm.co_build_end_time = co_build_end_time
                comm.bu_build_size = bu_build_size
                comm.co_build_type = co_build_type
                # Building record
                building.bu_num = bu_num
                building.bu_build_size = bu_build_size
                building.bu_all_house = bu_all_house
                building.bu_id = build_id
                building.co_id = co_id
                building.bu_price = bu_price
                # Insert
                building.insert_db()
            except Exception as e:
                build_detail_url = self.URL_FRONT + build_url
                # Report the cause as well as the URL so failures can be
                # diagnosed.
                print('楼栋错误:', build_detail_url, e)
        comm.insert_db()