Пример #1
0
    def bu_info(self, bu_list, co_id):
        """Scrape each building page in ``bu_list`` and hand its houses on.

        For every relative URL the page is fetched from www.fxfdcw.com,
        decoded as GBK, parsed into a ``Building`` record that is inserted,
        and the ``<span title>`` nodes of the page are passed to
        ``self.ho_info`` together with ``co_id`` and the building id.
        Any failure while fetching/parsing is printed and that entry skipped.
        """
        flags = re.S | re.M

        def first_group(pattern, text):
            # All field patterns share the same flags and capture group 1.
            return re.search(pattern, text, flags).group(1)

        for rel_url in bu_list:
            try:
                page = requests.get('http://www.fxfdcw.com/' + rel_url,
                                    headers=self.headers)
                text = page.content.decode('gbk')
                tree = etree.HTML(text)
                build = Building(co_index)
                build.co_id = co_id
                build.bu_id = re.search(r'bdid=(\d+)', rel_url).group(1)
                build.bu_num = first_group('楼号.*?">(.*?)</', text)
                build.bu_address = first_group('坐落.*?">(.*?)</', text)
                build.bu_floor = first_group('地上层数.*?">(.*?)</', text)
                build.bu_build_size = first_group('建筑面积.*?wrap">(.*?)</', text)
                build.bu_all_house = first_group('套 数.*?">(.*?)</', text)
                build.bu_type = first_group('用  途.*?wrap">(.*?)</', text)
                build.insert_db()

                ho_list = tree.xpath("//span[@title]")
            except Exception as e:
                print("楼栋信息错误{}".format(e))
                continue
            self.ho_info(ho_list, co_id, build.bu_id)
Пример #2
0
 def get_build_info(self, bu_address_list, bu_num_list, bu_floor_list,
                    bu_url_list, co_id):
     """Insert one ``Building`` per entry of ``bu_url_list`` and crawl its houses.

     The address/number/floor lists are indexed in parallel with
     ``bu_url_list``. Each building page is fetched through ``self.s`` after
     a one second pause; the house links found in its ``_table_html_``
     script block are passed to ``self.get_house_info``. Only errors raised
     by ``get_house_info`` are caught and printed.
     """
     base = 'http://183.63.60.194:8808/public/web/'
     flags = re.S | re.M
     for idx, rel_url in enumerate(bu_url_list):
         build = Building(co_index)
         build.bu_address = bu_address_list[idx]
         build.bu_num = bu_num_list[idx]
         build.bu_floor = bu_floor_list[idx]
         build.co_id = co_id
         time.sleep(1)
         page_url = base + rel_url
         response = self.s.get(page_url, headers=self.headers)
         build.bu_id = re.search('ljzid=(.*?)$', rel_url).group(1)
         build.insert_db()
         html = response.text
         house_html = re.search('var _table_html_.*?</script>', html,
                                flags).group()
         house_url_list = re.findall('房屋号:<a.*?href="(.*?)"', house_html,
                                     flags)
         try:
             self.get_house_info(house_url_list, build.bu_id)
         except Exception as e:
             print('房号错误,co_index={},url={}'.format(co_index, page_url), e)
Пример #3
0
 def get_build_info(self, build_url_list):
     """Crawl every building page referenced by ``build_url_list``.

     For each entry the ``xqbm`` code is extracted and the matching
     ``donginfo.aspx`` URL is built. The ``Building`` attributes are set to
     regex rule strings (not values); ``build.to_dict()`` is handed to
     ``ProducerListUrl`` as ``analyzer_rules_dict``. The result of
     ``p.get_details()`` is passed to ``self.get_house_info``
     (presumably a list of house URLs -- confirm against ProducerListUrl).

     Any exception for one entry is printed and the loop moves on.
     """
     for i in build_url_list:
         # Start from the raw entry so the error message below always has a
         # URL for *this* iteration, even when parsing fails before the real
         # page URL is built (previously: NameError or a stale URL).
         build_url = i
         try:
             build = Building(co_index)
             build_code = re.search('xqbm=(.*?)$', i).group(1)
             build_url = 'http://zjjg.0557fdc.com:9555/xiaoqu/donginfo.aspx?xqbm=' + build_code
             # Extraction rules, keyed by Building attribute name.
             build.bu_num = 'Labeldongmc">(.*?)<'
             build.bu_pre_sale = 'Labelyszheng">(.*?)<'
             build.bu_floor = 'Labelsceng">(.*?)<'
             build.bu_address = 'Label1zuoluo">(.*?)<'
             build.bo_build_start_time = 'Label1kaigong">(.*?)<'
             build.co_build_structural = 'Labeljiegou">(.*?)<'
             build.co_id = r'donginfo.aspx\?xqbm=(.*?)"'
             build.bu_id = 'id="DropDownList1".*?value="(.*?)"'
             p = ProducerListUrl(page_url=build_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=build.to_dict(),
                                 current_url_rule=r'location\.href=(.*?)"',
                                 analyzer_type='regex',
                                 headers=self.headers)
             house_url_list = p.get_details()
             self.get_house_info(house_url_list)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
Пример #4
0
 def get_comm_info(self, comm_info_list):
     """Insert a ``Building`` for each HTML row and crawl its houses.

     Each item of ``comm_info_list`` is an HTML fragment from which the
     link, building number, address, pre-sale value and ``slbh`` id are
     taken by regex. No errors are caught here.
     """
     flags = re.S | re.M

     def pick(pattern, fragment):
         return re.search(pattern, fragment, flags).group(1)

     for row in comm_info_list:
         build = Building(co_index)
         house_url = pick('href="(.*?)"', row)
         build.bu_num = pick('<a.*?>(.*?)<', row)
         build.bu_address = pick('<td.*?<td.*?>(.*?)<', row)
         build.bu_pre_sale = pick('<td.*?<td.*?<td.*?>(.*?)<', row)
         build.bu_id = pick('slbh=(.*?)&', row)
         build.insert_db()
         self.get_house_info(house_url, build.bu_id)
Пример #5
0
    def get_build_info(self, comm_url_list):
        """Insert the building and its houses for every entry of ``comm_url_list``.

        Each entry must contain at least two ``+<digits>+`` groups; the first
        is used as ``sid`` and the second as ``pid``. The building page
        (``bldg_query.aspx``) is parsed into a ``Building``; the house list
        (``proxp.aspx``) is parsed into one ``House`` per ``<Result>`` element.

        Entries whose ids cannot be extracted are logged and skipped, because
        both requests need them. A failure while parsing the building page is
        logged but the houses are still fetched, as before.
        """
        flags = re.S | re.M

        def grab(pattern, text):
            return re.search(pattern, text, flags).group(1)

        for i in comm_url_list:
            # Resolve ids and create the record up front so that later code
            # never sees an unbound or previous-iteration sid/build.
            try:
                ids = re.findall(r'\+(\d+)\+', i)
                sid, pid = ids[0], ids[1]
                build = Building(co_index)
                build.bu_id = pid
            except Exception as e:
                print('co_index={}, 楼栋错误,url={}'.format(co_index, i), e)
                continue

            build_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid=' + pid + '&sid=' + sid
            try:
                html = requests.get(build_url).text
                # NOTE(review): bu_num and bu_address use the same pattern in
                # the original code; kept as-is.
                build.bu_num = grab('楼栋座落.*?<td.*?>(.*?)<', html)
                build.bu_address = grab('楼栋座落.*?<td.*?>(.*?)<', html)
                build.bu_pre_sale = grab('预售证号.*?">(.*?)&nbsp', html)
                build.bu_pre_sale_date = grab('时间.*?">(.*?)&nbsp', html)
                build.bu_all_house = grab('dM.*?">(.*?)&nbsp', html)
                build.insert_db()
            except Exception as e:
                print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url),
                      e)

            house_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx?key=WWW_LPB_001&params=' + sid
            html_ = requests.get(house_url).text

            for house_info in re.findall('<Result.*?</Result>', html_, flags):
                try:
                    house = House(co_index)
                    house.bu_id = build.bu_id
                    house.bu_num = build.bu_num
                    house.ho_name = grab('<ONAME>(.*?)</ONAME>', house_info)
                    house.ho_num = grab('<OSEQ>(.*?)</OSEQ>', house_info)
                    house.ho_build_size = grab('<BAREA>(.*?)</BAREA>',
                                               house_info)
                    house.ho_floor = grab('<FORC>(.*?)</FORC>', house_info)
                    house.ho_true_size = grab('<PAREA>(.*?)</PAREA>',
                                              house_info)
                    house.insert_db()
                except Exception as e:
                    print('co_index={}, 房号错误'.format(co_index), e)
Пример #6
0
    def get_build_info(self, co_id, co_name):
        """Walk project -> pre-sale pages -> building pages and insert buildings.

        The project page lists pre-sale rows (first row skipped); each
        pre-sale page lists building rows (first row skipped). For every
        building row the module-level counter ``building_id`` is incremented
        and used as the building id, the building page is fetched, a
        ``Building`` is inserted and each house link is passed to
        ``self.get_house_info``.

        Pages are decoded as GBK. Non-200 responses are reported and skipped;
        other per-building and per-house errors are printed and skipped.
        """
        global building_id
        base = 'http://www.czhome.com.cn/'

        url = 'http://www.czhome.com.cn/Presell.asp?projectID=' + co_id + '&projectname=' + co_name
        response = requests.get(url, headers=self.headers)
        project_tree = etree.HTML(response.content.decode('gbk'))
        presale_rows = project_tree.xpath('//tr[@class="indextabletxt"]')
        for presale_row in presale_rows[1:]:
            presale_url = base + presale_row.xpath('td[2]/a/@href')[0]
            result = requests.get(presale_url, headers=self.headers)
            # Compare by value: "is not 200" tested object identity, which
            # only worked by accident of CPython's small-int cache.
            if result.status_code != 200:
                print("co_index={},预售url:{}连接失败".format(co_index, presale_url))
                continue
            presale_tree = etree.HTML(result.content.decode('gbk'))
            bu_rows = presale_tree.xpath('/html/body/table/tr/td/table/tr/td/table/tr')[1:]
            for bu_row in bu_rows:
                try:
                    building = Building(7)
                    # The id is consumed even if the rest of this row fails.
                    building_id += 1
                    building.bu_id = building_id
                    bu_all_house = bu_row.xpath('td[7]/text()')[0]
                    bu_page_url = base + bu_row.xpath('td[1]/a/@href')[0]
                    bu_response = requests.get(bu_page_url, headers=self.headers)
                    if bu_response.status_code != 200:
                        print("co_index={},楼栋url:{}连接失败".format(co_index, bu_page_url))
                        continue
                    bu_html = bu_response.content.decode('gbk')
                    bu_tree = etree.HTML(bu_html)
                    # Last floor label in the first column.
                    bu_floor = bu_tree.xpath('//*[@id="Table4"]/tr[2]/td/table[3]/tr/td[1]/u/text()')[-1]
                    house_url_list = bu_tree.xpath('//*[@id="Table4"]/tr[2]/td/table[3]/tr/td/a/@href')
                    bu_address = re.search('<center><font color=.*?&nbsp;&nbsp;(.*?)<', bu_html, re.S | re.M).group(1)
                    building.bu_all_house = bu_all_house
                    building.bu_address = bu_address
                    building.bu_floor = bu_floor
                    building.co_id = co_id
                    building.insert_db()
                    for house_href in house_url_list:
                        try:
                            house = House(7)
                            house_url = base + house_href
                            self.get_house_info(house_url, house, co_id, building_id, building)
                        except Exception as e:
                            print(e)
                except Exception as e:
                    print(e)
Пример #7
0
    def comm(self, id):
        """Fetch one building and its rooms from the JSON API and insert them.

        Three endpoints under ``self.start_url`` are queried with ``id``:
        the home-page info, the detail info and the house list. A
        ``Building`` is inserted from the first two; then, for every room in
        every group of the house list, the room detail endpoint is queried
        and a ``House`` is inserted. Per-room errors are printed and skipped.
        """
        bu = Building(co_index)

        house_url = self.start_url + "/api/buildInfos/getHouseInfosByPannelNumber?pannelNumber=" + str(id)
        comm_url = self.start_url + "/api/buildInfos/getHomePageBuildingInfo?blockNumber=" + str(id)
        comm_detail_url = self.start_url + "/api/buildInfos/getDetailsBuildingInfo?blockNumber=" + str(id)

        # All three requests are issued before any response is parsed.
        comm_res = requests.get(comm_url)
        comm_detail_res = requests.get(comm_detail_url)
        house_res = requests.get(house_url)
        comm_dict = json.loads(comm_res.text)
        comm_detail_dict = json.loads(comm_detail_res.text)
        house_dict = json.loads(house_res.text)

        bu.bu_id = id
        bu.bu_num = comm_dict["data"]["nameBuildings"]
        bu.area = comm_detail_dict['data']['houseingArea']
        bu.bu_address = comm_dict["data"]["houseaddress"]
        bu.bu_pre_sale = comm_detail_dict["data"]["yszh"]
        bu.bu_type = comm_dict["data"]["propertycategory"]
        bu.bo_develops = comm_dict["data"]["companyName"]

        bu.insert_db()

        for group in house_dict["data"]:
            # One House object per group, reused for every room in it.
            ho = House(co_index)
            rooms = group["data"]
            if len(rooms) == 0:
                continue
            for room in rooms:
                try:
                    room_id = room["houseNumber"]
                    room_url = self.start_url + "/api/buildInfos/getHouseInfoByHouseNumber?houseNumber=" + str(room_id)
                    res = requests.get(room_url, headers=self.headers)
                    detail = json.loads(res.text)
                    ho.bu_id = id
                    ho.ho_name = detail["data"]["houseNo"]
                    ho.ho_build_size = detail["data"]["buildArea"]
                    ho.ho_true_size = detail["data"]["jacketArea"]
                    ho.ho_share_size = detail["data"]["apportionedArea"]
                    ho.ho_floor = detail["data"]["nominalLevel"]
                    ho.insert_db()
                except Exception as e:
                    print(e)
Пример #8
0
 def get_comm_detail(self, detail_url, area):
     """Scrape one community detail page, insert it and its buildings.

     The page at ``HousePresell/<detail_url>`` is parsed by regex into a
     ``Comm`` record (``area`` is stored on it unchanged). Every ``<tr>``
     of the ``donglist`` table becomes a ``Building`` and its link is
     passed to ``self.get_house_info``. Any exception aborts the whole
     page and is printed.
     """
     flags = re.S | re.M

     def capture(pattern, text):
         return re.search(pattern, text, flags).group(1)

     try:
         comm = Comm(co_index)
         comm_detail_url = 'http://www.yfci.gov.cn:8080/HousePresell/' + detail_url
         html = requests.get(comm_detail_url, headers=self.headers).text
         comm.co_develops = capture('id="kfsmc".*?<a.*?>(.*?)<', html)
         comm.co_name = capture('id="PresellName".*?<a.*?>(.*?)<', html)
         comm.co_address = capture('id="HouseRepose".*?>(.*?)<', html)
         comm.co_build_size = capture('id="PresellArea".*?>(.*?)<', html)
         comm.co_all_house = capture('id="djrqtd".*?>(.*?)<', html)
         comm.co_land_use = capture('id="landinfo".*?>(.*?)<', html)
         comm.co_type = capture('id="zczjtd".*?>(.*?)<', html)
         comm.co_pre_sale = capture('id="bookid".*?<a.*?>(.*?)<', html)
         comm.co_pre_sale_date = capture('id="FZDatebegin".*?>(.*?)<', html)
         comm.co_open_time = capture('id="kpdate".*?>(.*?)<', html)
         comm.co_id = capture('FD=(.*?)&', detail_url)
         comm.area = area
         comm.insert_db()

         build_html = re.search('id="donglist".*?</table>', html,
                                flags).group()
         for row in re.findall('<tr.*?</tr>', build_html, flags):
             build = Building(co_index)
             build.co_id = comm.co_id
             build.bu_address = capture('<td.*?<td.*?>(.*?)<', row)
             build.bu_num = capture('<td.*?<td.*?<td.*?>(.*?)<', row)
             build.bu_floor = capture('<td.*?<td.*?<td.*?<td.*?>(.*?)<', row)
             house_url = capture('href="(.*?)"', row)
             build.bu_id = capture("LID=(.*?)$", house_url)
             build.insert_db()
             self.get_house_info(house_url, comm.co_id, build.bu_id)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url),
               e)
Пример #9
0
 def build_parse(self, co_id):
     """Parse the building list of project ``co_id`` and crawl each building.

     Posts ``pid``/``pageNo=1``/``pageSize=50`` to ``ProNBList.do`` and
     treats every ``<tr objid...>`` row of the response as one building.
     A single ``Building`` object is refilled and inserted for each row,
     then ``self.house_parse`` is called. No errors are caught here.
     """
     bu = Building(co_index)
     response = requests.post(
         "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/ProNBList.do",
         data={"pid": co_id, "pageNo": "1", "pageSize": "50"},
         headers=self.headers)
     rows = re.findall('<tr objid.*?</tr>', response.text, re.S | re.M)
     for row in rows:
         spans = re.findall('<span>(.*?)<', row)
         bu.co_id = co_id
         bu.bu_id = re.search(r'objid="(\d+)"', row).group(1)
         bu.bu_num = spans[1]
         bu.bu_floor = re.search(r'<td>(\d+)\(', row).group(1)
         bu.bu_address = re.findall('<td>(.*?)</td>', row)[-1]
         bu.insert_db()
         self.house_parse(bu.bu_id, co_id)
Пример #10
0
    def build(self, res, co_id):
        """Insert a building for each left-aligned cell and crawl its houses.

        ``res.text`` is parsed as HTML; every ``td[@align='left']`` inside
        the table of width 739 supplies an address (link text) and a house
        URL (link href, which also carries ``houseid``). A cell that fails
        to parse or insert is reported and skipped.
        """
        bu = Building(co_index)
        cells = etree.HTML(res.text).xpath(
            "//table[@width='739']//td[@align='left']")
        for cell in cells:
            try:
                bu.co_id = co_id
                bu.bu_address = cell.xpath("./a/text()")[0]
                house_url = cell.xpath("./a/@href")[0]
                bu.bu_id = re.search(r'houseid=(\d+)&', house_url).group(1)

                bu.insert_db()
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index),e)
            else:
                # Runs outside the try so house errors are not swallowed here.
                self.house(house_url, bu.bu_id, co_id)
Пример #11
0
    def get_build_info(self, build_url_list, bu_pre_sale_list, co_name, co_id):
        """Insert each building of a community together with its houses.

        ``build_url_list`` and ``bu_pre_sale_list`` are indexed in parallel.
        Building and house pages are fetched from 221.2.144.162:8090 and
        decoded as GBK. Building-level and house-level errors are printed
        separately and the affected item is skipped.
        """
        host = 'http://221.2.144.162:8090/'
        flags = re.S | re.M
        for idx, rel_url in enumerate(build_url_list):
            try:
                build = Building(co_index)
                build.co_id = co_id

                build.co_name = co_name
                build.bu_pre_sale = bu_pre_sale_list[idx]
                build.bu_id = re.search(r'lh=(\d+)', rel_url).group(1)
                page = requests.get(host + rel_url, headers=self.headers)
                html = page.content.decode('gbk')
                build.bu_num = re.findall('<font color=white.*?><b>(.*?)<',
                                          html, flags)[0]
                build.bu_address = re.findall('坐落位置:</b>(.*?)<', html,
                                              flags)[0]
                build.insert_db()
                ho_url_list = re.findall('background-.*?href=(.*?) ', html,
                                         flags)
                ho_name_list = re.findall('background-color.*?<a.*?>(.*?)<',
                                          html, flags)
                for pos, ho_rel_url in enumerate(ho_url_list):
                    try:
                        house = House(co_index)
                        detail = requests.get(
                            host + ho_rel_url,
                            headers=self.headers).content.decode('gbk')
                        house.bu_id = build.bu_id
                        house.co_id = co_id
                        house.ho_type = re.findall(
                            '用&nbsp;&nbsp;&nbsp;途:.*?<td.*?>(.*?)<', detail,
                            flags)[0]
                        house.ho_build_size = re.findall(
                            '建筑面积:.*?<td>(.*?)<', detail, flags)[0]
                        house.bu_num = build.bu_num
                        house.co_name = co_name
                        house.ho_name = ho_name_list[pos]
                        house.insert_db()
                    except Exception as e:
                        print("co_index={},房屋信息错误".format(co_index), e)
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
Пример #12
0
    def build_parse(self, co_id):
        """Insert every building of project ``co_id`` and crawl its houses.

        The project's left-hand menu page supplies the building links
        (``td[@colspan='2']`` hrefs, with the first four and the last one
        dropped) and, in parallel order, the cells holding the house-list
        links (``td[@width='54%']``). A building that fails to parse is
        logged and skipped. A failure in the house step is now logged too
        instead of being silently ignored.
        """
        flags = re.S | re.M

        def grab(pattern, text):
            return re.search(pattern, text, flags).group(1)

        bu = Building(co_index)

        url = "http://spf.tlfdc.cn/prjleft.aspx?projectid=" + str(co_id)
        res = requests.get(url, headers=self.headers)
        con_html = etree.HTML(res.text)
        build_url_list = con_html.xpath("//td[@colspan='2']/a/@href")[4:-1]
        house_cells = con_html.xpath("//td[@width='54%']")

        for index, rel_url in enumerate(build_url_list):
            # Built before the try so both error messages can always show it.
            build_info_url = "http://spf.tlfdc.cn/" + rel_url
            try:
                con = requests.get(build_info_url, headers=self.headers).text
                bu.co_id = co_id
                bu.bu_pre_sale_date = grab('发证日期.*?Date">(.*?)<', con)
                bu.bu_num = grab('幢.*?did">(.*?)<', con)
                bu.bu_pre_sale = grab('编号.*?no">(.*?)<', con)
                bu.bu_address = grab('位置.*?ss">(.*?)<', con)
                bu.bu_build_size = grab('面积.*?Area">(.*?)<', con)
                bu.bu_type = grab('性质.*?type">(.*?)<', con)
                bu.bu_all_house = grab('套数.*?number">(.*?)<', con)
                bu.bu_id = re.search(r'id=(\d+)', rel_url).group(1)

                bu.insert_db()
            except Exception as e:
                print(
                    '楼栋错误,co_index={},url={}'.format(co_index, build_info_url),
                    e)
                continue
            try:
                house_url = house_cells[index].xpath("./a/@href")[0]
                self.house_parse(house_url, co_id, bu.bu_id)
            except Exception as e:
                # Still best-effort, but no longer invisible.
                print(
                    '房号错误,co_index={},url={}'.format(co_index, build_info_url),
                    e)
Пример #13
0
 def get_build_info(self, co_id):
     """Insert every building of activity ``co_id`` and crawl its houses.

     Posts ``activityId``/``module=jtsActBuildingInfo`` to the ``jdi``
     endpoint, reads ``ROWS.ROW`` from the JSON reply and copies the listed
     keys onto a new ``Building`` via ``self.dict_get``. No errors are
     caught here.
     """
     # (Building attribute, JSON key) in assignment order; LOCATION feeds
     # both the address and the building number.
     field_map = (
         ('bu_all_size', 'BUILDING_AREA'),
         ('bu_address', 'LOCATION'),
         ('bu_num', 'LOCATION'),
         ('bu_floor', 'TOTAL_FLOORS'),
         ('bu_all_house', 'TOTAL_SET'),
         ('co_build_structural', 'STRUCTURE'),
         ('bu_id', 'RESOURCE_GUID'),
     )
     payload = "activityId=" + str(co_id) + "&module=jtsActBuildingInfo"
     result = requests.post(url='http://www.yanjifc.com/jdi',
                            data=payload,
                            headers=self.headers)
     for row in result.json()['ROWS']['ROW']:
         build = Building(co_index)
         for attr, key in field_map:
             setattr(build, attr, self.dict_get(row, key))
         build.co_id = co_id
         build.insert_db()
         self.get_house_info(co_id, build.bu_id)
Пример #14
0
 def get_build_info(self, build_url_list, co_id):
     """Scrape and insert each building page, then crawl its house links.

     Every entry of ``build_url_list`` is appended to the fjlyfdc.com.cn
     host. Labelled table cells are read by regex into a ``Building``; the
     ``HouseInfo`` links on the page go to ``self.get_house_info``. Any
     failure for one building is printed with its URL.
     """
     flags = re.S | re.M
     for rel_url in build_url_list:
         build_url = 'http://www.fjlyfdc.com.cn/' + rel_url
         try:
             build = Building(co_index)
             html = requests.get(build_url, headers=self.headers).text

             def cell(pattern):
                 # Value of the cell following a label on this page.
                 return re.search(pattern, html, flags).group(1)

             build.bu_id = re.search('buildingInfoID=(.*?)&', build_url).group(1)
             build.co_id = co_id
             build.bo_develops = cell('开发商:.*?<td.*?>(.*?)<')
             build.co_name = cell('项目名称:.*?<td.*?>(.*?)<')
             build.bu_address = cell('坐落位置:.*?<td.*?>(.*?)<')
             build.bu_num = cell('幢号:.*?<td.*?>(.*?)<')
             build.co_build_structural = cell('建筑结构:.*?<td.*?>(.*?)<')
             build.bu_type = cell('设计用途:.*?<td.*?>(.*?)<')
             build.bu_floor = cell('总 层 数:.*?<td.*?>(.*?)<')
             build.co_all_size = cell('总面积:.*?<td.*?>(.*?)<')
             build.bo_build_start_time = cell('开工日期:.*?<td.*?>(.*?)<')
             build.insert_db()
             house_url_list = re.findall(
                 r'href="(/House/HouseInfo\?HouseCenterID=.*?)"', html, flags)
             self.get_house_info(house_url_list, build.bu_id, co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
Пример #15
0
 def get_build_info(self, build_url_list):
     """Crawl each building page through ``ProducerListUrl``.

     The ``Building`` attributes are set to regex rule strings (not
     values) and ``build.to_dict()`` is given to ``ProducerListUrl`` as
     ``analyzer_rules_dict``. Whatever ``p.get_details()`` returns is
     forwarded to ``self.get_house_info``. Errors for one entry are
     printed and the loop continues.
     """
     # (Building attribute, extraction rule) in assignment order.
     rules = (
         ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
         ('bu_num', '幢  号:.*?<td.*?>(.*?)<'),
         ('bu_address', '坐落位置:.*?<td.*?>(.*?)<'),
         ('co_build_structural', '建筑结构:.*?<td.*?>(.*?)<'),
         ('bu_floor', '总 层 数:.*?<td.*?>(.*?)<'),
         ('bu_build_size', '总 面 积:.*?<td.*?>(.*?)<'),
         ('bu_all_house', '批准销售:.*?<td.*?>(.*?)<'),
     )
     for rel_url in build_url_list:
         try:
             build = Building(co_index)
             build_url = 'http://www.ndjsj.gov.cn/House/' + rel_url
             for attr, rule in rules:
                 setattr(build, attr, rule)
             producer = ProducerListUrl(
                 page_url=build_url,
                 request_type='get', encode='utf-8',
                 analyzer_rules_dict=build.to_dict(),
                 current_url_rule='javascript:ShowTitle.*?href="(.*?)"',
                 analyzer_type='regex',
                 headers=self.headers)
             self.get_house_info(producer.get_details())
         except Exception as e:
             print('宁德楼栋错误,url={}'.format(build_url), e)
Пример #16
0
    def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
        """Build and insert one ``Building`` from the HTML text ``bu_con``.

        Fields are extracted from ``bu_con`` by regex; ``bu_pre_sale``,
        ``bo_develops`` and ``bu_co_name`` are stored unchanged. A pattern
        that does not match raises (``None`` has no ``group``); nothing is
        caught here.
        """
        flags = re.S | re.M

        def field(pattern):
            return re.search(pattern, bu_con, flags).group(1)

        build = Building(co_index)

        build.bu_id = field(r'编号.*?>(\d+)<')
        build.bu_num = field(r'幢号.*?>(\d+)<')
        build.bu_floor = field(r'总层数.*?>(\d+)<')
        build.bu_build_size = field(r'预售建筑面积.*?>(\d+.\d+)<')
        build.bu_address = field('楼房坐落.*?;">(.*?)</span')
        build.bu_live_size = field(r'住宅建筑面积.*?>(\d+.\d+)<')
        build.bu_not_live_size = field('非住宅建筑面积.*?;">(.*?)</span')
        build.bo_build_start_time = field('开工日期.*?;">(.*?)</span')
        build.bu_all_house = field(r'总套数.*?>(\d+)<')
        build.bu_pre_sale = bu_pre_sale
        build.bo_develops = bo_develops
        build.co_name = bu_co_name
        build.insert_db()
Пример #17
0
    def get_comm_info(self,comm_info):
        """Insert one community from a list-row fragment and crawl its buildings.

        ``comm_info`` is an HTML fragment holding the community name, area,
        (optionally) address and the link to its detail page. The detail
        page (GBK) supplies developer/total figures and the building rows.
        Rows containing "进入" are followed:

        * URL contains ``ID=<digits>``: a ``Building`` with that id is
          inserted and ``self.get_house_info`` is called with the page URL.
        * otherwise: shared fields are read from the building page, then
          every ``<tr onmouseover`` row is inserted as a building and its
          house page content is passed to ``self.get_house_info``.

        If the detail page cannot be requested the error is logged and the
        method returns without inserting anything (previously this crashed
        with a NameError on the undefined response).
        """
        base_url = "http://www.qyfgj.cn/newys/"
        flags = re.S | re.M

        def _headers(referer):
            # Follow-up requests share the UA and session cookie; only the
            # Referer differs.
            return {
                'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie':
                    'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer':
                    referer
            }

        co = Comm(co_index)
        co.co_name = re.search('_blank">(.*?)</a',comm_info).group(1)
        try:
            co.co_address = re.findall('px">(.*?)</td',comm_info)[1]
        except IndexError:
            # Fewer than two matching cells: the row carries no address.
            co.co_address = None
        co.area = re.search('center">(.*?)</td>',comm_info).group(1)
        co_detail_url = re.search("href='(.*?)'",comm_info).group(1)
        co_url = base_url + co_detail_url
        try:
            res = requests.get(co_url,headers=self.headers)
        except Exception as e:
            print("co_index={}小区未请求到".format(co_index),e)
            return
        con = res.content.decode('gbk')
        try:
            co.co_develops = re.search('开发商名称.*?px;">(.*?)</a',con,flags).group(1)
            co.co_all_house = re.search(r'总套数.*?">(\d+)&nbsp',con,flags).group(1)
            co.co_all_size = re.search(r'总面积.*?">(\d+.\d+)&nbsp;m',con,flags).group(1)
        except Exception:
            print("小区无开发商等信息")
        co.insert_db()

        build = re.findall('<tr bgcolor="white">(.*?)</tr>',con,flags)
        if not build:
            # findall never raises on "no match", so the original try/except
            # could not emit this message; report the empty case explicitly.
            print("小区没有楼栋信息")
        build_headers = _headers(co_url)

        for build_info in build:
            if "进入" not in build_info:
                print("楼栋无链接地址")
                continue

            build_url = base_url + re.search('href="(.*?)"><font',build_info).group(1)
            ho_headers = _headers(build_url)
            # Requested in both branches, as in the original code.
            build_res = requests.get(build_url, headers=build_headers)
            build_con = build_res.content.decode('gbk')

            bu = Building(co_index)
            bu.co_name = co.co_name
            id_match = re.search(r'ID=(\d+)',build_url)
            if id_match:   # 现售 (existing-stock sale)
                bu_id = id_match.group(1)
                bu.bu_id = bu_id
                bu.insert_db()
                self.get_house_info(headers=ho_headers,bu_id=bu_id,url=build_url)
                continue

            # 预售 (pre-sale): shared fields once, then one insert per row.
            bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con, flags).group(1)
            bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>', build_con, flags).group(1)
            bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>', build_con, flags).group(1)
            bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con, flags).group(1)
            for row in re.findall('<tr onmouseover(.*?)</tr',build_con,flags):
                house_url = base_url + re.search('href="(.*?)"',row).group(1)
                bu.bu_id = re.search('dbh=(.*?)&',row).group(1)
                bu.bu_num = re.search('<td width="89.*?">(.*?)</',row).group(1)
                bu.bu_floor = re.search(r'<td width="84.*?">(\d+)</td',row).group(1)
                bu.insert_db()

                ho_res = requests.get(house_url,headers=ho_headers)
                ho_con = ho_res.content.decode('gbk')
                self.get_house_info(ho_con=ho_con,headers=_headers(house_url),bu_id=bu.bu_id)
Пример #18
0
 def get_build_detail(self, all_building_url_list):
     """Scrape each project page and insert one Building per table row.

     For every URL the page is downloaded and the page-level values
     (developer, sale area, pre-sale certificate, total floors, total
     units, usage, project name) are read once.  A ``Building`` is then
     inserted for every row matched inside the ``houseTable_1`` section,
     and the row's ``roomTable.aspx`` link is collected.

     A page that cannot be fetched or parsed is reported and skipped.  A
     single row that fails is reported and skipped without discarding the
     other rows of the same page.

     :param all_building_url_list: iterable of project page URLs.
     :return: list of absolute house-table URLs.  URLs from later pages
         are placed before those from earlier pages; inside one page the
         row order is kept.
     """
     request_timeout = 30  # seconds; without it requests can block forever
     house_url_list = []
     for page_url in all_building_url_list:
         try:
             response = requests.get(page_url,
                                     headers=self.headers,
                                     timeout=request_timeout)
             html = response.text
             tree = etree.HTML(html)

             # Page-level values, shared by every row of this page.
             bo_develops = tree.xpath(
                 '//*[@id="content_1"]/div[3]/text()[2]')[0]  # developer
             # The next two cells are optional: when the xpath matches
             # nothing, the (empty) xpath result itself is stored.
             size_cells = tree.xpath(
                 '//*[@id="houseTable_1"]/tr[2]/td[6]/a/text()')  # sale area
             bu_build_size = size_cells[0] if size_cells else size_cells
             pre_sale_cells = tree.xpath(
                 '//*[@id="houseTable_1"]/tr[2]/td[1]/a/text()')  # pre-sale certificate
             bu_pre_sale = (pre_sale_cells[0]
                            if pre_sale_cells else pre_sale_cells)
             # These three are mandatory: a missing cell raises IndexError
             # and the whole page is skipped by the handler below.
             bu_floor = tree.xpath(
                 '//*[@id="houseTable_1"]/tr[2]/td[3]/a/text()')[0]  # total floors
             bu_all_house = tree.xpath(
                 '//*[@id="houseTable_1"]/tr[2]/td[4]/a/text()')[0]  # total units
             bu_type = tree.xpath(
                 '//*[@id="houseTable_1"]/tr[2]/td[5]/a/text()')[0]  # usage

             build_html = re.search('houseTable_1.*?当前共有', html,
                                    re.S | re.M).group()
             build_detail_html = re.findall(
                 'class.*?</a></td>.*?</a></td>.*?</a></td>', build_html,
                 re.S | re.M)
             bu_num = re.findall('项目名称:</b>(.*?)</div>', html,
                                 re.S | re.M)[0].strip()

             page_house_urls = []
             for row_html in build_detail_html:
                 try:
                     build = Building(co_index)
                     build.bu_id = re.search(
                         r"href='roomTable.aspx\?id=(.*?)&", row_html,
                         re.S | re.M).group(1)
                     build.bu_address = re.search(
                         "_blank.*?_blank'>(.*?)</a></td><td>", row_html,
                         re.S | re.M).group(1).strip()
                     build.bo_develops = bo_develops
                     build.bu_build_size = bu_build_size
                     build.bu_pre_sale = bu_pre_sale
                     build.bu_num = bu_num
                     build.bu_floor = bu_floor
                     build.bu_all_house = bu_all_house
                     build.bu_type = bu_type
                     # NOTE(review): there is no early exit, so when several
                     # names occur in the address the last one in
                     # self.area_list wins -- confirm that first-match was
                     # not the intent.
                     for area_name in self.area_list:
                         if area_name in build.bu_address:
                             build.area = area_name
                     build.insert_db()
                     house_url = re.search(
                         r"(roomTable.aspx\?id=.*?&vc=.*?)'", row_html,
                         re.S | re.M).group(1)
                     page_house_urls.append(
                         'http://dgfc.dg.gov.cn/dgwebsite_v2/Vendition/' +
                         house_url)
                 except Exception as e:
                     # Best effort: report the bad row and keep going.
                     print('楼栋错误,co_index={},url={}'.format(
                         co_index, page_url), e)
             # Newest page first, matching the order callers already get.
             house_url_list = page_house_urls + house_url_list
         except Exception as e:
             # Best effort: report the bad page and keep going.
             print('楼栋错误,co_index={},url={}'.format(co_index, page_url), e)
     return house_url_list
Пример #19
0
 def start_crawler(self):
     """Crawl the pre-sale listing and store every building and its houses.

     Pages 1 to 20 of the listing endpoint are requested with POST.  For
     each entry found in a page, the building detail page is fetched,
     parsed into a ``Building`` and inserted.  The house list of that
     building is then requested and each entry is inserted as a ``House``.

     Every failure (request or parse) is logged through ``log.error`` and
     the affected page / building / house is skipped; nothing is raised
     for those cases and nothing is returned.
     """
     request_timeout = 30  # seconds; without it requests can block forever
     url = 'http://zzx.zzfc.com/ajaxpro/xy_ysxk_more,App_Web_mjeeodb-.ashx'
     # The house endpoint and its headers do not depend on the building,
     # so they are built once instead of once per building.
     ho_url = 'http://zzx.zzfc.com/ajaxpro/xy_housetag,App_Web_xg4ulr9n.ashx'
     ho_headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
         'X-AjaxPro-Method': 'GETLPBDS'
     }
     # 20 pages x 30 rows covers the hard-coded rowcount of 589.
     # NOTE(review): both numbers are fixed here -- confirm they still
     # match the size of the remote listing.
     for page_no in range(1, 21):
         payload = ('{"pageNo":' + str(page_no) +
                    ',"pageSize":30,"rowcount":589}')
         try:
             response = requests.post(url,
                                      data=payload,
                                      headers=self.headers,
                                      timeout=request_timeout)
             con = response.content.decode()
         except Exception as e:
             log.error('楼栋请求失败{}'.format(e))
             continue

         for comm in re.findall(r'\[\d+,.*?\d+\]', con):
             # --- building -------------------------------------------------
             try:
                 sid = re.search(r'\[(\d+),', comm).group(1)
                 pid = re.search(r'",(\d+),', comm).group(1)
                 bu_url = 'http://zzx.zzfc.com/xy_bldg.aspx?pid=' + pid + '&sid=' + sid
                 bu_res = requests.get(bu_url,
                                       headers=self.headers,
                                       timeout=request_timeout)
                 bu_con = bu_res.content.decode()
                 bu = Building(co_index)
                 bu.bu_id = sid
                 bu.bu_address = re.search('楼栋座落.*?">(.*?)&nbsp', bu_con,
                                           re.S | re.M).group(1)
                 bu.bu_pre_sale = re.search('预售证号.*?">(.*?)&nbsp', bu_con,
                                            re.S | re.M).group(1)
                 bu.bu_pre_sale_date = re.search('预售日期.*?">(.*?)&nbsp',
                                                 bu_con,
                                                 re.S | re.M).group(1)
                 bu.bu_all_house = re.search('套数.*?">(.*?)&nbsp', bu_con,
                                             re.S | re.M).group(1)
                 bu.insert_db()
             except Exception as e:
                 # Without a stored building its houses are not crawled.
                 log.error("{}楼栋解析失败{}".format(comm, e))
                 continue

             # --- houses of that building ----------------------------------
             data = '{"m_key":"WWW_LPB_001","m_param":"' + sid + '"}'
             try:
                 ho_res = requests.post(ho_url,
                                        data=data,
                                        headers=ho_headers,
                                        timeout=request_timeout)
                 ho_con = ho_res.content.decode()
             except Exception as e:
                 log.error("房屋请求失败{}".format(e))
                 continue

             for house in re.findall(r'\["\d+.*?\d+\]', ho_con):
                 try:
                     ho = House(co_index)
                     ho.bu_id = sid
                     # Fields are taken by position from the comma-split
                     # entry; the two sizes are counted from the end.
                     info_list = house.split(",")
                     ho.ho_name = info_list[4]
                     ho.ho_floor = re.search(r'(\d+)层', house).group(1)
                     ho.ho_build_size = info_list[-3]
                     ho.ho_true_size = info_list[-2]
                     ho.insert_db()
                 except Exception as e:
                     log.error("{}房屋解析错误{}".format(house, e))
                     continue
Пример #20
0
    def get_comm_detail(self, href, comm):
        """Fill ``comm`` from its detail page and store it with its buildings.

        The page at ``self.URL_FRONT + href`` is downloaded and decoded as
        GBK.  The community fields are extracted with ``self.regex_common``
        and written onto ``comm``.  For every building link found on the
        page a ``Building`` is created, completed with the result of
        ``self.get_build_detail`` and inserted, after which
        ``self.get_house_info`` is called for it.  Finally ``comm`` itself
        is inserted.

        :param href: path of the community detail page, appended to
            ``self.URL_FRONT``.
        :param comm: community record to populate; must offer
            ``insert_db()``.
        :return: ``None``.
        """
        comm_detail_url = self.URL_FRONT + href
        # A timeout keeps one stalled server from blocking the crawl forever.
        response = requests.get(url=comm_detail_url, timeout=30)
        # The community id is whatever follows the first '=' of the final
        # URL (response.url, i.e. after any redirect).
        co_id = int(response.url.split('=')[1])
        html = response.content.decode('gbk')

        field_patterns = (
            ('co_name', r'项目名称.*?<td.*?>(.*?)</td>'),  # community name
            ('co_owner', r'房屋所有权证号.*?<td.*?>(.*?)</td>'),
            ('co_use', r'用  途.*?<td.*?>(.*?)</td>'),
            ('co_develops', r'开 发 商.*?<td.*?>(.*?)</td>'),
            ('co_address', r'项目位置.*?<td.*?>(.*?)</td>'),
            ('co_pre_sale', r'预售证号.*?<td.*?>(.*?)</td>'),
            ('co_land_use', r'土地使用权证.*?<td.*?>(.*?)</td>'),
            ('co_land_type', r'土地权证类型.*?<td.*?>(.*?)</td>'),
            ('co_handed_time', r'终止日期.*?<td.*?>(.*?)</td>'),
            ('co_plan_pro', r'规划许可证.*?<td.*?>(.*?)</td>'),
            ('co_work_pro', r'施工许可证.*?<td.*?>(.*?)</td>'),
            ('co_type', r'项目类型.*?<td.*?>(.*?)</td>'),  # community type
            ('co_size', r'批准面积.*?<td.*?>(.*?)</td>'),  # land area
        )
        # Extract everything before touching comm, so an extraction that
        # raises leaves comm unmodified.
        extracted = [(attr, self.regex_common(pattern, html))
                     for attr, pattern in field_patterns]
        comm.co_id = co_id
        for attr, value in extracted:
            setattr(comm, attr, value)

        # Collect the building URLs.
        build_url_list = re.findall(r"<td><a href='(.*?)'", html, re.M | re.S)
        if not build_url_list:
            # NOTE(review): comm is NOT inserted on this path -- confirm
            # that communities without building links should be dropped.
            return

        for build_url in build_url_list:
            try:
                building = Building(self.CO_INDEX)
                # NOTE(review): the searches below (and the house_url one
                # further down) scan the whole page, so every building of
                # this community receives the same first-match values --
                # confirm whether they should be scoped to the current row.
                build_id = re.search(r'<td>(\d{2,6})</td>', html,
                                     re.M | re.S).group(1)  # building id
                bu_all_house = re.search(r'<td>(\d{1,3})</td>', html,
                                         re.M | re.S).group(1)  # total units
                # Fifth numeric cell of the page; only its leading digits
                # (the integer part) are kept as the price.
                bu_price_demo = re.findall(r'<td>[\.\d]+</td>', html,
                                           re.M | re.S)[4]
                bu_price = re.search(r'\d+', bu_price_demo).group()

                data_dict = self.get_build_detail(build_url)
                bu_num = data_dict['bu_num']  # building number
                bu_build_size = data_dict['bu_build_size']  # floor area
                bu_address = data_dict['co_address']
                co_build_end_time = data_dict['co_build_end_time']  # completion time
                co_build_type = data_dict['co_build_type']
                if not co_build_end_time:
                    building.co_is_build = '1'
                comm.co_build_end_time = co_build_end_time
                comm.bu_build_size = bu_build_size
                comm.co_build_type = co_build_type

                # Building record.
                building.bu_address = bu_address
                building.bu_num = bu_num
                building.bu_build_size = bu_build_size
                building.bu_all_house = bu_all_house
                building.bu_id = build_id
                building.co_id = co_id
                building.bu_price = bu_price
                building.insert_db()

                house_url = re.search(
                    r'href="/(tt/business/buildingRooms_view.*?)"', html,
                    re.M | re.S).group(1)
                self.get_house_info(house_url, build_id, co_id)
            except Exception as e:
                # Best effort: report the building (with the cause, which
                # used to be discarded) and continue with the next one.
                build_detail_url = self.URL_FRONT + build_url
                print('楼栋错误:', build_detail_url, e)
        comm.insert_db()