Exemplo n.º 1
0
 def get_build_info(self,presell_url_list,co_id):
     """Scrape and store every building linked from the given presale pages.

     Each relative URL in ``presell_url_list`` is fetched from ``self.url``,
     the building links on that page are extracted, and one ``Building``
     record is inserted per link. After a successful insert, the house
     identifiers found on the building page are handed to
     ``self.get_house_info``. A building whose page cannot be fetched or
     parsed is reported with ``print`` and skipped.

     Args:
         presell_url_list: Relative presale-page URLs, appended to ``self.url``.
         co_id: Community id stored on each building and forwarded to
             ``get_house_info``.
     """
     flags = re.S | re.M
     for presell_url in presell_url_list:
         pre_url = self.url + presell_url
         res = requests.get(pre_url, headers=self.headers)
         build_url_list = re.findall(r'【<a href="(.*?)" target="_self"', res.text, flags)
         for build_url in build_url_list:
             build_info_url = self.url + build_url
             try:
                 build_res = requests.get(build_info_url, headers=self.headers)
                 con = build_res.text
                 bu = Building(co_index)
                 bu.co_id = co_id
                 # Raw strings keep regex escapes such as \d out of Python's
                 # own string-escape processing.
                 bu.bu_id = re.search(r'ID=(\d+)', build_url).group(1)
                 bu.bu_num = re.search(r'栋.*?号.*?BuildingName">(.*?)</span', con, flags).group(1)
                 bu.bu_floor = re.search(r'总 层 数.*?(\d+)</span', con, flags).group(1)
                 bu.bu_build_size = re.search(r'建筑面积.*?Jzmj">(.*?)</span', con, flags).group(1)
                 bu.bu_live_size = re.search(r'住宅面积.*?Zzmj">(.*?)</span', con, flags).group(1)
                 bu.bu_not_live_size = re.search(r'非住宅面积.*?Fzzmj">(.*?)</span', con, flags).group(1)
                 bu.bu_pre_sale = re.search(r'预售许可证.*?xkzh">(.*?)</span', con, flags).group(1)
                 bu.bu_pre_sale_date = re.search(r'发证日期.*?fzrq">(.*?)</span', con, flags).group(1)
                 bu.bu_type = re.search(r'项目类型.*?Type">(.*?)</span', con, flags).group(1)
                 bu.insert_db()
             except Exception as e:
                 # A missing field makes re.search return None, so .group()
                 # raises; the building is then skipped entirely.
                 print("co_index={},楼栋信息错误".format(co_index), e)
                 continue
             house_detail_list = re.findall(r'''getMoreHouseInfo\('(.*?)'\)"''', con, flags)
             self.get_house_info(co_id, bu.bu_id, house_detail_list)
Exemplo n.º 2
0
 def build_parse(self, co_id):
     """Scrape every building of one project and crawl its houses.

     The project's building list page is fetched, each building id found in
     it is resolved to a detail page, a ``Building`` record is inserted, and
     the detail page text is passed on to ``self.house_parse``.

     Args:
         co_id: Project id; concatenated into the list URL (so it must be a
             string) and stored on every building.
     """
     flags = re.S | re.M
     list_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/loulist.jsp?Id_xmxq=' + co_id
     res = requests.get(list_url, headers=self.headers)
     con = res.content.decode()
     build_id_list = re.findall(r"searchByLid\('(\d+)'\)", con)
     for build_id in build_id_list:
         bu_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/lpbxx_new.jsp?lid=' + build_id
         # Fetch separately from parsing: without a page there is nothing to
         # hand to house_parse, so the building must be skipped. Otherwise
         # bu_con would be undefined or still hold the previous building's page.
         try:
             bu_res = requests.get(bu_url, headers=self.headers)
             bu_con = bu_res.content.decode('gbk')
         except Exception as e:
             log.error('{}楼栋错误{}'.format(build_id, e))
             continue
         try:
             bu = Building(co_index)
             bu.co_id = co_id
             bu.bu_id = build_id
             bu.bu_num = re.search(r'楼栋名称.*?">(.*?)</td', bu_con, flags).group(1)
             bu.bu_all_house = re.search(r'总套数.*?">总(.*?)套</td', bu_con, flags).group(1)
             bu.bu_floor = re.search(r'地上层数.*?">共(.*?)层</td', bu_con, flags).group(1)
             bu.bu_build_size = re.search(r'总建筑面积.*?">(.*?)</td', bu_con, flags).group(1)
             bu.bu_pre_sale = re.search(r"searchysxk\('(.*?)'\)", bu_con, flags).group(1)
             bu.bu_type = re.search(r'房屋用途.*?">(.*?)</td', bu_con, flags).group(1)
             bu.insert_db()
         except Exception as e:
             # Best effort: a building record that fails to parse is logged,
             # but its houses are still crawled from the page just fetched.
             log.error('{}楼栋错误{}'.format(build_id, e))
         self.house_parse(co_id, build_id, bu_con)
Exemplo n.º 3
0
 def bu_info(self, bu_list, co_id):
     """Insert one ``Building`` per table row and crawl that building's houses.

     Args:
         bu_list: Row elements supporting ``xpath``. The first entry is
             skipped (presumably a header row -- confirm against the caller).
         co_id: Community id stored on each building and forwarded to
             ``self.ho_info``.

     Raises:
         IndexError: If a row lacks one of the expected cells.
         AttributeError: If the row's link has no ``buildid=<digits>`` part.
     """
     for bu_ in bu_list[1:]:
         bu = Building(co_index)
         bu.co_id = co_id
         bu.bu_num = bu_.xpath("./td/a/text()")[0]
         bu.bu_pre_sale = bu_.xpath("./td[2]/text()")[0]
         bu.bu_type = bu_.xpath("./td[4]/text()")[0]
         bu_url = bu_.xpath("./td/a/@href")[0]
         # Raw string so that \d reaches the regex engine unchanged.
         bu.bu_id = re.search(r'buildid=(\d+)', bu_url).group(1)
         bu.insert_db()
         self.ho_info(bu_url, co_id, bu.bu_id)
Exemplo n.º 4
0
    def comm(self, id):
        """Fetch one building from the JSON API and store it with its houses.

        Three endpoints under ``self.start_url`` are queried for the given
        number: the home-page summary, the detail record and the per-floor
        house list. One ``Building`` is inserted, then each house number in
        the list is resolved through a fourth endpoint and inserted as a
        ``House``. A house that fails to download or parse is printed and
        skipped.

        Args:
            id: Building/block number used in all API queries and stored as
                ``bu_id``. The name shadows the builtin but is kept because
                it is part of the method's signature.
        """
        bu = Building(co_index)

        house_url = self.start_url + "/api/buildInfos/getHouseInfosByPannelNumber?pannelNumber=" + str(id)
        comm_url = self.start_url + "/api/buildInfos/getHomePageBuildingInfo?blockNumber=" + str(id)
        comm_detail_url = self.start_url + "/api/buildInfos/getDetailsBuildingInfo?blockNumber=" + str(id)

        comm_res = requests.get(comm_url)
        comm_detail_res = requests.get(comm_detail_url)
        house_res = requests.get(house_url)
        comm_data = json.loads(comm_res.text)["data"]
        comm_detail_data = json.loads(comm_detail_res.text)["data"]
        house_dict = json.loads(house_res.text)

        bu.bu_id = id
        bu.bu_num = comm_data["nameBuildings"]
        bu.area = comm_detail_data['houseingArea']
        bu.bu_address = comm_data["houseaddress"]
        bu.bu_pre_sale = comm_detail_data["yszh"]
        bu.bu_type = comm_data["propertycategory"]
        bu.bo_develops = comm_data["companyName"]

        bu.insert_db()

        for hu in house_dict["data"]:
            # An empty group simply yields no iterations.
            for i in hu["data"]:
                # One fresh record per room, so no object is shared between
                # successive insert_db() calls.
                ho = House(co_index)
                try:
                    room_id = i["houseNumber"]
                    room_url = (self.start_url
                                + "/api/buildInfos/getHouseInfoByHouseNumber?houseNumber="
                                + str(room_id))
                    res = requests.get(room_url, headers=self.headers)
                    room = json.loads(res.text)["data"]
                    ho.bu_id = id
                    ho.ho_name = room["houseNo"]
                    ho.ho_build_size = room["buildArea"]
                    ho.ho_true_size = room["jacketArea"]
                    ho.ho_share_size = room["apportionedArea"]
                    ho.ho_floor = room["nominalLevel"]
                    ho.insert_db()
                except Exception as e:
                    print(e)
Exemplo n.º 5
0
 def get_build_detail(self, all_building_url_list):
     """Scrape building records from project pages and collect house-table URLs.

     For every page URL the project-level fields are read once, then each
     building row found in the ``houseTable_1`` section is inserted as a
     ``Building`` that shares those fields. Failures are printed and the
     offending row or page is skipped.

     Args:
         all_building_url_list: Absolute URLs of the pages to scrape.

     Returns:
         list: Absolute ``roomTable.aspx`` URLs. URLs from later pages are
         placed in front of those from earlier pages.
     """
     flags = re.S | re.M
     house_url_list = []
     for i in all_building_url_list:
         try:
             response = requests.get(i, headers=self.headers)
             html = response.text
             tree = etree.HTML(html)
             bo_develops = tree.xpath('//*[@id="content_1"]/div[3]/text()[2]')[0]  # developer
             # NOTE: the next two values stay an empty list when the cell is absent.
             bu_build_size = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[6]/a/text()')  # sales area
             if bu_build_size:
                 bu_build_size = bu_build_size[0]
             bu_pre_sale = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[1]/a/text()')  # presale certificate
             if bu_pre_sale:
                 bu_pre_sale = bu_pre_sale[0]
             bu_floor = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[3]/a/text()')[0]  # total floors
             bu_all_house = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[4]/a/text()')[0]  # total units
             bu_type = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[5]/a/text()')[0]  # building use
             build_html = re.search(r'houseTable_1.*?当前共有', html, flags).group()
             build_detail_html = re.findall(r'class.*?</a></td>.*?</a></td>.*?</a></td>', build_html, flags)
             bu_num = re.findall(r'项目名称:</b>(.*?)</div>', html, flags)[0].strip()
             url_list = []
             for bu in build_detail_html:
                 try:
                     build = Building(co_index)
                     build.bu_id = re.search(r"href='roomTable.aspx\?id=(.*?)&", bu, flags).group(1)
                     build.bu_address = re.search(
                         r"_blank.*?_blank'>(.*?)</a></td><td>", bu, flags).group(1).strip()
                     build.bo_develops = bo_develops
                     build.bu_build_size = bu_build_size
                     build.bu_pre_sale = bu_pre_sale
                     build.bu_num = bu_num
                     build.bu_floor = bu_floor
                     build.bu_all_house = bu_all_house
                     build.bu_type = bu_type
                     # Every known area name is tested; when several occur in
                     # the address, the last one in self.area_list wins.
                     for k in self.area_list:
                         if k in build.bu_address:
                             build.area = k
                     build.insert_db()
                     house_url = re.search(r"(roomTable.aspx\?id=.*?&vc=.*?)'", bu, flags).group(1)
                     url_list.append('http://dgfc.dg.gov.cn/dgwebsite_v2/Vendition/' + house_url)
                 except Exception as e:
                     print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
             house_url_list = url_list + house_url_list
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
     return house_url_list
Exemplo n.º 6
0
    def build_parse(self, co_id):
        """Scrape every building of one project and crawl its houses.

        The project's left-hand navigation page is fetched. Each building
        link found there (the slice ``[4:-1]`` of the matching anchors) is
        downloaded and inserted as a ``Building``. The house link at the same
        position in the ``width='54%'`` cells is then passed to
        ``self.house_parse``. Failures are printed and the building is
        skipped.

        Args:
            co_id: Project id used in the query string and stored on each
                building.
        """
        flags = re.S | re.M
        url = "http://spf.tlfdc.cn/prjleft.aspx?projectid=" + str(co_id)
        res = requests.get(url, headers=self.headers)
        con_html = etree.HTML(res.text)
        build_url_list = con_html.xpath("//td[@colspan='2']/a/@href")[4:-1]
        a = con_html.xpath("//td[@width='54%']")

        for index, build_url in enumerate(build_url_list):
            build_info_url = "http://spf.tlfdc.cn/" + build_url
            # One fresh record per building, so no object is shared between
            # successive insert_db() calls.
            bu = Building(co_index)
            try:
                res = requests.get(build_info_url, headers=self.headers)
                con = res.text
                bu.co_id = co_id
                bu.bu_pre_sale_date = re.search(r'发证日期.*?Date">(.*?)<', con, flags).group(1)
                bu.bu_num = re.search(r'幢.*?did">(.*?)<', con, flags).group(1)
                bu.bu_pre_sale = re.search(r'编号.*?no">(.*?)<', con, flags).group(1)
                bu.bu_address = re.search(r'位置.*?ss">(.*?)<', con, flags).group(1)
                bu.bu_build_size = re.search(r'面积.*?Area">(.*?)<', con, flags).group(1)
                bu.bu_type = re.search(r'性质.*?type">(.*?)<', con, flags).group(1)
                bu.bu_all_house = re.search(r'套数.*?number">(.*?)<', con, flags).group(1)
                bu.bu_id = re.search(r'id=(\d+)', build_url).group(1)

                bu.insert_db()
            except Exception as e:
                print(
                    '楼栋错误,co_index={},url={}'.format(co_index, build_info_url),
                    e)
                continue
            try:
                house_url = a[index].xpath("./a/@href")[0]
                self.house_parse(house_url, co_id, bu.bu_id)
            except Exception as e:
                # Still best effort, but the failure is reported instead of
                # vanishing silently.
                print(
                    '房屋错误,co_index={},bu_id={}'.format(co_index, bu.bu_id),
                    e)
                continue
Exemplo n.º 7
0
    def bu_parse(self, co_id, bulist):
        """Store each building in ``bulist`` and hand its house links onward.

        Args:
            co_id: Community id stored on each building and forwarded to
                ``self.ho_parse``.
            bulist: Relative building URLs; each must contain
                ``buildingInfoID=<id>&``.
        """
        flags = re.S | re.M
        # (attribute, pattern) pairs, applied to the page text in this order.
        page_fields = (
            ('bu_num', '幢号.*?">(.*?)</'),
            ('bu_floor', '总 层 数.*?">(.*?)</'),
            ('bu_live_size', '批准销售.*?">.*?</td.*?">(.*?)</td'),
            ('bu_all_size', '总面积.*?">(.*?)</'),
            ('bu_type', '设计用途.*?">(.*?)</'),
        )
        for rel_url in bulist:
            response = requests.get("http://110.89.45.7:8082" + rel_url,
                                    headers=self.headers)
            page = response.text

            building = Building(co_index)
            building.co_id = co_id
            building.bu_id = re.search('buildingInfoID=(.*?)&', rel_url).group(1)
            for attr, pattern in page_fields:
                setattr(building, attr, re.search(pattern, page, flags).group(1))
            building.insert_db()

            house_links = etree.HTML(page).xpath("//td[@style]/a")
            self.ho_parse(co_id, building.bu_id, house_links)
Exemplo n.º 8
0
    def analyzer_comm_url(self, comm_url_list):
        """Scrape community pages, store their records and collect house URLs.

        For each URL one ``Comm`` is inserted from the labelled fields on the
        page. Every table row matched by ``onmouseover ... </TR>`` is then
        inserted as a ``Building``, and each ``href`` in that row is turned
        into an absolute house URL. Failures are printed and the row or page
        is skipped.

        Args:
            comm_url_list: Absolute community page URLs.

        Returns:
            list: Absolute house-list URLs gathered from all pages.
        """
        flags = re.S | re.M

        def labelled(label, html):
            # Value of the first <span> that follows the given label.
            return re.search(label + '.*?">.*?<span.*?>(.*?)</span>', html, flags).group(1)

        all_url = []
        for i in comm_url_list:
            try:
                res = requests.get(i)
                html = res.content.decode('gbk')
                c = Comm(self.co_index)
                c.co_name = labelled('项目名称:', html)  # project name
                c.co_address = labelled('项目地址:', html)  # project address
                c.co_develops = labelled('开发商:', html)  # developer
                c.co_build_size = labelled('总建筑面积:', html)  # building area
                c.co_land_type = labelled('用地依据:', html)  # land-use certificate
                c.co_all_house = labelled('>总套数:', html)  # total units
                c.area = labelled('所在区域:', html)  # district
                c.co_work_pro = labelled('施工许可证:', html)  # construction permit
                c.co_plan_pro = labelled('建设工程规划许可证:', html)  # planning permit
                c.insert_db()

                buildlist = re.findall('onmouseover.*?</TR>', html, flags)
                url_list = []
                for k in buildlist:
                    try:
                        b = Building(self.co_index)
                        build_list = re.findall('<TD.*?>(.*?)</TD>', k, flags)
                        b.co_name = build_list[1]
                        b.bu_num = build_list[2]
                        b.bu_type = build_list[4]
                        b.insert_db()
                        house_url = re.findall('href="(.*?)"', k, flags)
                        for j in house_url:
                            url_list.append('http://www.stfcj.gov.cn/stsite/ProjectList/' + j)
                    except Exception as e:
                        # Uses self.co_index, the same source as the records
                        # above, so the handler cannot itself fail on an
                        # undefined module-level name.
                        print('楼栋错误,co_index={},url={}'.format(self.co_index, i), e)
                all_url.extend(url_list)
            except Exception as e:
                print('小区错误,co_index={},url={}'.format(self.co_index, i), e)
        return all_url
Exemplo n.º 9
0
 def get_build_info(self, build_url_list, co_id):
     """Scrape each building page, store it and crawl its houses.

     Every relative URL is fetched from ``http://www.fjlyfdc.com.cn/``, the
     labelled table cells are copied onto a ``Building`` which is inserted,
     and the house links on the page are passed to ``self.get_house_info``.
     Any failure for a building is printed and that building is skipped.

     Args:
         build_url_list: Relative building URLs containing
             ``buildingInfoID=<id>&``.
         co_id: Community id stored on each building and forwarded to
             ``get_house_info``.
     """
     flags = re.S | re.M

     def cell(label, html):
         # Text of the first <td> that follows the given label.
         return re.search(label + '.*?<td.*?>(.*?)<', html, flags).group(1)

     for i in build_url_list:
         build_url = 'http://www.fjlyfdc.com.cn/' + i
         try:
             build = Building(co_index)
             response = requests.get(build_url, headers=self.headers)
             html = response.text
             build.bu_id = re.search('buildingInfoID=(.*?)&',
                                     build_url).group(1)
             build.co_id = co_id
             build.bo_develops = cell('开发商:', html)
             build.co_name = cell('项目名称:', html)
             build.bu_address = cell('坐落位置:', html)
             build.bu_num = cell('幢号:', html)
             build.co_build_structural = cell('建筑结构:', html)
             build.bu_type = cell('设计用途:', html)
             build.bu_floor = cell('总 层 数:', html)
             build.co_all_size = cell('总面积:', html)
             build.bo_build_start_time = cell('开工日期:', html)
             build.insert_db()
             # Raw string so that \? reaches the regex engine unchanged.
             house_url_list = re.findall(
                 r'href="(/House/HouseInfo\?HouseCenterID=.*?)"', html,
                 flags)
             self.get_house_info(house_url_list, build.bu_id, co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
Exemplo n.º 10
0
    def bu_info(self,bu_list,co_id):
        """Scrape each building page, store it and crawl its houses.

        Every relative URL is fetched from ``http://www.fxfdcw.com/`` and
        decoded as GBK. The labelled fields are copied onto a ``Building``
        which is inserted, and the ``<span title=...>`` elements of the page
        are passed to ``self.ho_info``. A building that fails to download or
        parse is printed and skipped.

        Args:
            bu_list: Relative building URLs containing ``bdid=<digits>``.
            co_id: Community id stored on each building and forwarded to
                ``ho_info``.
        """
        flags = re.S | re.M
        for bu in bu_list:
            try:
                bu_url = 'http://www.fxfdcw.com/' + bu
                res = requests.get(bu_url, headers=self.headers)
                con = res.content.decode('gbk')
                html = etree.HTML(con)
                build = Building(co_index)
                build.co_id = co_id
                # Raw string so that \d reaches the regex engine unchanged.
                build.bu_id = re.search(r'bdid=(\d+)', bu).group(1)
                build.bu_num = re.search(r'楼号.*?">(.*?)</', con, flags).group(1)
                build.bu_address = re.search(r'坐落.*?">(.*?)</', con, flags).group(1)
                build.bu_floor = re.search(r'地上层数.*?">(.*?)</', con, flags).group(1)
                build.bu_build_size = re.search(r'建筑面积.*?wrap">(.*?)</', con, flags).group(1)
                build.bu_all_house = re.search(r'套 数.*?">(.*?)</', con, flags).group(1)
                build.bu_type = re.search(r'用  途.*?wrap">(.*?)</', con, flags).group(1)
                build.insert_db()

                ho_list = html.xpath("//span[@title]")
            except Exception as e:
                print("楼栋信息错误{}".format(e))
                continue
            self.ho_info(ho_list, co_id, build.bu_id)
Exemplo n.º 11
0
    def get_comm_info(self, comm_info):
        """Store one community and its buildings, then crawl their houses.

        ``comm_info`` is an HTML fragment describing one community. Its name,
        address, area and detail link are read from the fragment; developer,
        unit count and total size come from the detail page. The community
        is inserted, then each building row on the detail page that contains
        an enter ("进入") link is followed:

        * If the link carries ``ID=<digits>`` (marked as on-sale in the
          original code), one ``Building`` is inserted and the link is
          passed to ``self.get_house_info``.
        * Otherwise (presale) the page is treated as a permit page: each
          ``onmouseover`` row is inserted as its own ``Building`` and its
          house page is downloaded and passed to ``self.get_house_info``.

        Args:
            comm_info: HTML fragment for one community.

        Returns:
            None. Returns early, after printing a message, when the
            community detail page cannot be requested.
        """
        flags = re.S | re.M
        base_url = "http://www.qyfgj.cn/newys/"

        def session_headers(referer):
            # Headers sent with building and house requests; only the
            # Referer differs between them.
            return {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer': referer,
            }

        co = Comm(co_index)
        co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
        try:
            co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
        except IndexError:
            co.co_address = None
        co.area = re.search('center">(.*?)</td>', comm_info).group(1)
        co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
        co_url = base_url + co_detail_url
        try:
            res = requests.get(co_url, headers=self.headers)
        except Exception as e:
            print("co_index={}小区未请求到".format(co_index), e)
            # Without a response there is nothing to parse; continuing would
            # fail on the unbound ``res``.
            return
        con = res.content.decode('gbk')
        try:
            co.co_develops = re.search(r'开发商名称.*?px;">(.*?)</a', con,
                                       flags).group(1)
            co.co_all_house = re.search(r'总套数.*?">(\d+)&nbsp', con,
                                        flags).group(1)
            co.co_all_size = re.search(r'总面积.*?">(\d+.\d+)&nbsp;m', con,
                                       flags).group(1)
        except AttributeError:
            # re.search returned None for one of the optional fields.
            print("小区无开发商等信息")
        co.insert_db()

        build = re.findall('<tr bgcolor="white">(.*?)</tr>', con, flags)
        if not build:
            print("小区没有楼栋信息")
        build_headers = session_headers(co_url)

        for build_info in build:
            if "进入" not in build_info:
                print("楼栋无链接地址")
                continue

            build_url = base_url + re.search('href="(.*?)"><font',
                                             build_info).group(1)
            ho_headers = session_headers(build_url)
            build_res = requests.get(build_url, headers=build_headers)
            build_con = build_res.content.decode('gbk')

            id_match = re.search(r'ID=(\d+)', build_url)
            if id_match:  # on sale
                bu = Building(co_index)
                bu_id = id_match.group(1)
                bu.bu_id = bu_id
                bu.co_name = co.co_name
                bu.insert_db()
                self.get_house_info(headers=ho_headers,
                                    bu_id=bu_id,
                                    url=build_url)
                continue

            # Presale: fields shared by every building on the permit page.
            bu_type = re.search('用途.*?">(.*?)</td>', build_con,
                                flags).group(1)
            bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>',
                                    build_con, flags).group(1)
            bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>',
                                         build_con, flags).group(1)
            bu_address = re.search('项目座落.*?">(.*?)</td>', build_con,
                                   flags).group(1)
            rows = re.findall('<tr onmouseover(.*?)</tr', build_con, flags)
            for row in rows:
                house_url = base_url + re.search('href="(.*?)"', row).group(1)
                # One fresh record per row, so no object is shared between
                # successive insert_db() calls.
                bu = Building(co_index)
                bu.co_name = co.co_name
                bu.bu_type = bu_type
                bu.bu_pre_sale = bu_pre_sale
                bu.bu_pre_sale_date = bu_pre_sale_date
                bu.bu_address = bu_address
                bu.bu_id = re.search('dbh=(.*?)&', row).group(1)
                bu.bu_num = re.search('<td width="89.*?">(.*?)</',
                                      row).group(1)
                bu.bu_floor = re.search(r'<td width="84.*?">(\d+)</td',
                                        row).group(1)
                bu.insert_db()

                ho_res = requests.get(house_url, headers=ho_headers)
                ho_con = ho_res.content.decode('gbk')
                self.get_house_info(ho_con=ho_con,
                                    headers=session_headers(house_url),
                                    bu_id=bu.bu_id)