Exemplo n.º 1
0
 def get_build_info(self, build_url_list):
     """Crawl each building page and pass its house links on.

     For every entry of ``build_url_list`` (a path relative to
     ``http://www.ndjsj.gov.cn/House/``) a ``Building`` is filled with regex
     extraction rules, handed to ``ProducerListUrl`` through ``to_dict()``,
     and the list returned by ``get_details()`` is forwarded to
     ``self.get_house_info``.

     Any exception raised for one entry is printed together with the URL
     and the loop continues with the next entry.
     """
     for relative_url in build_url_list:
         # Built before the try block so the handler below always has the
         # URL of the entry that actually failed (it used to be assigned
         # inside the try, after Building(), and could be unbound or stale).
         build_url = 'http://www.ndjsj.gov.cn/House/' + relative_url
         try:
             build = Building(co_index)
             # The attributes hold regex rules, not values; they are passed
             # to ProducerListUrl via build.to_dict() below.
             build.co_name = '项目名称:.*?<td.*?>(.*?)<'
             build.bu_num = '幢  号:.*?<td.*?>(.*?)<'
             build.bu_address = '坐落位置:.*?<td.*?>(.*?)<'
             build.co_build_structural = '建筑结构:.*?<td.*?>(.*?)<'
             build.bu_floor = '总 层 数:.*?<td.*?>(.*?)<'
             build.bu_build_size = '总 面 积:.*?<td.*?>(.*?)<'
             # Rule disabled in the original code:
             # build.bu_type = '设计用途:.*?<td.*?>(.*?)<'
             build.bu_all_house = '批准销售:.*?<td.*?>(.*?)<'
             producer = ProducerListUrl(
                 page_url=build_url,
                 request_type='get',
                 encode='utf-8',
                 analyzer_rules_dict=build.to_dict(),
                 current_url_rule='javascript:ShowTitle.*?href="(.*?)"',
                 analyzer_type='regex',
                 headers=self.headers)
             house_url_list = producer.get_details()
             self.get_house_info(house_url_list)
         except Exception as e:
             print('宁德楼栋错误,url={}'.format(build_url), e)
Exemplo n.º 2
0
    def build_info(self, build_url_list):
        """Store every building branch of the given project pages.

        Each entry of ``build_url_list`` is a path relative to
        ``http://ris.szpl.gov.cn/bol/``. The page is fetched, the branch
        links under the ``divShowBranch`` div are followed, and for every
        branch page the building number, project name, building id and
        project id are written to the shared ``Building`` record and
        inserted. The house links of the branch are then forwarded to
        ``self.house_info`` together with both ids.

        Request and parsing errors are not caught here and propagate to
        the caller.
        """
        base_url = "http://ris.szpl.gov.cn/bol/"
        # One record is reused for all branches; every field set below is
        # overwritten on each iteration.
        bu = Building(co_index)

        for build_url in build_url_list:
            res = requests.get(base_url + build_url, headers=self.headers)
            con = etree.HTML(res.text)
            # The predicate was missing its closing "']", which made the
            # expression invalid XPath.
            branch_list = con.xpath("//div[@id='divShowBranch']/a/@href")
            for branch in branch_list:
                response = requests.get(base_url + branch,
                                        headers=self.headers)
                content = etree.HTML(response.text)

                # Evaluate the breadcrumb once: the third link text is the
                # building number, the second the project name.
                address_parts = content.xpath(
                    "//div[@id='curAddress']/a/text()")
                bu.bu_num = address_parts[2]
                bu.co_name = address_parts[1]

                co_info = content.xpath("//form/@action")[0]
                bu.bu_id = bu_id = re.search(r'\?id=(\d+)&',
                                             co_info).group(1)
                bu.co_id = co_id = re.search(r'presellid=(\d+)&',
                                             co_info).group(1)

                bu.insert_db()
                # The first two and the last matched links are skipped.
                house_list = content.xpath(
                    "//div[@id='updatepanel1']//tr[@class='a1']//a/@href"
                )[2:-1]

                self.house_info(house_list, bu_id, co_id)
0
    def get_build_info(self, build_url_list, co_id):
        """Store the buildings listed on each project page, then crawl houses.

        Every entry of ``build_url_list`` is appended to
        ``http://gold.ncfdc.com.cn/`` after removing the ``amp;`` residue.
        The project name and the building table are cut out of the page
        with regexes. Each table row containing a link becomes one
        ``Building`` that is inserted with the supplied ``co_id``. A row
        that fails to parse is reported and skipped. Finally the house
        links found on the page are passed to ``self.get_house_info``.
        """
        flags = re.S | re.M
        for link in build_url_list:

            build_url = 'http://gold.ncfdc.com.cn/' + link.replace('amp;', '')
            page = requests.get(build_url).text

            co_name = re.search('ctl15_proname">(.*?)<', page, flags).group(1)
            table_html = re.search('项目楼栋列表.*?ctl17_fLinks_pDataShow', page, flags).group()
            # Only rows that carry a link describe a building.
            linked_rows = [row for row in re.findall('<tr>.*?</tr>', table_html, flags)
                           if 'href' in row]
            for row in linked_rows:
                try:
                    build = Building(co_index)
                    build.co_name = co_name
                    build.bu_num = re.search(
                        '<tr>.*?<td>.*?<a href=.*?>(.*?)<', row, flags).group(1)
                    build.bu_pre_sale = re.search(
                        'onclick="BinSHouseInfo.*?>(.*?)<', row, flags).group(1)
                    build.bu_pre_sale_date = re.search(
                        'onclick="BinSHouseInfo.*?<td>(.*?)<', row, flags).group(1)
                    build.bu_all_house = re.search(
                        'color:#ec5f00;">(.*?)<', row, flags).group(1)
                    build.bu_id = re.search(
                        "DisplayB_ld&hrefID=(.*?)'", row, flags).group(1)
                    build.co_id = co_id
                    build.insert_db()

                except Exception as e:
                    print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)

            house_url_list = re.findall(
                "</span>.*?</td><td>.*?<a href='(.*?xs.*?)' target=\"_blank\">.*?查看",
                page, flags)

            self.get_house_info(house_url_list)
0
    def comm_info(self, con):
        """Store one community and its buildings; return the room page links.

        ``con`` must offer an ``xpath`` method (the code only calls
        ``con.xpath``). The community fields are read from fixed span ids,
        the community id is the first run of digits in the form action,
        and the record is inserted. Each table row styled
        ``color:#000066;`` is then stored as a building, and the first
        link of every row is collected.

        Returns the list of collected links. A missing node raises
        ``IndexError``; nothing is caught here.
        """
        def first(expr):
            # First node matched by the XPath expression.
            return con.xpath(expr)[0]

        # Community and buildings
        comm = Comm(co_index)

        # Community name
        comm.co_name = first(
            "//span[@id='ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name']/text()")
        # Community id
        action = first("//form[@id='aspnetForm']/@action")
        comm.co_id = re.search(r"\d+", action).group(0)
        # Community address
        comm.co_address = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_item_seat']/text()")
        # Developer
        comm.co_develops = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_enter_name']/text()")
        # Total area
        comm.co_size = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_area']/text()")
        # Building area
        comm.co_build_size = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_item_area']/text()")
        # Completion date
        comm.co_build_end_time = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_item_ew_date']/text()")
        # Land-use planning permit
        comm.co_plan_pro = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_program_pcode']/text()")
        # Construction permit
        comm.co_work_pro = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_jg']/text()")
        # Green-space percentage
        comm.co_green = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_item_green_rate']/text()")
        # Land-use certificate
        comm.co_land_use = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_td']/text()")

        comm.insert_db()

        # A single Building record is reused for every table row.
        build = Building(co_index)
        rows = con.xpath("//tr[@style='color:#000066;']")
        room_list = []
        # Attribute that receives the text of the n-th cell of a row.
        # NOTE(review): 'size' does not follow the bu_* naming used by the
        # other attributes - confirm it is the intended field name.
        cell_fields = ('bu_id', 'bu_num', 'bu_all_house', 'size',
                       'bu_floor', 'bu_pre_sale')
        for row in rows:
            build.co_id = comm.co_id
            build.co_name = comm.co_name
            cells = row.xpath("./td/text()")
            for position, field in enumerate(cell_fields):
                setattr(build, field, cells[position])

            build.insert_db()

            room_list.append(row.xpath("./td/a/@href")[0])

        return room_list
Exemplo n.º 5
0
 def get_build_info(self, build_url_list, co_name):
     """Insert one building per list entry and trigger its house crawl.

     Each entry is split on ``',,'``; the second part becomes the building
     number. The split parts and ``co_name`` are then handed to
     ``self.get_house_info``. Any exception for an entry is printed and
     the remaining entries are still processed.
     """
     for entry in build_url_list:
         try:
             record = Building(co_index)
             parts = entry.split(',,')
             record.bu_num = parts[1]
             record.co_name = co_name
             record.insert_db()
             self.get_house_info(parts, co_name)
         except Exception as err:
             print(err)
0
 def get_build_info(self, build_url_list, co_name):
     """Store the buildings behind each project page and crawl their houses.

     Every entry of ``build_url_list`` is a path relative to
     ``http://www.sxczfdc.com/pubinfo/``. The page is fetched and every
     ``Pub_dtxx.aspx?ProjectBuildingID=...`` link on it is followed. The
     building fields are read from the detail page, the record is
     inserted, and the ``getMoreHouseInfo`` arguments found on that page
     are passed to ``self.get_house_info`` with ``co_name`` and the
     building number.

     Exceptions are printed per detail page and per project page, and
     processing continues.
     """
     base_url = 'http://www.sxczfdc.com/pubinfo/'
     flags = re.S | re.M
     # (Building attribute, pattern) pairs, applied in this order to the
     # detail page; the first match of each pattern is stored.
     detail_rules = (
         ('bu_num', 'BuildingInfo1_lblBuildingName">(.*?)<'),
         ('bu_all_house', 'BuildingInfo1_lblZts">(.*?)<'),
         ('bu_floor', 'BuildingInfo1_lblZcs">(.*?)<'),
         ('bu_build_size', 'BuildingInfo1_lblJzmj">(.*?)<'),
         ('bu_live_size', 'BuildingInfo1_lblZzmj">(.*?)<'),
         ('bu_pre_sale', 'BuildingInfo1_lblYsxkzh">(.*?)<'),
         ('bu_pre_sale_date', 'BuildingInfo1_lblYsxkzfzrq">(.*?)<'),
     )
     for i in build_url_list:
         try:
             # One record per project page, reused for each of its detail
             # pages; all fields below are overwritten before each insert.
             build = Building(co_index)
             build.co_name = co_name
             build_url = base_url + i
             html = requests.get(build_url, headers=self.headers).text
             # Raw strings: "\?" and "\(" are invalid escapes in a normal
             # string literal.
             detail_links = re.findall(
                 r'(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', html, flags)
             for k in detail_links:
                 try:
                     build_url_detail = base_url + k
                     content = requests.get(build_url_detail,
                                            headers=self.headers).text
                     for attr, pattern in detail_rules:
                         # [0] raises IndexError when the field is missing,
                         # which skips this building via the handler below.
                         setattr(build, attr,
                                 re.findall(pattern, content, flags)[0])
                     build.insert_db()
                     house_url_list = re.findall(
                         r"onClick=.getMoreHouseInfo\('(.*?)'\)", content,
                         flags)
                     self.get_house_info(house_url_list, co_name,
                                         build.bu_num)
                 except Exception as e:
                     print(e)
         except Exception as e:
             print(e)
Exemplo n.º 7
0
    def get_comm_info(self, comm_url_list):
        """Store each community page and its buildings, then crawl houses.

        Every entry of ``comm_url_list`` is a path relative to
        ``http://old.newhouse.cnnbfdc.com/``. A page that cannot be
        requested is reported and skipped. The community fields are read
        with regexes and inserted. For each ``window.open('...')`` link on
        the page a ``Building`` is inserted and ``self.get_house_info`` is
        called with the link and the building id.

        Parsing errors (a missing field, mismatched list lengths) are not
        caught and propagate to the caller.
        """
        flags = re.S | re.M
        for comm_path in comm_url_list:
            comm = Comm(co_index)
            comm_url = 'http://old.newhouse.cnnbfdc.com/' + comm_path
            try:
                response = requests.get(comm_url, headers=self.headers)
            except Exception as e:
                print("{}城市无法访问小区{}".format(city, comm_url), e)
                continue

            html = response.text
            con = etree.HTML(html)
            comm.co_id = re.search(r'id=(\d+)', comm_path).group(1)
            comm.co_name = re.findall('项目名称:.*?<span.*?>(.*?)<', html,
                                      flags)[0]
            comm.co_address = re.findall('项目地址:.*?<td.*?>(.*?)<', html,
                                         flags)[0]
            comm.co_develops = re.findall('开发公司:.*?<td.*?>(.*?)<', html,
                                          flags)[0]
            comm.co_pre_sale = re.findall('售证名称:.*?<td.*?>(.*?)<', html,
                                          flags)[0]
            comm.co_build_size = re.findall('纳入网上可售面积:.*?<img.*?>(.*?)<', html,
                                            flags)[0]
            comm.co_all_house = re.findall('纳入网上可售套数:.*?<img.*?>(.*?)<', html,
                                           flags)[0]
            comm.area = re.findall('所在区县:.*?<td.*?>(.*?)<', html,
                                   flags)[0]
            comm.insert_db()

            bu_all_house_list = re.findall(
                'window.open.*?center.*?center.*?>(.*?)<', html, flags)
            bu_url_list = re.findall(r"window\.open\('(.*?)'", html, flags)
            # re.findall returns an empty list instead of raising, so the
            # former try/except around it could never report this case.
            if not bu_url_list:
                print("{}城市{}小区无楼栋".format(city, comm.co_name))
                continue

            # Same for every building of the page, so evaluate it once.
            bu_title_list = con.xpath("//a[@href='#']/@title")
            for index, bu_url in enumerate(bu_url_list):
                build = Building(co_index)
                build.bu_all_house = bu_all_house_list[index]
                build.co_name = comm.co_name
                build.bu_num = bu_title_list[index]
                build.bu_id = re.search(r'key=(\d+)&', bu_url).group(1)
                build.co_id = comm.co_id
                build.insert_db()
                self.get_house_info(bu_url, build.bu_id)
Exemplo n.º 8
0
    def get_build_info(self, build_url_list, bu_pre_sale_list, co_name, co_id):
        """Store each building and all houses linked from its page.

        ``build_url_list`` and ``bu_pre_sale_list`` are read in parallel by
        position. Each URL is relative to ``http://221.2.144.162:8090/``
        and its page is decoded as GBK. The building is inserted first;
        then every house link on the page is fetched and inserted as a
        ``House`` carrying the building's id and number.

        A failing house is reported and skipped; a failing building is
        reported and the next building is processed.
        """
        base_url = 'http://221.2.144.162:8090/'
        flags = re.S | re.M
        # Distinct names for the two indices: the inner loop previously
        # reused the outer loop's "i".
        for build_index, build_path in enumerate(build_url_list):
            try:
                build = Building(co_index)
                build.co_id = co_id

                build.co_name = co_name
                build.bu_pre_sale = bu_pre_sale_list[build_index]
                build.bu_id = re.search(r'lh=(\d+)', build_path).group(1)
                build_url = base_url + build_path
                response = requests.get(build_url, headers=self.headers)
                html = response.content.decode('gbk')
                build.bu_num = re.findall('<font color=white.*?><b>(.*?)<',
                                          html, flags)[0]
                build.bu_address = re.findall('坐落位置:</b>(.*?)<', html,
                                              flags)[0]
                build.insert_db()

                ho_url_list = re.findall('background-.*?href=(.*?) ', html,
                                         flags)
                ho_name_list = re.findall('background-color.*?<a.*?>(.*?)<',
                                          html, flags)
                for house_index, house_path in enumerate(ho_url_list):
                    try:
                        house = House(co_index)
                        house_url = base_url + house_path
                        result = requests.get(
                            house_url,
                            headers=self.headers).content.decode('gbk')
                        house.bu_id = build.bu_id
                        house.co_id = co_id
                        house.ho_type = re.findall(
                            '用&nbsp;&nbsp;&nbsp;途:.*?<td.*?>(.*?)<', result,
                            flags)[0]
                        house.ho_build_size = re.findall(
                            '建筑面积:.*?<td>(.*?)<', result, flags)[0]
                        house.bu_num = build.bu_num
                        house.co_name = co_name
                        # Names and links come from two separate regexes and
                        # are paired by position.
                        house.ho_name = ho_name_list[house_index]
                        house.insert_db()
                    except Exception as e:
                        print("co_index={},房屋信息错误".format(co_index), e)
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
Exemplo n.º 9
0
 def start_crawler(self):
     """Crawl every community linked from the start page and its buildings.

     The start page (module-level ``url``) is fetched and each
     ``a.a_name`` link except ``'#'`` is treated as a community page. A
     ``Comm`` filled with regex rules is handed to ``ProducerListUrl``,
     whose ``get_details()`` result is used as the list of building page
     paths. For each of those a ``Building`` filled with regex rules is
     handed to a second ``ProducerListUrl``.

     Nothing is returned and no exception is caught here.
     """
     site_root = 'http://www.lzfc.com.cn:8080'
     response = requests.get(url)
     html = response.text
     tree = etree.HTML(html)
     all_url = tree.xpath('//a[@class="a_name"]/@href')
     for comm_path in all_url:
         comm = Comm(co_index)
         if comm_path == '#':
             continue
         comm_url = site_root + comm_path
         # The attributes hold regex rules that ProducerListUrl receives
         # through comm.to_dict(). (The duplicate co_name rule was removed.)
         comm.co_name = "cc0.innerHTML='(.*?)'"
         comm.co_address = "cc1.innerHTML='(.*?)'"
         comm.area = "cc2.innerHTML='(.*?)'"
         comm.co_use = "cc4.innerHTML='(.*?)'"
         comm.co_develops = "cc5.innerHTML='(.*?)'"
         comm.co_open_time = "cc6.innerHTML='(.*?)'"
         comm.co_all_house = "cc9.innerHTML='(.*?)'"
         comm.co_build_size = "cc11.innerHTML='(.*?)'"
         comm.co_id = "BaseCode=(.*?)'"
         comm_producer = ProducerListUrl(page_url=comm_url,
                                         request_type='get', encode='gbk',
                                         analyzer_rules_dict=comm.to_dict(),
                                         current_url_rule="queryBuildHerf1.href='(.*?)'",
                                         analyzer_type='regex')
         build_url_list = comm_producer.get_details()
         for build_path in build_url_list:
             build = Building(co_index)
             build_detail_url = site_root + build_path
             build.bu_num = 'onclick=comInfoView.*?center">(.*?)<'
             build.co_use = 'onclick=comInfoView.*?center.*?center">(.*?)<'
             build.bu_pre_sale = 'onclick=comInfoView.*?center.*?center.*?center"><.*?>(.*?)<'
             build.bu_all_house = 'onclick=comInfoView.*?center.*?center.*?center.*?center">(.*?)<'
             build.co_name = 'fontbg_red">(.*?)<'
             build.bu_id = r"onclick=comInfoView\('(.*?)'\)"
             # This request previously repeated the community request
             # (comm_url + comm.to_dict()), so the building URL and rules
             # prepared above were never used.
             # NOTE(review): current_url_rule is kept from the community
             # request - confirm which link pattern the building page needs.
             build_producer = ProducerListUrl(page_url=build_detail_url,
                                              request_type='get', encode='gbk',
                                              analyzer_rules_dict=build.to_dict(),
                                              current_url_rule="queryBuildHerf1.href='(.*?)'",
                                              analyzer_type='regex')
             # The returned list is not consumed; the original discarded
             # it too (it only rebound the name of the list being looped).
             build_producer.get_details()
Exemplo n.º 10
0
    def analyzer_comm_url(self, comm_url_list):
        """Store communities and buildings; return all house page URLs.

        Each URL of ``comm_url_list`` is fetched and decoded as GBK. The
        community fields are read with regexes and inserted. Each
        ``onmouseover...</TR>`` row is stored as a ``Building``, and every
        ``href`` found in a row is prefixed with the ProjectList base URL
        and collected.

        Returns the collected URLs of all communities as one list. A
        failing building row or community is reported and skipped.
        """
        flags = re.S | re.M
        # (Comm attribute, pattern) pairs, applied in this order.
        comm_rules = (
            ('co_name', '项目名称:.*?">.*?<span.*?>(.*?)</span>'),  # project name
            ('co_address', '项目地址:.*?">.*?<span.*?>(.*?)</span>'),  # project address
            ('co_develops', '开发商:.*?">.*?<span.*?>(.*?)</span>'),  # developer
            ('co_build_size', '总建筑面积:.*?">.*?<span.*?>(.*?)</span>'),  # building area
            ('co_land_type', '用地依据:.*?">.*?<span.*?>(.*?)</span>'),  # land-use certificate
            ('co_all_house', '>总套数:.*?">.*?<span.*?>(.*?)</span>'),  # total units
            ('area', '所在区域:.*?">.*?<span.*?>(.*?)</span>'),  # district
            ('co_work_pro', '施工许可证:.*?">.*?<span.*?>(.*?)</span>'),  # construction permit
            ('co_plan_pro', '建设工程规划许可证:.*?">.*?<span.*?>(.*?)</span>'),  # planning permit
        )
        all_url = []
        for i in comm_url_list:
            try:
                res = requests.get(i)
                html = res.content.decode('gbk')
                c = Comm(self.co_index)
                for attr, pattern in comm_rules:
                    setattr(c, attr, re.search(pattern, html, flags).group(1))
                c.insert_db()

                url_list = []
                for k in re.findall('onmouseover.*?</TR>', html, flags):
                    try:
                        b = Building(self.co_index)
                        build_list = re.findall('<TD.*?>(.*?)</TD>', k, flags)
                        b.co_name = build_list[1]
                        b.bu_num = build_list[2]
                        b.bu_type = build_list[4]
                        b.insert_db()
                        for j in re.findall('href="(.*?)"', k, flags):
                            url_list.append('http://www.stfcj.gov.cn/stsite/ProjectList/' + j)
                    except Exception as e:
                        # self.co_index, as used for Comm/Building above; the
                        # bare name co_index is not defined in this method.
                        print('楼栋错误,co_index={},url={}'.format(self.co_index, i), e)
                all_url.extend(url_list)
            except Exception as e:
                print('小区错误,co_index={},url={}'.format(self.co_index, i), e)
        return all_url
Exemplo n.º 11
0
    def get_build_info(self, build_url_list):
        """Store each building page and crawl the houses linked from it.

        Every entry of ``build_url_list`` is a path relative to
        ``http://www.fjnpfdc.com/House/``; the page is decoded as GBK. The
        building fields are taken from the page, the project and building
        ids from the entry itself. After the insert, the ``HouseInfo``
        links of the page are handed to ``self.get_house_info`` with both
        ids. A failing entry is reported and skipped.
        """
        flags = re.S | re.M
        # (Building attribute, pattern) pairs, applied in this order.
        page_rules = (
            ('co_name', "项目名称:.*?<td.*?>(.*?)<"),
            ('bu_num', "幢  号:.*?<td.*?>(.*?)<"),
            ('co_use', "设计用途:.*?<td.*?>(.*?)<"),
            ('co_build_structural', "建筑结构:.*?<td.*?>(.*?)<"),
            ('bu_floor', "总 层 数:.*?<td.*?>(.*?)<"),
            ('bu_build_size', "总 面 积:.*?<td.*?>(.*?)<"),
            ('co_build_end_time', "竣工日期:.*?<td.*?>(.*?)<"),
        )
        for entry in build_url_list:
            try:
                record = Building(co_index)
                page_url = 'http://www.fjnpfdc.com/House/' + entry
                reply = requests.get(page_url, headers=self.headers)
                page = reply.content.decode('gbk')
                for attr, pattern in page_rules:
                    setattr(record, attr,
                            re.search(pattern, page, flags).group(1))

                house_links = re.findall('<a href="(HouseInfo.*?)"', page)
                # Both ids are part of the list entry, not of the page.
                record.co_id = re.search('ProjectId=(.*?)&', entry).group(1)
                record.bu_id = re.search('BuildingId=(.*?)&P', entry).group(1)
                record.insert_db()
                self.get_house_info(house_links, record.bu_id, record.co_id)
            except Exception as err:
                print("co_index={},楼栋{}错误".format(co_index, entry), err)
Exemplo n.º 12
0
 def get_build_info(self, build_url_list, co_id):
     """Store each building page and crawl the houses linked from it.

     Every entry of ``build_url_list`` is a path relative to
     ``http://www.fjlyfdc.com.cn/``. The building id comes from the
     ``buildingInfoID`` query parameter of the URL, the other fields from
     the page. After the insert, the ``/House/HouseInfo?HouseCenterID=``
     links are handed to ``self.get_house_info`` with the building id and
     ``co_id``. A failing entry is reported with its URL and skipped.
     """
     flags = re.S | re.M
     # (Building attribute, pattern) pairs, applied in this order.
     page_rules = (
         ('bo_develops', '开发商:.*?<td.*?>(.*?)<'),
         ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
         ('bu_address', '坐落位置:.*?<td.*?>(.*?)<'),
         ('bu_num', '幢号:.*?<td.*?>(.*?)<'),
         ('co_build_structural', '建筑结构:.*?<td.*?>(.*?)<'),
         ('bu_type', '设计用途:.*?<td.*?>(.*?)<'),
         ('bu_floor', '总 层 数:.*?<td.*?>(.*?)<'),
         ('co_all_size', '总面积:.*?<td.*?>(.*?)<'),
         ('bo_build_start_time', '开工日期:.*?<td.*?>(.*?)<'),
     )
     for i in build_url_list:
         build_url = 'http://www.fjlyfdc.com.cn/' + i
         try:
             build = Building(co_index)
             response = requests.get(build_url, headers=self.headers)
             html = response.text
             build.bu_id = re.search('buildingInfoID=(.*?)&',
                                     build_url).group(1)
             build.co_id = co_id
             for attr, pattern in page_rules:
                 setattr(build, attr,
                         re.search(pattern, html, flags).group(1))
             build.insert_db()
             # Raw string: "\?" is an invalid escape in a normal literal.
             house_url_list = re.findall(
                 r'href="(/House/HouseInfo\?HouseCenterID=.*?)"', html,
                 flags)
             self.get_house_info(house_url_list, build.bu_id, co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
Exemplo n.º 13
0
    def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
        """Parse one building out of ``bu_con`` and insert it.

        ``bu_con`` is the text the building fields are extracted from with
        regexes; ``bu_pre_sale``, ``bo_develops`` and ``bu_co_name`` are
        copied onto the record unchanged.

        Raises ``AttributeError`` when one of the patterns does not match
        (``re.search`` returns ``None``); nothing is caught here.
        """
        flags = re.S | re.M

        def grab(pattern):
            # First capture group of the first match in bu_con.
            return re.search(pattern, bu_con, flags).group(1)

        build = Building(co_index)

        # Raw strings: "\d" is an invalid escape in a normal string literal.
        build.bu_id = grab(r'编号.*?>(\d+)<')
        build.bu_num = grab(r'幢号.*?>(\d+)<')
        build.bu_floor = grab(r'总层数.*?>(\d+)<')
        # NOTE(review): the "." between the digit groups is unescaped, so it
        # matches any character, not only a decimal point - confirm intent.
        build.bu_build_size = grab(r'预售建筑面积.*?>(\d+.\d+)<')
        build.bu_address = grab('楼房坐落.*?;">(.*?)</span')
        # NOTE(review): '住宅建筑面积' is also a substring of
        # '非住宅建筑面积'; if the latter appears first in bu_con this picks
        # up the wrong value - verify against the page layout.
        build.bu_live_size = grab(r'住宅建筑面积.*?>(\d+.\d+)<')
        build.bu_not_live_size = grab('非住宅建筑面积.*?;">(.*?)</span')
        build.bo_build_start_time = grab('开工日期.*?;">(.*?)</span')
        build.bu_all_house = grab(r'总套数.*?>(\d+)<')
        build.bu_pre_sale = bu_pre_sale
        build.bo_develops = bo_develops
        build.co_name = bu_co_name
        build.insert_db()
Exemplo n.º 14
0
    def get_comm_info(self, comm_info):
        """Store one community, its buildings, and trigger the house crawl.

        ``comm_info`` is the text of one list entry. Name, address, area
        and the detail link are read from it. The detail page
        (``http://www.qyfgj.cn/newys/`` + link, GBK) supplies developer and
        totals, after which the community is inserted.

        Every ``<tr bgcolor="white">`` row of the detail page containing
        "进入" is followed:

        * if the building URL carries ``ID=<digits>`` (currently on sale),
          one building is inserted and ``self.get_house_info`` is called
          with the building URL;
        * otherwise (pre-sale) the building page is parsed, one building is
          inserted per ``<tr onmouseover`` row, each row's house page is
          fetched, and ``self.get_house_info`` is called with its content.

        Returns ``None``. If the detail page cannot be requested the error
        is printed and the method returns without inserting anything.
        """
        base_url = "http://www.qyfgj.cn/newys/"
        flags = re.S | re.M

        def headers_for(referer):
            # All follow-up requests send the same User-Agent and session
            # cookie; only the Referer differs.
            return {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer': referer
            }

        co = Comm(co_index)
        co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
        try:
            co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
        except IndexError:
            # Fewer than two matching cells: no address available.
            co.co_address = None
        co.area = re.search('center">(.*?)</td>', comm_info).group(1)
        co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
        co_url = base_url + co_detail_url
        try:
            res = requests.get(co_url, headers=self.headers)
        except Exception as e:
            print("co_index={}小区未请求到".format(co_index), e)
            # Without a response there is nothing left to parse; carrying on
            # used to fail on the unbound "res" right below.
            return
        con = res.content.decode('gbk')
        try:
            co.co_develops = re.search('开发商名称.*?px;">(.*?)</a', con,
                                       flags).group(1)
            co.co_all_house = re.search(r'总套数.*?">(\d+)&nbsp', con,
                                        flags).group(1)
            co.co_all_size = re.search(r'总面积.*?">(\d+.\d+)&nbsp;m', con,
                                       flags).group(1)
        except AttributeError:
            # re.search returned None for one of the fields.
            print("小区无开发商等信息")
        co.insert_db()

        # findall yields an empty list when there are no rows, so no
        # exception handling is needed here.
        build = re.findall('<tr bgcolor="white">(.*?)</tr>', con, flags)
        build_headers = headers_for(co_url)

        for build_info in build:
            if "进入" not in build_info:
                print("楼栋无链接地址")
                continue

            build_url = base_url + re.search('href="(.*?)"><font',
                                             build_info).group(1)
            ho_headers = headers_for(build_url)
            build_res = requests.get(build_url, headers=build_headers)
            build_con = build_res.content.decode('gbk')

            id_match = re.search(r'ID=(\d+)', build_url)
            if id_match:  # currently on sale
                bu = Building(co_index)
                bu_id = id_match.group(1)
                bu.bu_id = bu_id
                bu.co_name = co.co_name
                bu.insert_db()
                self.get_house_info(headers=ho_headers,
                                    bu_id=bu_id,
                                    url=build_url)
                continue

            # pre-sale
            bu = Building(co_index)
            bu.co_name = co.co_name
            bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con,
                                   flags).group(1)
            bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>',
                                       build_con, flags).group(1)
            bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>',
                                            build_con, flags).group(1)
            bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con,
                                      flags).group(1)
            # The same record is inserted once per row; only id, number and
            # floor change, the permit fields above are shared.
            for row in re.findall('<tr onmouseover(.*?)</tr', build_con,
                                  flags):
                house_url = base_url + re.search('href="(.*?)"',
                                                 row).group(1)
                bu.bu_id = re.search('dbh=(.*?)&', row).group(1)
                bu.bu_num = re.search('<td width="89.*?">(.*?)</',
                                      row).group(1)
                bu.bu_floor = re.search(r'<td width="84.*?">(\d+)</td',
                                        row).group(1)
                bu.insert_db()

                ho_res = requests.get(house_url, headers=ho_headers)
                ho_con = ho_res.content.decode('gbk')
                self.get_house_info(ho_con=ho_con,
                                    headers=headers_for(house_url),
                                    bu_id=bu.bu_id)
Exemplo n.º 15
0
    def get_build_url_list(self, url_list):
        """Crawl list pages down to single rooms and store every level.

        For each URL of ``url_list`` (decoded as GBK) every
        ``项目名称...</dl>`` block is stored as a ``Comm`` and the
        module-level ``count`` is incremented and printed. The project page
        (``self.url_source`` + link) supplies the building table; each row
        is stored as a ``Building``. The building page's iframe URL leads,
        via its ``GetData`` variant, to a ``LOGICBUILDING_ID`` whose
        ``GetData.aspx`` response lists the rooms; each ``ROOM_NUMBER``
        is stored as a ``House`` with the whole response as ``info``.

        A failure at any level is printed and only that item is skipped
        (the original skipped it without any output).
        """
        global count
        flags = re.S | re.M
        for i in url_list:
            try:
                list_html = requests.get(i).content.decode('gbk')
                for k in re.findall('项目名称.*?</dl>', list_html, flags):
                    try:
                        c = Comm(self.co_index)
                        # Reused for the buildings and houses below instead
                        # of re-running the same regex on the same block.
                        co_name = re.search('html">(.*?)</a>', k,
                                            flags).group(1)
                        c.co_name = co_name
                        c.co_address = re.search('class="address"(.*?)</dd>',
                                                 k, flags).group(1)
                        c.area = re.search('"city">(.*?)</dd>', k,
                                           flags).group(1)
                        c.co_develops = re.search('"average">(.*?)</dd>', k,
                                                  flags).group(1)
                        c.insert_db()
                        count += 1
                        print(count)

                        project_path = re.search('a href="(.*?)">', k,
                                                 flags).group(1)
                        project_url = self.url_source + project_path
                        project_html = requests.get(
                            project_url).content.decode('gbk')
                        build_info_str = re.search('楼盘表</td>(.*?)合  计',
                                                   project_html,
                                                   flags).group(1)
                        for j in re.findall('<tr.*?</tr>', build_info_str,
                                            flags):
                            try:
                                b = Building(self.co_index)
                                b.co_name = co_name
                                # NOTE(review): this pattern captures the
                                # same link text as bu_num below - confirm
                                # it really is the unit count.
                                b.bu_all_house = re.search(
                                    'absmiddle"  />(.*?)</a>', j,
                                    flags).group(1)
                                bu_num = re.search(
                                    '="absmiddle"  />(.*?)</a></strong></', j,
                                    flags).group(1)
                                b.bu_num = bu_num
                                b.bu_build_size = re.search(
                                    'td class="t_c">.*?td class="t_c">(.*?㎡)</td>',
                                    j, flags).group(1)
                                b.insert_db()

                                building_path = re.search('a href="(.*?)"', j,
                                                          flags).group(1)
                                building_url = self.url_source + building_path
                                building_html = requests.get(
                                    building_url).content.decode('gbk')
                                # The room data sits behind the iframe of the
                                # building page.
                                house_url = self.url_source + re.search(
                                    '<iframe.*?"(.*?)"', building_html,
                                    flags).group(1)
                                logic_house_url = house_url.replace(
                                    'Default', 'GetData')
                                logic_house_html = requests.get(
                                    url=logic_house_url).content.decode()
                                logic_id = re.search(
                                    '<LOGICBUILDING_ID>(.*?)<',
                                    logic_house_html, flags).group(1)
                                final_url = 'http://www.yingtanfdc.com/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID=' + logic_id
                                final_html = requests.get(
                                    url=final_url).content.decode('gbk')
                                for room_number in re.findall(
                                        '<ROOM_NUMBER>(.*?)</ROOM_NUMBER>',
                                        final_html, flags):
                                    try:
                                        h = House(self.co_index)
                                        h.info = final_html
                                        h.ho_name = room_number
                                        h.co_name = co_name
                                        h.bu_num = bu_num
                                        h.insert_db()
                                    except Exception as e:
                                        print('房屋错误,co_index={},url={}'.format(
                                            self.co_index, final_url), e)
                            except Exception as e:
                                print('楼栋错误,co_index={},url={}'.format(
                                    self.co_index, i), e)
                    except Exception as e:
                        print('小区错误,co_index={},url={}'.format(
                            self.co_index, i), e)
            except Exception as e:
                print('小区列表错误,co_index={},url={}'.format(
                    self.co_index, i), e)