示例#1
0
    def get_comm_info(self, comm_url_list):
        """Download each community page, store its fields and follow its links.

        For every relative URL in ``comm_url_list`` the page at
        ``self.url + comm_url`` is requested.  The community id is taken from
        the URL itself, the remaining attributes are extracted from the HTML
        with regular expressions, the record is saved with ``insert_db()``
        and the links found after a ``【`` marker are handed to
        ``self.get_build_info`` together with the community id.

        A request that raises is reported and that URL is skipped.  A page
        that lacks one of the expected fields raises ``AttributeError``
        (``re.search`` returned ``None``); that error is not caught here.
        """
        flags = re.S | re.M
        # (Comm attribute, pattern) pairs; group 1 of each pattern is the value.
        field_patterns = (
            ('co_name', '项目名称.*?Name">(.*?)</span'),
            ('co_develops', '开发商.*?Name">(.*?)</span'),
            ('co_address', '地址.*?Address">(.*?)</span'),
            ('co_build_size', '建筑面积.*?jzmj">(.*?)</span'),
            ('co_type', '项目类型.*?Type">(.*?)</span'),
            ('co_size', '占地面积.*?mzgm">(.*?)</span'),
            ('co_green', '绿化率.*?Jdl">(.*?)</span'),
            ('co_volumetric', '容积率.*?Rjl">(.*?)</span'),
            ('co_build_start_time', '开工日期.*?kgrq">(.*?)</span'),
            ('co_build_end_time', '竣工日期.*?syrq">(.*?)</span'),
        )
        for comm_url in comm_url_list:
            url = self.url + comm_url
            try:
                res = requests.get(url, headers=self.headers)
            except Exception as e:
                print("co_index={},小区信息错误".format(co_index), e)
                continue
            con = res.text
            co = Comm(co_index)
            # Raw string: "\d" inside a plain literal is an invalid escape
            # sequence and triggers a SyntaxWarning on recent Python versions.
            co.co_id = re.search(r'Id=(\d+)', comm_url).group(1)
            for attr, pattern in field_patterns:
                setattr(co, attr, re.search(pattern, con, flags).group(1))

            co.insert_db()
            presell_url_list = re.findall(
                '【<a href="(.*?)" target="_self"', con, flags)
            self.get_build_info(presell_url_list, co.co_id)
示例#2
0
    def comm_info(self, con):
        """Save the community described by ``con`` and each of its buildings.

        ``con`` must offer an ``xpath`` method.  The community attributes are
        read from fixed ``<span>`` ids (the id comes from the digits in the
        form's ``action`` attribute) and saved with ``insert_db()``.  Every
        ``<tr style='color:#000066;'>`` row is then stored as a building.

        Returns:
            list: the first ``./td/a/@href`` value of every building row.

        Raises:
            IndexError: when an expected node or table cell is missing.
        """
        def first_hit(expression):
            # Every field takes the first node matched by its expression.
            return con.xpath(expression)[0]

        comm = Comm(co_index)
        # Community name
        comm.co_name = first_hit(
            "//span[@id='ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name']/text()")
        # The community id is the first run of digits in the form action.
        form_action = first_hit("//form[@id='aspnetForm']/@action")
        comm.co_id = re.search(r"\d+", form_action).group(0)

        span_fields = (
            # address
            ("co_address",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_item_seat']/text()"),
            # developer
            ("co_develops",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_enter_name']/text()"),
            # total area
            ("co_size",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_area']/text()"),
            # building area
            ("co_build_size",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_item_area']/text()"),
            # completion date
            ("co_build_end_time",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_item_ew_date']/text()"),
            # land-use planning permit
            ("co_plan_pro",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_program_pcode']/text()"),
            # construction permit
            ("co_work_pro",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_jg']/text()"),
            # green-space percentage
            ("co_green",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_item_green_rate']/text()"),
            # land-use certificate
            ("co_land_use",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_td']/text()"),
        )
        for attribute, expression in span_fields:
            setattr(comm, attribute, first_hit(expression))

        comm.insert_db()

        # One Building object is reused for all rows; each row overwrites
        # every field below before it is inserted.
        build = Building(co_index)
        cell_attributes = ("bu_id", "bu_num", "bu_all_house", "size",
                           "bu_floor", "bu_pre_sale")
        room_list = []
        for row in con.xpath("//tr[@style='color:#000066;']"):
            build.co_id = comm.co_id
            build.co_name = comm.co_name
            cells = row.xpath("./td/text()")
            for position, attribute in enumerate(cell_attributes):
                setattr(build, attribute, cells[position])

            build.insert_db()

            room_list.append(row.xpath("./td/a/@href")[0])

        return room_list
示例#3
0
    def get_comm_info(self, comm_url_list):
        """Fetch the ``xinxi.html`` page of each community and store it.

        ``"xinxi.html"`` is appended to every entry of ``comm_url_list``; the
        page is downloaded, its fields are extracted and the record is saved
        with ``insert_db()``.  The opening time and the construction
        start/end dates are optional and become ``None`` when absent (both
        dates are cleared if either is missing).  Any other failure for a
        page is reported and that page is skipped.

        A single ``Comm`` object is reused for all pages, as before.
        """
        co = Comm(co_index)
        for url in comm_url_list:
            comm_url = url + "xinxi.html"
            try:
                res = requests.get(comm_url, headers=self.headers)
                con = res.text
                html = etree.HTML(con)

                def li_value(label):
                    # Text between the first ">" after ``label`` and "</li>".
                    return re.search(label + ".*?>(.*?)</li>", con).group(1)

                # Raw string: "\d" in a plain literal is an invalid escape.
                co.co_id = re.search(r'/(\d+)', con).group(1)
                co.co_name = html.xpath("//h1[@class='fl']/a/@title")[0]
                co.co_address = li_value("楼盘地址")
                co.co_all_house = li_value("规划户数")
                co.co_develops = li_value("开 发 商")
                co.area = li_value("片区")
                co.co_type = li_value("项目类型")
                co.co_build_type = li_value("建筑类型")
                co.co_size = li_value("规划面积")
                co.co_build_size = li_value("建筑面积")
                try:
                    co.co_open_time = li_value("开盘时间")
                except AttributeError:
                    # No match: re.search returned None.
                    co.co_open_time = None
                co.co_green = li_value("绿 化 率")
                co.co_volumetric = li_value("容 积 率")
                try:
                    co.co_build_start_time = re.search("开工时间:(.*?)</span>",
                                                       con).group(1)
                    co.co_build_end_time = re.search("竣工时间:(.*?)</span>",
                                                     con).group(1)
                except AttributeError:
                    co.co_build_start_time = None
                    co.co_build_end_time = None

                co.insert_db()
            except Exception as e:
                # Previously a bare ``except`` that also swallowed
                # KeyboardInterrupt/SystemExit and hid the reason for the skip.
                print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
                continue
示例#4
0
 def comm_parse(self, url_list):  # parse community details
     """Consume ``(url, area, type)`` items from ``url_list`` forever.

     Each item is taken with ``url_list.get()``.  The detail page at ``url``
     and a second ``ProjInfo.do`` page built from the community id are
     downloaded, the fields are extracted from both, the record is saved
     with ``insert_db()``, the module-level ``count`` is incremented and
     printed, and ``self.build_parse`` is called with the community id.

     A failing request (either page) is reported and the item is skipped.
     A missing pre-sale licence number is stored as ``None``.  Any other
     missing field raises ``AttributeError`` and ends the loop, as before.
     The loop has no other exit.
     """
     global count
     flags = re.S | re.M

     def nbsp_value(label, text):
         # Value that follows "&nbsp;" in the first tag after ``label``.
         return re.search(label + '.*?>&nbsp;(.*?)<', text, flags).group(1)

     co = Comm(co_index)
     while True:
         # ``co_type`` instead of ``type`` so the builtin is not shadowed.
         url, area, co_type = url_list.get()
         try:
             res = requests.get(url, headers=self.headers)
         except Exception as e:
             print("co_index={},小区详情页无法访问".format(co_index), e)
             continue
         con = res.text
         co.area = area
         co.co_type = co_type
         # Raw string: "\d" in a plain literal is an invalid escape.
         co.co_id = re.search(r'id=(\d+)', url).group(1)
         co.co_develops = nbsp_value('企业名称', con)
         co.co_name = nbsp_value('项目名称', con)
         co.co_address = nbsp_value('项目座落', con)
         co.co_use = nbsp_value('房屋用途', con)
         try:
             co.co_pre_sale = nbsp_value('许可证号', con)
         except AttributeError:
             # No match: re.search returned None.
             co.co_pre_sale = None
         new_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/p/ProjInfo.do?propid=" + co.co_id
         try:
             # Guarded like the first request so one network error does not
             # terminate the whole consumer loop.
             a_res = requests.get(new_url, headers=self.headers)
         except Exception as e:
             print("co_index={},小区详情页无法访问".format(co_index), e)
             continue
         a_con = a_res.text
         co.co_build_size = nbsp_value('建筑面积', a_con)
         co.co_all_house = nbsp_value('销售套数', a_con)
         co.co_green = nbsp_value('绿化率', a_con)
         co.co_build_start_time = nbsp_value('开工日期', a_con)
         co.co_build_end_time = nbsp_value('竣工日期', a_con)
         co.co_volumetric = nbsp_value('容积率', a_con)
         co.insert_db()
         count += 1
         print(count)
         try:
             self.build_parse(co.co_id, )
         except Exception as e:
             print("co_index={},楼栋信息错误".format(co_index), e)
示例#5
0
 def get_comm_info(self, comm_url, co_id):
     """Download one community page, extract its fields and store them.

     Args:
         comm_url: address of the community page.
         co_id: identifier stored on the record as ``co_id``.

     Raises:
         AttributeError: when an expected field is not found in the page
             (``re.search`` returned ``None``).
     """
     flags = re.S | re.M
     comm = Comm(co_index)
     response = requests.get(comm_url, headers=self.headers)
     html = response.text
     comm.co_name = re.search('项目名称:.*?class="left">(.*?)</td>', html, flags).group(1)
     comm.co_develops = re.search('主开发商:.*?<td.*?>(.*?)<', html, flags).group(1)
     comm.co_address = re.search('项目建设地址:.*?<td.*?>(.*?)<', html, flags).group(1)
     # The unit suffix used to be written as a bare "(㎡)", which the regex
     # engine treats as capturing group 1, so group(1) was the unit sign (or
     # the search missed a label with literal parentheses) instead of the
     # value.  The parentheses are now matched literally (ASCII or full-width,
     # optional) and the value is the only group.
     comm.co_build_size = re.search(
         '项目总规划面积[(（]?㎡[)）]?:.*?<td.*?>(.*?)<', html, flags).group(1)
     comm.co_build_start_time = re.search('计划开工日期:.*?<td.*?>(.*?)<', html, flags).group(1)
     comm.co_build_end_time = re.search('计划竣工日期:.*?<td.*?>(.*?)<', html, flags).group(1)
     comm.area = re.search('所属片区:.*?<td.*?>(.*?)<', html, flags).group(1)
     comm.co_size = re.search(
         '占地面积[(（]?㎡[)）]?:.*?<td.*?>(.*?)<', html, flags).group(1)
     comm.co_id = co_id
     comm.insert_db()
示例#6
0
    def comm_info(self, url_list):
        """Store the community behind each URL and crawl its buildings.

        For every entry of ``url_list`` the community id is taken from the
        ``Jh=`` query value, percent-encoded as GBK and used to build the
        ``3.asp`` page address.  That page is fetched through
        ``Proxy_contact``, decoded as GB18030, parsed and saved with
        ``insert_db()``.  On success ``self.build_info`` is called with the
        matching ``4.asp`` address and the community id.  Any failure while
        fetching, parsing or saving is logged and the entry is skipped.
        """
        flags = re.S | re.M
        # (Comm attribute, pattern) pairs; group 1 of each pattern is the value.
        field_patterns = (
            ('co_name', '项目名称.*?">(.*?)<'),
            ('co_all_house', '套数.*?">(.*?)&nbsp'),
            ('co_address', '坐落.*?">(.*?)</'),
            ('co_build_start_time', '开工时间.*?">(.*?)</'),
            ('co_build_end_time', '竣工时间.*?">(.*?)</'),
            ('co_size', '用地面积.*?">(.*?)&nbsp'),
            ('co_build_size', '建筑面积.*?">(.*?)&nbsp'),
            ('co_volumetric', '容积率.*?">(.*?)</'),
            ('co_develops', '开发企业</TD>.*?">(.*?)</TD'),
            ('co_land_use', '土地使用证号.*?">(.*?)</'),
            ('co_plan_useland', '用地规划许可证号.*?">(.*?)</'),
            ('co_plan_project', '工程规划许可证号.*?">(.*?)</'),
            ('co_work_pro', '施工许可证号.*?">(.*?)</'),
        )
        for temp_url in url_list:
            try:
                comm = Comm(co_index)
                # Raw string: "\d" in a plain literal is an invalid escape.
                comm.co_id = re.search(r'Jh=(.*?\d+)', temp_url).group(1)
                parse_url = parse.quote(comm.co_id, encoding='gbk')
                comm_url = 'http://scxx.fgj.wuhan.gov.cn/3.asp?DengJh=' + parse_url
                proxy = Proxy_contact(app_name='wuhan',
                                      method='get',
                                      url=comm_url,
                                      headers=self.headers)
                res = proxy.contact()
                con = res.decode('gb18030')
                for attr, pattern in field_patterns:
                    setattr(comm, attr, re.search(pattern, con, flags).group(1))

                comm.insert_db()
                log.debug('{}插入成功'.format(comm.co_name))
            except Exception as e:
                log.error('小区错误{}'.format(e))
                continue
            # Only the page name changes from 3.asp to 4.asp.  The previous
            # re.sub('3', '4', ...) also rewrote every "3" inside the encoded
            # DengJh value, producing a URL for a different id.
            build_detail = comm_url.replace('/3.asp?', '/4.asp?', 1)
            self.build_info(build_detail, comm.co_id)
示例#7
0
 def get_comm_info(self, comm_list):
     """Store the community for each project code and crawl its buildings.

     For every code in ``comm_list`` the ``ProjectInfo.aspx`` page is
     downloaded and the fields are read from elements with fixed ids.
     ``co_all_house`` is the integer sum of four count fields and
     ``co_size`` the float sum of four area fields.  The record is saved
     with ``insert_db()`` and ``self.get_build_info`` is called with the
     community id.  Any failure for a code is printed together with the
     page URL and the loop moves on to the next code.
     """
     flags = re.S | re.M
     for i in comm_list:
         # Built before the ``try`` so the error message below always refers
         # to the current item (it used to be unbound or stale when the
         # failure happened before this assignment).
         comm_url = 'http://www.lhfdc.gov.cn/templets/lh/aspx/hpms/ProjectInfo.aspx?code=' + str(
             i)
         try:
             comm = Comm(co_index)
             response = requests.get(comm_url, headers=self.headers)
             html = response.text

             def by_id(element_id):
                 # Text that directly follows the opening tag ending in
                 # id="<element_id>">.
                 return re.search('id="' + element_id + '">(.*?)<', html,
                                  flags).group(1)

             comm.co_name = by_id('PROJECT_XMMC1')
             comm.co_address = by_id('PROJECT_XMDZ')
             comm.co_develops = by_id('PROJECT_KFQY_NAME1')
             comm.area = by_id('PROJECT_SZQY')
             comm.co_build_size = by_id('PROJECT_GHZJZMJ')
             comm.co_volumetric = by_id('PROJECT_RJL')
             comm.co_build_start_time = by_id('PROJECT_JHKGRQ')
             comm.co_build_end_time = by_id('PROJECT_JHJGRQ')
             house_all = by_id('lbYsZZTs')
             house_all_a = by_id('lbWsZZTs')
             bus_all = by_id('lbWsSYTs')
             bus_all_a = by_id('lbYsSYTs')
             comm.co_all_house = int(house_all_a) + int(house_all) + int(
                 bus_all) + int(bus_all_a)
             area_size_a = by_id('lbYsZZMj')
             area_size_b = by_id('lbWsZZMj')
             area_size_c = by_id('lbWsSYMj')
             area_size_d = by_id('lbYsSYMj')
             # Explicit left-to-right addition keeps the float result
             # identical to the previous implementation.
             comm.co_size = float(area_size_a) + float(area_size_b) + float(
                 area_size_c) + float(area_size_d)
             comm.co_id = str(i)
             comm.insert_db()
             self.get_build_info(comm.co_id)
         except Exception as e:
             print('小区 错误,co_index={},url={}'.format(co_index, comm_url), e)
示例#8
0
 def get_comm_detail(self, comm_detail_url, co_id):
     """Store one community detail page and hand its building rows onward.

     The page at ``comm_detail_url`` is downloaded, its fields are
     extracted and saved with ``insert_db()``; every ``id="lpan" ... </tr>``
     fragment of the page is then passed to ``self.get_build_info`` along
     with ``co_id``.  Any failure is printed with the URL and swallowed.

     Args:
         comm_detail_url: address of the community detail page.
         co_id: identifier stored on the record and forwarded to
             ``get_build_info``.
     """
     flags = re.S | re.M
     comm = Comm(co_index)
     try:
         response = requests.get(comm_detail_url, headers=self.headers)
         html = response.text
         comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_type = re.search('项目主体性质:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_develops = re.search('主开发商:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_address = re.search('项目建设地址:.*?<td.*?>(.*?)<', html, flags).group(1)
         # The unit suffix used to be written as a bare "(㎡)", which is a
         # capturing group: group(1) was then the unit sign (or the search
         # missed a label with literal parentheses, aborting the whole
         # record).  The parentheses are now matched literally (ASCII or
         # full-width, optional) so group(1) is the value.
         comm.co_all_size = re.search(
             '项目总规划面积[(（]?㎡[)）]?:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_build_start_time = re.search('计划开工日期:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_build_end_time = re.search('计划竣工日期:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_id = co_id
         comm.insert_db()
         build_info_list = re.findall('id="lpan".*?</tr>', html, flags)
         self.get_build_info(build_info_list, co_id)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
示例#9
0
 def get_comm_info(self, comm_url_list):
     """Parse each community page and pass the record to ``get_build_info``.

     Each entry of ``comm_url_list`` is indexed as ``i[0]`` (relative URL
     containing ``DevProjectId=``) and ``i[1]`` (stored as ``area``).  The
     page is downloaded, the fields are read from elements with fixed ids,
     and the populated ``Comm`` object plus the ``Pub_ysxx.aspx`` links
     found on the page are given to ``self.get_build_info``.  This method
     does not call ``insert_db()`` itself.  Any failure for an entry is
     printed and the loop continues.
     """
     flags = re.S | re.M
     # (Comm attribute, pattern) pairs; group 1 of each pattern is the value.
     field_patterns = (
         ('co_name', 'id="ProjectInfo1_lblProjectName">(.*?)<'),
         ('co_address', 'id="ProjectInfo1_lblProjectAddress">(.*?)<'),
         ('co_develops', 'id="ProjectInfo1_lblCorpName">(.*?)<'),
         ('co_type', 'id="ProjectInfo1_lblProjectType">(.*?)<'),
         ('co_size', 'id="ProjectInfo1_lblXmzgm">(.*?)<'),
         ('co_build_start_time', 'id="ProjectInfo1_lblJhkgrq">(.*?)<'),
         ('co_build_size', 'id="ProjectInfo1_lblZjzmj">(.*?)<'),
         ('co_build_end_time', 'id="ProjectInfo1_lblJhjfsyrq">(.*?)<'),
         ('co_volumetric', 'id="ProjectInfo1_lblRjl">(.*?)<'),
         ('co_green', 'id="ProjectInfo1_lblJdl">(.*?)<'),
     )
     for i in comm_url_list:
         # Reset per item: the handler below formats ``comm_url`` and a
         # failure before its assignment used to hit an unbound name (first
         # item) or report the previous item's URL.
         comm_url = None
         try:
             comm = Comm(co_index)
             comm.co_id = re.search('DevProjectId=(.*?)$', i[0]).group(1)
             comm.area = i[1]
             comm_url = 'http://58.51.240.121:8503/' + i[0]
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             for attr, pattern in field_patterns:
                 setattr(comm, attr, re.search(pattern, html, flags).group(1))
             # Raw string: "\." and "\?" in a plain literal are invalid
             # escape sequences.
             build_url_list = re.findall(
                 r'href="(Pub_ysxx\.aspx\?PresellId=.*?)"', html, flags)
             self.get_build_info(build_url_list, comm)
         except Exception as e:
             print('请求错误,co_index={},url={}'.format(co_index, comm_url), e)
示例#10
0
 def get_comm_info(self, comm_url_list):
     """Store the community behind each relative URL and crawl its buildings.

     Every entry of ``comm_url_list`` is appended to the site root, the
     page is downloaded, its fields are extracted and saved with
     ``insert_db()``, and the ``/House/BuildingInfo`` links on the page are
     passed to ``self.get_build_info`` with the community id (the
     ``CaseId=`` value of the URL).  Any failure for an entry is printed
     with its URL and the loop continues.
     """
     flags = re.S | re.M
     # (Comm attribute, pattern) pairs; group 1 of each pattern is the value.
     field_patterns = (
         ('co_develops', '公司名称:.*?<td.*?>(.*?)<'),
         ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
         ('co_pre_sale', '预售许可证:.*?<td.*?>(.*?)<'),
         ('co_address', '项目坐落:.*?<td.*?>(.*?)<'),
         ('co_type', '规划用途:.*?<td.*?>(.*?)<'),
         ('co_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
         ('co_volumetric', '容积率:.*?<td.*?>(.*?)<'),
         ('co_green', '绿地率:.*?<td.*?>(.*?)<'),
         # The label is the construction start date; it was previously
         # stored in ``co_open_time`` (sales opening time).
         ('co_build_start_time', '开工日期:.*?<td.*?>(.*?)<'),
         ('co_build_end_time', '竣工日期:.*?<td.*?>(.*?)<'),
         ('co_all_house', '批准销售:.*?<td.*?>(.*?)<'),
         # Second cell after the same label.
         ('co_all_size', '批准销售:.*?<td.*?<td.*?>(.*?)<'),
     )
     for i in comm_url_list:
         comm_url = 'http://www.fjlyfdc.com.cn/' + i
         try:
             comm = Comm(co_index)
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             for attr, pattern in field_patterns:
                 setattr(comm, attr, re.search(pattern, html, flags).group(1))
             comm.co_id = re.search('CaseId=(.*?)$', comm_url).group(1)
             comm.insert_db()
             # Raw string: "\?" in a plain literal is an invalid escape.
             build_url_list = re.findall(
                 r'href="(/House/BuildingInfo\?buildingInfoID=.*?&amp;caseID=.*?)"',
                 html, flags)
             self.get_build_info(build_url_list, comm.co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
示例#11
0
    def start_crawler(self):
        """Walk the paginated listing and store every community found.

        The page count is obtained from ``AllListUrl``.  For each page the
        listing is POSTed with the form state captured from the previous
        response; every ``//strong/a`` link leads to a page whose
        "楼盘明细" link lists the individual communities.  Each community
        page that names a developer is parsed, saved with ``insert_db()``,
        counted in the module-level ``count`` and passed to
        ``self.build_crawler``.  Community pages without a developer are
        skipped, and a community whose parsing or saving fails is reported
        and skipped.
        """
        global count
        flags = re.S | re.M
        b = AllListUrl(
            first_page_url=self.url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            # Raw string: "\d" in a plain literal is an invalid escape.
            page_count_rule=r'ount">(\d+)</span></b>页',
        )
        page = b.get_page_count()
        list_formdata = {}
        for i in range(1, int(page) + 1):

            response = requests.post(self.url,
                                     data=list_formdata,
                                     headers=self.headers)
            con = etree.HTML(response.text)

            href_list = con.xpath("//strong/a/@href")
            view_state = con.xpath("//input[@id='__VIEWSTATE']/@value")[0]
            valid = con.xpath("//input[@id='__EVENTVALIDATION']/@value")[0]
            # Keep this page's state as the parameters of the next request.
            list_formdata["__VIEWSTATE"] = view_state
            list_formdata["__EVENTVALIDATION"] = valid
            # NOTE(review): the next request asks for index ``i``, i.e. the
            # page number just processed.  If the pager is 1-based this
            # fetches page 1 twice and never reaches the last page -- confirm
            # against the site before changing to ``i + 1``.
            list_formdata[
                "ctl00$ContentPlaceHolder1$PageNavigator_NewHouse1$txtNewPageIndex"] = i
            list_formdata[
                "__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$PageNavigator_NewHouse1$LnkBtnGoto"

            for href in href_list:
                new_url = self.url + href

                res = requests.get(new_url, headers=self.headers)
                listing_con = res.text

                detail_url = re.search('楼盘明细:.*?"(.*?)"', listing_con).group(1)
                detail_url = self.url + detail_url
                detail_res = requests.get(detail_url)
                html = etree.HTML(detail_res.text)
                comm_url_list = html.xpath(
                    "//div[@class='Search_results_box']//td/a/@href")

                for comm_url in comm_url_list:
                    commurl = self.url + comm_url
                    comm_res = requests.get(commurl, headers=self.headers)
                    comm_con = comm_res.text
                    # Test the match object itself: calling .group(1) first
                    # raised AttributeError on a miss, so the old
                    # ``is None`` check could never trigger and one page
                    # without a developer aborted the whole crawl.
                    develops_match = re.search('开发企业.*?">(.*?)</td>',
                                               comm_con, flags)
                    if develops_match is None:
                        continue
                    bo_develops = develops_match.group(1)
                    try:
                        co = Comm(co_index)
                        co.co_name = re.search('<h1>(.*?)<span>', comm_con,
                                               flags).group(1)
                        co.co_develops = bo_develops
                        co.co_id = re.search(r'MID=(\d+)', comm_url).group(1)
                        co.co_use = re.search('规划用途.*?">(.*?)</td>',
                                              comm_con, flags).group(1)
                        co.co_address = re.search('项目坐落.*?">(.*?)</td>',
                                                  comm_con, flags).group(1)
                        co.co_build_start_time = re.search(
                            '开工时间.*?">(.*?)</td>', comm_con,
                            flags).group(1)
                        co.co_build_end_time = re.search(
                            '竣工时间.*?">(.*?)</td>', comm_con,
                            flags).group(1)
                        co.co_size = re.search('土地面积.*?">(.*?)</td>',
                                               comm_con, flags).group(1)
                        co.co_build_size = re.search(
                            '建筑面积.*?">(.*?)</td>', comm_con,
                            flags).group(1)
                        co.co_all_house = re.findall(
                            '#eff6ff">(.*?)</td>', comm_con, flags)[0]
                        co.area = re.search('所属区域.*?">(.*?)</td>',
                                            comm_con, flags).group(1)
                        co.insert_db()
                        count += 1
                        print(count)
                        print(co.co_name)
                    except Exception as e:
                        # Was a bare ``except`` that also swallowed
                        # KeyboardInterrupt and hid the reason for the skip.
                        print("co_index={},小区信息错误".format(co_index), e)
                        continue
                    self.build_crawler(co.co_id, co.co_name, comm_con)