示例#1
0
    def comm_info(self, url):
        """Scrape one community page, store it, and pass its building rows on.

        The page at ``self.start_url + "/" + url`` is fetched and decoded as
        GBK. The basic fields are mandatory: if one of their patterns does not
        match, the resulting ``AttributeError`` propagates to the caller. The
        permit-related fields are optional as a group: if any one of them is
        missing, all of them are stored as ``None``.

        Args:
            url: Relative URL of the community page. It must contain
                ``kfsid=<digits>``, which is used as the community id.
        """
        flags = re.S | re.M
        comm_url = self.start_url + "/" + url
        res = requests.get(comm_url, headers=self.headers)
        res.encoding = 'gbk'
        con = res.text

        co = Comm(co_index)
        co.co_id = re.search(r'kfsid=(\d+)', url).group(1)
        co.co_name = re.search('itemname.*?">(.*?)</font', con).group(1)
        co.co_develops = re.search('开发商名称:.*?px;">(.*?)</a', con, flags).group(1)
        co.co_all_house = re.search('总套数:.*?">(.*?)&nbsp', con, flags).group(1)
        co.co_all_size = re.search('总面积:.*?">(.*?)&nbsp', con, flags).group(1)
        co.co_residential_size = re.search('>住宅面积:.*?">(.*?)&nbsp', con, flags).group(1)
        co.co_address = re.search('项目座落.*?;">(.*?)</', con, flags).group(1)
        co.area = re.search('所在地区.*?">(.*?)</td', con, flags).group(1)
        try:
            co.co_build_size = re.search('建筑面积.*?">(.*?)&nbsp', con, flags).group(1)
            co.co_plan_project = re.search('建设工程规划许可证号.*?">(.*?)<br', con, flags).group(1)
            co.co_land_use = re.search('土地证号.*?">(.*?)<br', con, flags).group(1)
            co.co_work_pro = re.search('建筑工程施工许可证号.*?">(.*?)<br', con, flags).group(1)
            co.co_use = re.search('用途.*?">(.*?)<br', con, flags).group(1)
        except AttributeError:
            # re.search() returned None for one of the optional fields, so the
            # whole optional group is cleared.
            co.co_build_size = None
            co.co_plan_project = None
            co.co_land_use = None
            co.co_work_pro = None
            # The original assigned to the misspelled attribute ``co_us`` here,
            # which left ``co_use`` untouched.
            co.co_use = None

        co.insert_db()
        co_html = etree.HTML(con)
        bu_list = co_html.xpath("//table[@id='preselltable1']/tr[@bgcolor='white']")
        self.build_info(bu_list, co.co_id)
示例#2
0
    def comm_info(self, comm_url_list):
        """Scrape each community page, store it, then follow its presale page.

        For every relative URL the community page is fetched, decoded as GBK,
        parsed into a ``Comm`` record and inserted. The presale listing for
        that community is then fetched and every ``href`` on it is passed to
        ``self.build_info`` together with the community id.

        A community whose page cannot be fetched, parsed or stored is reported
        and skipped. The original code carried on after the error and used
        ``co`` / ``project_name``, which were either unbound or left over from
        the previous community.

        Args:
            comm_url_list: Iterable of relative community URLs, each
                containing ``projectID=<id>``.
        """
        flags = re.S | re.M
        for comm_url in comm_url_list:
            try:
                co_url = 'http://222.77.178.63:7002/' + comm_url
                co_res = requests.get(co_url, headers=self.headers)
                con = co_res.content.decode('gbk')
                co = Comm(co_index)
                co.co_id = re.search('projectID=(.*)', comm_url).group(1)
                co.co_name = re.search('项目名称:.*?">(.*?)</', con, flags).group(1)
                co.area = re.search('所在区县:.*?">(.*?)</', con, flags).group(1)
                co.co_address = re.search('项目地址:.*?">(.*?)</', con, flags).group(1)
                co.co_develops = re.search('企业名称:.*?blank">(.*?)</', con, flags).group(1)
                co.co_all_house = re.search(r'>总套数.*?">(\d+)<', con, flags).group(1)
                co.co_all_size = re.search('>总面积.*?">(.*?)<', con, flags).group(1)
                project_name = parse.quote(co.co_name)
                co.insert_db()
            except Exception as e:
                print('小区信息错误{}'.format(e))
                # Without a valid record there is no id or name to build the
                # presale URL from, so move on to the next community.
                continue

            sale_url = ("http://222.77.178.63:7002/Presell.asp?projectID="
                        + co.co_id + "&projectname=" + project_name)
            res = requests.get(sale_url, headers=self.headers)
            html = etree.HTML(res.content.decode('gbk'))
            temp_url_list = html.xpath("//a/@href")
            self.build_info(co.co_id, temp_url_list)
示例#3
0
 def start_crawler(self):
     """Walk every listing page and store one ``Comm`` per table row.

     The page count is read from the response for the module-level ``url``.
     Each listing page is then fetched, the ``layer-bd tb-style1`` table is
     cut out, and every ``<tr>`` except the first is parsed column by column.
     A row that fails to parse or insert is reported and skipped.
     """
     flags = re.S | re.M
     # (Comm attribute, pattern) pairs, applied to each row in this order.
     column_patterns = (
         ('co_pre_sale', '<td.*?>(.*?)<'),
         ('co_name', '<td.*?<td.*?>(.*?)<'),
         ('co_all_size', '<td.*?<td.*?<td.*?>(.*?)<'),
         ('co_type', '<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<'),
         ('co_pre_sale_date', '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<'),
         ('co_develops',
          '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<'),
     )

     first_page = requests.get(url, headers=self.headers).text
     total_pages = re.search('页数:1/(.*?) ', first_page, flags).group(1)
     for page_no in range(1, int(total_pages) + 1):
         page_url = ('http://newhouse.ntfdc.net/house_certification.aspx?p='
                     + str(page_no))
         listing = requests.get(page_url, headers=self.headers).text
         table_html = re.search('class="layer-bd tb-style1">.*?</table>',
                                listing, flags).group()
         rows = re.findall('<tr>.*?</tr>', table_html, flags)
         # The first <tr> is skipped, as in the original slice [1:].
         for row in rows[1:]:
             try:
                 comm = Comm(co_index)
                 for attr, pattern in column_patterns:
                     setattr(comm, attr,
                             re.search(pattern, row, flags).group(1))
                 comm.insert_db()
             except Exception as e:
                 print('小区错误,co_index={},url={}'.format(co_index, page_url),
                       e)
示例#4
0
    def comm_info(self, comm_url_list):
        """Scrape each community page via the proxy and follow its buildings.

        Every community page is fetched through ``Proxy_contact``, decoded as
        GBK, parsed into a ``Comm`` record and inserted. The building-list page
        (``sales.php?prjid=<id>``) is then fetched and the links under
        ``div.fdxs_left`` are handed to ``self.build_info``.

        The building-list fetch is retried a bounded number of times. The
        original retried forever inside a bare ``except``, which could hang
        the crawler and also swallowed ``KeyboardInterrupt``. If every attempt
        fails, the community's buildings are skipped and logged.

        Args:
            comm_url_list: Iterable of community URLs relative to
                ``http://www.njhouse.com.cn/2016/spf/``.
        """
        flags = re.S | re.M
        max_attempts = 5  # upper bound on fetches of the building-list page
        for temp in comm_url_list:
            comm_url = "http://www.njhouse.com.cn/2016/spf/" + temp
            try:
                co = Proxy_contact(app_name="nanjing",
                                   method='get',
                                   url=comm_url,
                                   headers=self.headers)
                co_res = co.contact()
            except Exception as e:
                log.error("小区页面访问失败{}".format(e))
                continue
            con = co_res.decode('gbk')
            comm = Comm(co_index)
            comm.co_id = re.search(r'prjid=(\d+)" ta', con).group(1)
            comm.co_name = re.search('<h2>(.*?)<em>', con).group(1)
            comm.area = re.search(r"\[.*?'>(.*?)</a>\]", con).group(1)
            comm.co_develops = re.search('开发企业</td>.*?">(.*?)</a', con, flags).group(1)
            comm.co_address = re.search('项目地址.*?<td>(.*?)</td', con, flags).group(1)
            comm.co_open_time = re.search('开盘时间.*?<td>(.*?)</td', con, flags).group(1)
            comm.co_use = re.search('用途.*?<td>(.*?)</td', con, flags).group(1)
            # Stored as the list of all matches, not a single value.
            comm.co_pre_sale = re.findall(r"'_blank'>(\d+)</a>", con)
            comm.co_plan_project = re.search('工程规划.*?span>(.*?)</span', con, flags).group(1)
            comm.co_plan_useland = re.search('用地规划.*?span>(.*?)</span', con, flags).group(1)
            comm.co_work_pro = re.search('施工.*?span>(.*?)</span', con, flags).group(1)
            comm.co_all_house = re.search('入网总套数.*?">(.*?)</td', con, flags).group(1)
            comm.co_all_size = re.search('入网总面积.*?td>(.*?)m', con, flags).group(1)
            comm.insert_db()

            build_temp = ("http://www.njhouse.com.cn/2016/spf/sales.php?prjid="
                          + str(comm.co_id))
            html = None
            for attempt in range(1, max_attempts + 1):
                try:
                    build_proxy = Proxy_contact(app_name="nanjing",
                                                method='get',
                                                url=build_temp,
                                                headers=self.headers)
                    build_temp_con = build_proxy.contact().decode('gbk')
                    html = etree.HTML(build_temp_con)
                    break
                except Exception as e:
                    log.error("楼栋列表页访问失败(第{}次){}".format(attempt, e))
            if html is None:
                # Every attempt failed (or produced no document): skip the
                # buildings of this community rather than looping forever.
                continue
            build_url_list = html.xpath("//div[@class='fdxs_left']/a/@href")
            self.build_info(build_url_list, comm.co_id)
示例#5
0
 def get_comm_info(self, comm_info_list):
     """Store one ``Comm`` per HTML row fragment.

     The first three ``<td>`` cells of each fragment supply the name, the
     house count and the total size. A fragment that fails to parse or
     insert is reported and skipped.

     Args:
         comm_info_list: Iterable of HTML strings, one per table row.
     """
     flags = re.S | re.M
     # (Comm attribute, pattern) pairs, applied in this order.
     cell_patterns = (
         ('co_name', '<td>(.*?)</td>'),
         ('co_all_house', '<td.*?<td>(.*?)</td>'),
         ('co_all_size', '<td.*?<td.*?<td>(.*?)</td>'),
     )
     for row_html in comm_info_list:
         try:
             comm = Comm(co_index)
             for attr, pattern in cell_patterns:
                 setattr(comm, attr,
                         re.search(pattern, row_html, flags).group(1))
             comm.insert_db()
         except Exception as e:
             print('小区错误,co_index={},html_str={}'.format(co_index, row_html), e)
示例#6
0
    def get_comm_info(self, comm_url_list):
        """Scrape each community detail page and follow its building links.

        A page that cannot be fetched is reported and skipped. All fields are
        mandatory: if one of the patterns does not match, the resulting
        ``AttributeError`` propagates to the caller. After the record is
        inserted, the building links found on the page are passed to
        ``self.get_build_info``.

        Errors raised while processing the buildings are reported with their
        cause. The original used a bare ``except`` that reported every
        failure as "no buildings" and also swallowed ``KeyboardInterrupt``.

        Args:
            comm_url_list: Iterable of URL paths relative to
                ``http://xx.yyfdcw.com``.
        """
        flags = re.S | re.M
        for comm_url in comm_url_list:
            comm_detail = "http://xx.yyfdcw.com" + comm_url
            try:
                comm_res = requests.get(comm_detail, headers=self.headers)
            except Exception as e:
                print("co_index={},小区详情页无法访问".format(co_index), e)
                continue
            con = comm_res.text
            comm = Comm(co_index)
            comm.co_id = re.search(r'ID=(\d+)', con).group(1)
            comm.co_name = re.search('lpname">.*?<h2>(.*?)</h2', con, flags).group(1)
            comm.co_develops = re.search('开发商:.*?Kfs">(.*?)</span', con, flags).group(1)
            comm.co_green = re.search('绿化率:.*?Lhl">(.*?)</span', con, flags).group(1)
            comm.area = re.search('区域:.*?Name">(.*?)</span', con, flags).group(1)
            comm.co_address = re.search('位置:</b>(.*?)</li', con, flags).group(1)
            comm.co_build_size = re.search('建筑面积:.*?l5">(.*?)</span', con, flags).group(1)
            comm.co_all_house = re.search('总户数:.*?hs">(.*?)</span', con, flags).group(1)
            comm.co_plan_useland = re.search('用地.*?l4">(.*?)</span', con, flags).group(1)
            comm.co_plan_project = re.search('工程.*?l3">(.*?)</span', con, flags).group(1)
            comm.co_build_type = re.search('楼盘类型.*?Type">(.*?)</span', con, flags).group(1)
            comm.co_all_size = re.search('占地面积.*?mianji">(.*?)</span', con, flags).group(1)
            comm.co_land_use = re.search('使用权证.*?l1">(.*?)</span', con, flags).group(1)
            comm.insert_db()

            build_list = re.findall(
                '<td align="center">.*?<a href="(.*?)"', con, flags)
            if not build_list:
                print("co_index={},小区co_id={}没有楼栋".format(
                    co_index, comm.co_id))
                continue
            try:
                self.get_build_info(build_list, comm.co_id)
            except Exception as e:
                # A failure inside the building crawl is not the same thing
                # as "no buildings"; report it as such, with its cause.
                print("co_index={},小区co_id={}楼栋信息错误".format(
                    co_index, comm.co_id), e)
示例#7
0
 def get_comm_detail(self, comm_detail_url, co_id):
     """Scrape a community detail page, store it, and parse its buildings.

     Any failure (request, missing field, insert, building parsing) is
     reported with the page URL and swallowed, so the caller always gets
     ``None`` back.

     Args:
         comm_detail_url: Absolute URL of the community detail page.
         co_id: Community id assigned to the record and passed on to
             ``self.get_build_info``.
     """
     flags = re.S | re.M
     comm = Comm(co_index)
     try:
         response = requests.get(comm_detail_url, headers=self.headers)
         html = response.text
         comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_type = re.search('项目主体性质:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_develops = re.search('主开发商:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_address = re.search('项目建设地址:.*?<td.*?>(.*?)<', html, flags).group(1)
         # The original pattern contained the unit "(㎡)" with unescaped
         # parentheses, which made it capture group 1; group(1) was then the
         # unit sign (or the search failed) instead of the area value. The
         # unit is now skipped lazily up to the colon so that group(1) is
         # the cell content.
         comm.co_all_size = re.search('项目总规划面积.*?:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_build_start_time = re.search('计划开工日期:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_build_end_time = re.search('计划竣工日期:.*?<td.*?>(.*?)<', html, flags).group(1)
         comm.co_id = co_id
         comm.insert_db()
         build_info_list = re.findall('id="lpan".*?</tr>', html, flags)
         self.get_build_info(build_info_list, co_id)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
示例#8
0
    def comm_info(self, co_id):
        """Scrape one community page, store it, and walk its building pages.

        The detail page for ``co_id`` is parsed into a ``Comm`` record and
        inserted. The ``YSXM.ashx`` endpoint is then queried page by page.
        In each response ``data[0]['TotalNum']`` is read, and every later
        entry's ``YSZID`` is passed to ``self.build_info``.

        Paging stops once ``page * page_size`` reaches ``TotalNum``
        (presumably the total record count; confirm against the endpoint).
        The original compared with ``<`` and therefore requested one extra
        page whenever the total was an exact multiple of the page size.

        Args:
            co_id: Row GUID of the community, used both in the detail URL
                and as the ``YSXMID`` request field.
        """
        flags = re.S | re.M
        comm_url = "http://www.lsjs.gov.cn/WebLSZFGB/LPDetail.aspx?RowGuid=" + co_id
        co_res = requests.get(comm_url, headers=self.headers)
        con = co_res.text
        co = Comm(co_index)
        co.co_name = re.search('楼 盘 名 称:(.*?)<br', con).group(1)
        co.co_id = co_id
        co.area = re.search('所 属 城 区:.*?">(.*?)</span', con).group(1)
        co.co_address = re.search('楼 盘 坐 落:.*?">(.*?)</span', con).group(1)
        co.co_develops = re.search('项 目 公 司:.*?mc">(.*?)</span', con, flags).group(1)
        co.co_pre_sale = re.search('预销售证号.*?">(.*?)</span', con, flags).group(1)
        co.co_all_house = re.search('预售总套数.*?td>(.*?)</td', con, flags).group(1)
        co.co_all_size = re.search('预售总面积.*?td>(.*?)</td', con, flags).group(1)
        co.co_pre_sale_date = re.search('时间.*?">(.*?)</span', con, flags).group(1)
        co.insert_db()

        url = 'http://www.lsjs.gov.cn/WebLSZFGB/Ashx/YSXM.ashx'
        page_size = 5  # sent as PageSize and used for the stop condition
        page = 1
        while True:
            data = {
                "method": "getzxl",
                "PageSize": page_size,
                "CurrentPageIndex": str(page),
                "YSXMID": co_id,
            }
            res = requests.post(url, data=data, headers=self.headers)
            con_dict = json.loads(res.text)
            num = con_dict["data"][0]['TotalNum']
            for info in con_dict["data"][1:]:
                self.build_info(co_id, info["YSZID"])
            if int(num) <= page * page_size:
                break
            page += 1
示例#9
0
 def get_comm_info(self, comm_url):
     """Scrape the ``buildinfo`` page of a community and store it.

     ``buildingdetail`` in the given URL is replaced by ``buildinfo``, the
     page is fetched through ``self.request_proxy`` and decoded as GBK.
     Every extracted value is whitespace-stripped. After the insert, the
     ``buildingid`` taken from the URL and the community id are passed to
     ``self.get_build_info``.

     Args:
         comm_url: Community URL ending in ``buildingid=<id>``.
     """
     flags = re.S | re.M

     def field(pattern):
         # First capture group of ``pattern`` in the page, stripped.
         return re.search(pattern, page, flags).group(1).strip()

     comm = Comm(co_index)
     comm_url = comm_url.replace('buildingdetail', 'buildinfo')
     page = self.request_proxy(comm_url,
                               headers=self.headers).content.decode('gbk')
     comm.co_name = field('class="sf_xq_xmmc">(.*?)<')
     comm.area = field('id="Label_CityArea">(.*?)<')
     comm.co_pre_sale_date = field('class="sf_xq_jfsj">(.*?)<')
     comm.co_build_type = field('id="lbl_JZJG".*?>(.*?)<')
     comm.co_address = field('id="Label_ProjectAdress">(.*?)<')
     comm.co_pre_sale = field('id="Label_SallPreDocuments">(.*?)<')
     comm.co_all_house = field('id="lbl_ZTS".*?>(.*?)<')
     comm.co_build_size = field('id="lbl_JZMJ".*?>(.*?)<')
     comm.co_all_size = field('id="lbl_ZDMJ".*?>(.*?)<')
     comm.co_develops = field('id="Label_DevName">.*?>(.*?)<')
     comm.co_id = field('action=.*?buildingid=(.*?)"')
     comm.insert_db()
     # The id from the URL is passed on unstripped, unlike the page fields.
     url_building_id = re.search('buildingid=(.*?)$', comm_url,
                                 flags).group(1)
     self.get_build_info(url_building_id, comm.co_id)
示例#10
0
 def start_crawler(self):
     """Crawl every district's listing pages and hand each row to
     ``self.get_comm_info``.

     For each id in ``self.area_list`` the district is posted to
     ``complexPro.asp``. The pagination links in the response are followed,
     and every ``<tr valign=...>`` row between ``位置<`` and ``页/共`` is
     parsed into a partially filled ``Comm``. A row that fails is reported
     with the listing page URL and skipped.
     """
     flags = re.S | re.M
     # (Comm attribute, pattern) pairs, applied to each row in this order.
     row_fields = (
         ('co_name', '<a.*?>(.*?)<'),
         ('co_address', '<a.*?<td.*?>(.*?)<'),
         ('co_all_house', '<a.*?<td.*?<td.*?>(.*?)<'),
         ('co_all_size', '<a.*?<td.*?<td.*?<td.*?>(.*?)<'),
         ('area', '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<'),
         ('co_id', 'projectID=(.*?)=='),
     )
     for district_id in self.area_list:
         district_res = requests.post(
             url='http://www.fangdi.com.cn/complexPro.asp',
             data={'districtID': district_id})
         district_html = district_res.content.decode('gbk')
         # Collect the pagination URLs from this district's response.
         page_paths = re.findall('value="(/complexpro.*?)"', district_html,
                                 flags)
         for page_path in page_paths:
             page_url = 'http://www.fangdi.com.cn' + page_path
             page_html = requests.get(
                 page_url, headers=self.headers).content.decode('gbk')
             table_html = re.search('位置<.*?页/共', page_html, flags).group()
             for row in re.findall('<tr valign=.*?</tr>', table_html, flags):
                 try:
                     comm = Comm(co_index)
                     comm_url = re.search('<a href=(.*?)>', row,
                                          flags).group(1)
                     for attr, pattern in row_fields:
                         setattr(comm, attr,
                                 re.search(pattern, row, flags).group(1))
                     self.get_comm_info(comm_url, comm)
                 except Exception as e:
                     print(
                         '小区错误,co_index={},url={}'.format(
                             co_index, page_url), e)
示例#11
0
 def get_comm_info(self, comm_url_list):
     """Scrape each community page, store it, and follow its building links.

     Each page is fetched, its labelled table cells are copied onto a
     ``Comm`` record, and the record is inserted. The community id is the
     ``CaseId`` value at the end of the URL. The ``BuildingInfo`` links
     found on the page are passed to ``self.get_build_info``. Any failure
     for one URL is reported and that URL is skipped.

     Args:
         comm_url_list: Iterable of URLs relative to
             ``http://www.fjlyfdc.com.cn/``.
     """
     flags = re.S | re.M
     # (Comm attribute, pattern) pairs, applied to the page in this order.
     page_fields = (
         ('co_develops', '公司名称:.*?<td.*?>(.*?)<'),
         ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
         ('co_pre_sale', '预售许可证:.*?<td.*?>(.*?)<'),
         ('co_address', '项目坐落:.*?<td.*?>(.*?)<'),
         ('co_type', '规划用途:.*?<td.*?>(.*?)<'),
         ('co_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
         ('co_volumetric', '容积率:.*?<td.*?>(.*?)<'),
         ('co_green', '绿地率:.*?<td.*?>(.*?)<'),
         ('co_open_time', '开工日期:.*?<td.*?>(.*?)<'),
         ('co_build_end_time', '竣工日期:.*?<td.*?>(.*?)<'),
         ('co_all_house', '批准销售:.*?<td.*?>(.*?)<'),
         # Second cell after the same label.
         ('co_all_size', '批准销售:.*?<td.*?<td.*?>(.*?)<'),
     )
     for rel_url in comm_url_list:
         comm_url = 'http://www.fjlyfdc.com.cn/' + rel_url
         try:
             comm = Comm(co_index)
             page = requests.get(comm_url, headers=self.headers).text
             for attr, pattern in page_fields:
                 setattr(comm, attr,
                         re.search(pattern, page, flags).group(1))
             comm.co_id = re.search('CaseId=(.*?)$', comm_url).group(1)
             comm.insert_db()
             building_links = re.findall(
                 'href="(/House/BuildingInfo\?buildingInfoID=.*?&amp;caseID=.*?)"',
                 page, flags)
             self.get_build_info(building_links, comm.co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
示例#12
0
    def get_comm_info(self, comm_res, co_id):
        """Parse an already fetched community page, store it, and pass its
        building ids on.

        All fields are mandatory: if one of the patterns does not match, the
        resulting ``AttributeError`` propagates to the caller. The ids found
        in ``doview('<digits>')`` handlers are passed to
        ``self.get_build_info``.

        Args:
            comm_res: Response object whose ``text`` holds the page HTML.
            co_id: Community id assigned to the record and passed on.
        """
        flags = re.S | re.M

        def first(pattern):
            # First capture group of ``pattern`` in the page text.
            return re.search(pattern, page, flags).group(1)

        comm = Comm(co_index)
        page = comm_res.text
        comm.co_name = first('项目名称.*?">(.*?)<')
        comm.co_id = co_id
        comm.co_address = first('项目地址.*?<td>(.*?)<')
        comm.co_develops = first('开 发 商:.*?<td.*?>(.*?)<')
        comm.co_all_size = first('建设用地面积.*?<td>(.*?)</td>')
        comm.co_size = first('占地面积.*?<td>(.*?)</td>')
        comm.co_build_size = first('项目总建筑面积:.*?<td>(.*?)</td>')
        comm.co_land_use = first('土地使用证号.*?<td>(.*?)<')
        # NOTE(review): the attribute is spelled ``co_plan_pro`` here; confirm
        # against the Comm definition that this is the intended field name.
        comm.co_plan_pro = first('规划许可证号.*?<td>(.*?)<')
        comm.insert_db()

        building_ids = re.findall("onclick=.doview\('(\d+)'\)", page, flags)
        self.get_build_info(building_ids, co_id)
示例#13
0
    def get_comm_info(self, comm_info):
        """Store a community from its listing row, then crawl its buildings.

        Name, address and area come from the listing row ``comm_info``;
        developer, house count and total size come from the detail page and
        are optional. Each ``<tr bgcolor="white">`` row of the detail page
        that contains the text ``进入`` is followed:

        * if the building URL contains ``ID=<digits>`` (current sale), a
          ``Building`` with that id is stored and ``self.get_house_info`` is
          called with the URL;
        * otherwise (presale) the building page is parsed and, for every
          ``<tr onmouseover`` row, a building is stored and the house page is
          fetched and passed to ``self.get_house_info``.

        If the detail page cannot be requested, the error is reported and the
        method returns without storing anything. The original went on to use
        the unbound response and crashed with a ``NameError``.

        Args:
            comm_info: HTML fragment of one community row from the listing.
        """
        flags = re.S | re.M
        base_url = "http://www.qyfgj.cn/newys/"

        def make_headers(referer):
            # Fixed browser identity and session cookie; only the Referer
            # differs between the requests made below.
            return {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer': referer,
            }

        co = Comm(co_index)
        co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
        try:
            co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
        except IndexError:
            co.co_address = None
        co.area = re.search('center">(.*?)</td>', comm_info).group(1)
        co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
        co_url = base_url + co_detail_url
        try:
            res = requests.get(co_url, headers=self.headers)
        except Exception as e:
            print("co_index={}小区未请求到".format(co_index), e)
            return
        con = res.content.decode('gbk')
        try:
            co.co_develops = re.search('开发商名称.*?px;">(.*?)</a', con,
                                       flags).group(1)
            co.co_all_house = re.search(r'总套数.*?">(\d+)&nbsp', con,
                                        flags).group(1)
            co.co_all_size = re.search(r'总面积.*?">(\d+.\d+)&nbsp;m', con,
                                       flags).group(1)
        except AttributeError:
            # One of the optional fields is absent; keep whatever was set.
            print("小区无开发商等信息")
        co.insert_db()

        build = re.findall('<tr bgcolor="white">(.*?)</tr>', con, flags)
        build_headers = make_headers(co_url)

        for build_info in build:
            if "进入" not in build_info:
                print("楼栋无链接地址")
                continue
            build_url = base_url + re.search('href="(.*?)"><font',
                                             build_info).group(1)
            ho_headers = make_headers(build_url)
            build_res = requests.get(build_url, headers=build_headers)
            build_con = build_res.content.decode('gbk')

            id_match = re.search(r'ID=(\d+)', build_url)
            if id_match:  # current sale
                bu = Building(co_index)
                bu_id = id_match.group(1)
                bu.bu_id = bu_id
                bu.co_name = co.co_name
                bu.insert_db()
                self.get_house_info(headers=ho_headers,
                                    bu_id=bu_id,
                                    url=build_url)
                continue

            # presale
            bu = Building(co_index)
            bu.co_name = co.co_name
            bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con,
                                   flags).group(1)
            bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>',
                                       build_con, flags).group(1)
            bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>',
                                            build_con, flags).group(1)
            bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con,
                                      flags).group(1)
            rows = re.findall('<tr onmouseover(.*?)</tr', build_con, flags)
            for row in rows:
                house_url = base_url + re.search('href="(.*?)"', row).group(1)
                # The same Building object is reused for every row; only the
                # per-row fields are overwritten before each insert.
                bu.bu_id = re.search('dbh=(.*?)&', row).group(1)
                bu.bu_num = re.search('<td width="89.*?">(.*?)</',
                                      row).group(1)
                bu.bu_floor = re.search(r'<td width="84.*?">(\d+)</td',
                                        row).group(1)
                bu.insert_db()

                ho_res = requests.get(house_url, headers=ho_headers)
                ho_con = ho_res.content.decode('gbk')
                self.get_house_info(ho_con=ho_con,
                                    headers=make_headers(house_url),
                                    bu_id=bu.bu_id)
示例#14
0
    def comm_parse(self, co_name, co_addr, co_area, co_url):
        """Store a community, then page through its presale and current-sale
        tables.

        The community page is fetched and decoded as GBK. The developer is
        optional; house count, total size and residential size are mandatory.
        The community id is ``co_name`` followed by the ``kfsid`` value from
        the URL.

        Presale rows are paged with ``&ypage=N`` and current-sale rows with
        ``&page=N``, each starting at page 1 and stopping at the first page
        without rows. The original shared one counter between the two loops,
        so current-sale paging started where presale paging stopped and the
        first current-sale pages were skipped. Rows without a link are
        skipped instead of raising ``IndexError``.

        Args:
            co_name: Community name.
            co_addr: Community address.
            co_area: District the community belongs to.
            co_url: URL of the community page; must contain
                ``kfsid=<digits>``.
        """
        flags = re.S | re.M
        co_res = requests.get(co_url, headers=self.headers)
        co_res.encoding = 'gbk'
        con = co_res.text
        co = Comm(co_index)
        developer = re.search('开发商名称.*?;">(.*?)</', con, flags)
        co.co_develops = developer.group(1) if developer else None

        kfsid = re.search(r'kfsid=(\d+)', co_url).group(1)
        co.co_id = co_name + kfsid
        co.co_name = co_name
        co.co_address = co_addr
        co.area = co_area
        co.co_all_house = re.search(r'总套数.*?">(\d+)&nbsp', con,
                                    flags).group(1)
        co.co_all_size = re.search('总面积.*?">(.*?)&nbsp', con,
                                   flags).group(1)
        co.co_residential_size = re.search('住宅面积.*?">(.*?)&nbsp', con,
                                           flags).group(1)
        co.insert_db()

        # Presale paging.
        page = 1
        while True:
            pre_url = co_url + "&ypage=" + str(page)
            pre_res = requests.get(pre_url, headers=self.headers)
            pre_html = etree.HTML(pre_res.content.decode('gbk'))
            pre_list = pre_html.xpath(
                "//table[@id='preselltable1']//tr[@bgcolor='white']")
            if not pre_list:
                break
            page += 1
            for pre in pre_list:
                hrefs = pre.xpath("./td[4]/a/@href")
                if hrefs and 'user_Presell' in hrefs[0]:
                    self.bu_parse(hrefs[0], co.co_id, co_url)

        # Current-sale paging, with its own counter starting at page 1.
        page = 1
        while True:
            sell_url = co_url + "&page=" + str(page)
            sell_res = requests.get(sell_url, headers=self.headers)
            sell_html = etree.HTML(sell_res.content.decode('gbk'))
            sell_list = sell_html.xpath(
                "//table[@id='selltable1']//tr[@bgcolor='white']")
            if not sell_list:
                break
            page += 1
            for sell in sell_list:
                hrefs = sell.xpath("./td/a/@href")
                if not hrefs:
                    continue
                ho_url = hrefs[0]
                if 'user_sell' in ho_url:
                    bu_id = re.search('ID=(.*?)&', ho_url).group(1)
                    self.house_parse(ho_url, co.co_id, bu_id)