Exemplo n.º 1
0
    def co_parse(self, url_list):
        """Scrape every community linked from ``url_list`` plus its buildings.

        For each element the ``href`` attribute is read, the community page
        is downloaded and parsed with regular expressions, and a ``Comm``
        record is inserted. The matching "price" page is then fetched to
        list the building anchors; each anchor (except the first one) becomes
        a ``Building`` record and is passed on to ``self.house_parse``.

        :param url_list: iterable of elements supporting ``xpath("./@href")``.
        """
        for url in url_list:
            try:
                co_url = url.xpath("./@href")[0]
                new_url = "http://tmsf.qzfdcgl.com" + co_url
                co_res = requests.get(new_url, headers=self.headers)
                con = co_res.text
                co = Comm(co_index)
                co.co_id = re.search(r'property_(.*?)_info', co_url).group(1)
                co.co_name = re.search(r'楼盘名称:</span>(.*)', con).group(1)
                co.co_develops = re.search(r'项目公司:</span>(.*)', con).group(1)
                co.co_address = re.search(r'物业地址:</span>(.*?)</p', con, re.S | re.M).group(1)
                co.area = re.search(r'所属城区:</span>(.*)', con).group(1)
                co.insert_db()
                sid = re.search(r'property_(\d+)_', co_url).group(1)
                propertyid = re.search(r'(\d+)_info', co_url).group(1)
                # The building list lives on the sibling "price" page.
                bu_url = new_url.replace('info', 'price')
                res = requests.get(bu_url, headers=self.headers)
                bu_html = etree.HTML(res.text)
                bu_idlist = bu_html.xpath("//dd[@id='building_dd']/a")
            except Exception as e:
                # Skip this community but report why; a bare ``except`` would
                # also swallow KeyboardInterrupt/SystemExit.
                print('co_parse: skipping community:', e)
                continue
            # The first anchor is skipped, exactly as the original code did.
            for bu_ in bu_idlist[1:]:
                anchor_id = bu_.xpath("./@id")[0]
                bu_id = re.search(r'.*?(\d+)', anchor_id).group(1)
                bu = Building(co_index)
                bu.bu_id = bu_id
                bu.co_id = co.co_id
                bu.bu_num = bu_.xpath("./text()")[0]

                bu.insert_db()
                self.house_parse(bu_id, co.co_id, sid, propertyid)
Exemplo n.º 2
0
    def comm_info(self, comm_url_list):
        """Scrape each community page, store it, and follow its presale list.

        For every relative URL the community page is fetched (GBK encoded),
        its fields are extracted with regular expressions and a ``Comm``
        record is inserted. The presale page for that project is then fetched
        and all of its link targets are passed to ``self.build_info``.

        :param comm_url_list: iterable of relative community URLs containing
            a ``projectID=`` query parameter.
        """
        for comm_url in comm_url_list:
            try:
                co_url = 'http://222.77.178.63:7002/' + comm_url
                co_res = requests.get(co_url, headers=self.headers)
                con = co_res.content.decode('gbk')
                co = Comm(co_index)
                co.co_id = re.search(r'projectID=(.*)', comm_url).group(1)
                co.co_name = re.search(r'项目名称:.*?">(.*?)</', con, re.S | re.M).group(1)
                co.area = re.search(r'所在区县:.*?">(.*?)</', con, re.S | re.M).group(1)
                co.co_address = re.search(r'项目地址:.*?">(.*?)</', con, re.S | re.M).group(1)
                co.co_develops = re.search(r'企业名称:.*?blank">(.*?)</', con, re.S | re.M).group(1)
                co.co_all_house = re.search(r'>总套数.*?">(\d+)<', con, re.S | re.M).group(1)
                co.co_all_size = re.search(r'>总面积.*?">(.*?)<', con, re.S | re.M).group(1)
                project_name = parse.quote(co.co_name)
                co.insert_db()
            except Exception as e:
                print('小区信息错误{}'.format(e))
                # Without this, the code below would run with ``co`` and
                # ``project_name`` either unbound (first iteration) or left
                # over from the previous community.
                continue

            sale_url = "http://222.77.178.63:7002/Presell.asp?projectID=" + co.co_id + "&projectname=" + project_name
            res = requests.get(sale_url, headers=self.headers)
            html = etree.HTML(res.content.decode('gbk'))
            temp_url_list = html.xpath("//a/@href")
            self.build_info(co.co_id, temp_url_list)
Exemplo n.º 3
0
 def get_comm_detail(self, comm_list):
     """Fetch each project page, store the community and follow its buildings.

     Each entry of ``comm_list`` is appended to the project base URL. The
     page fields are pulled out with regular expressions, the ``Comm``
     record is inserted, and the ``;;``-separated build info is handed to
     ``self.get_build_info``. The module-level ``count`` is incremented and
     printed per stored community. Any exception is printed and the loop
     moves on.
     """
     global count
     flags = re.S | re.M
     for suffix in comm_list:
         try:
             community = Comm(co_index)
             page = requests.get(
                 'http://house.bffdc.gov.cn/public/project/' + suffix).text

             def first(pattern):
                 # First capture group of ``pattern`` in the current page.
                 return re.search(pattern, page, flags).group(1)

             community.co_name = first('PROJECT_XMMC">(.*?)<')
             community.co_develops = first('PROJECT_KFQY_NAME">(.*?)<')
             community.co_address = first('PROJECT_XMDZ">(.*?)<')
             community.area = first('PROJECT_SZQY">(.*?)<')
             community.co_pre_sale = first('YSXKZH">(.*?)<')
             community.insert_db()
             raw_builds = first('id="buildInfo".*?value="(.*?)"')
             self.get_build_info(raw_builds.split(';;'), community.co_name)
             count += 1
             print(count)
         except Exception as err:
             print(err)
Exemplo n.º 4
0
 def get_comm_detail(self, comm_detail_url):
     """Scrape one community detail page and insert it as a ``Comm`` record.

     The page is fetched from ``www.kmhouse.org`` and decoded as GBK. The
     community id comes from the ``Preid`` query parameter of the URL; all
     other fields come from the page body. On success the module-level
     ``count`` is incremented and printed; on any exception an error line
     with the URL is printed instead.
     """
     global count
     comm_url = 'http://www.kmhouse.org' + comm_detail_url
     try:
         comm = Comm(co_index)
         page = requests.get(comm_url, headers=self.headers).content.decode('gbk')

         def cell(pattern):
             # First capture group of ``pattern`` in the decoded page.
             return re.search(pattern, page, re.S | re.M).group(1)

         # Extract everything first so that a failed match leaves ``comm``
         # untouched, as in the original ordering.
         extracted = {
             'co_id': re.search('Preid=(.*?)&', comm_detail_url).group(1),
             'co_name': cell('楼盘名称.*?<td.*?>(.*?)<'),
             'area': cell('所在地区.*?<td.*?>(.*?)<'),
             'co_address': cell('楼盘地址.*?<td.*?>(.*?)<'),
             'co_pre_sale': cell('预售证号.*?<td.*?>(.*?)<'),
             'co_volumetric': cell('容&nbsp;积&nbsp;率.*?<td.*?>(.*?)<'),
             'co_green': cell('绿&nbsp;化&nbsp;率.*?<td.*?>(.*?)<'),
             'co_build_start_time': cell('开工时间.*?<td.*?>(.*?)<'),
         }
         comm.co_name = extracted['co_name']
         comm.area = extracted['area']
         comm.co_id = extracted['co_id']
         comm.co_address = extracted['co_address']
         comm.co_pre_sale = extracted['co_pre_sale']
         comm.co_volumetric = extracted['co_volumetric']
         comm.co_green = extracted['co_green']
         comm.co_build_start_time = extracted['co_build_start_time']
         comm.insert_db()
         count += 1
         print('count:', count)
     except Exception as err:
         print('小区详情错误,co_index={},url={}'.format(co_index, comm_url), err)
Exemplo n.º 5
0
 def get_comm_detail(self, comm_list):
     """Scrape each community page on ``www.yzfdc.cn`` and follow its buildings.

     Pages are fetched through the session ``self.s``. After the ``Comm``
     record is inserted, the building publicity link found on the page is
     passed to ``self.get_build_info`` together with the community id.
     Failures are printed with the offending URL and the loop continues.
     """
     flags = re.S | re.M
     for suffix in comm_list:
         comm_url = 'http://www.yzfdc.cn/' + suffix
         try:
             comm = Comm(co_index)
             page = self.s.get(comm_url, headers=self.headers).text

             def grab(pattern):
                 # First capture group of ``pattern`` in the current page.
                 return re.search(pattern, page, flags).group(1)

             comm.co_name = grab('class="zxlp_08".*?>(.*?)<')
             comm.co_id = grab('class="zxlp_08" href=.*?ProjectId=(.*?)"')
             comm.co_develops = grab('开 发 商:.*?<span.*?>(.*?)<')
             comm.co_type = grab('项目类型:.*?<span.*?>(.*?)<')
             comm.area = grab('所属区位:.*?<span.*?>(.*?)<')
             comm.co_build_size = grab('建筑面积:.*?<span.*?>(.*?)<')
             comm.co_open_time = grab('开盘日期:.*?<span.*?>(.*?)<')
             comm.co_handed_time = grab('交付日期:.*?<span.*?>(.*?)<')
             comm.co_address = grab('项目具体地址:.*?<span.*?>(.*?)<')
             comm.insert_db()
             building_link = grab(
                 '(/BuildingDish_Publicity.aspx\?Projectid=.*?)"')
             self.get_build_info(building_link, comm.co_id)
         except Exception as err:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), err)
Exemplo n.º 6
0
 def get_comm_detail(self, comm_url):
     """Scrape one community page on ``tz.tmsf.com`` and follow its buildings.

     The page is decoded as UTF-8 and parsed with regular expressions. The
     area is only filled in when the address contains a bracketed prefix.
     After inserting the ``Comm`` record, the ``sid`` value and the link
     next to the home-page bar are passed to ``self.get_build_info``.
     No exception is handled here; a missing field propagates to the caller.
     """
     comm = Comm(co_index)
     page = requests.get('http://tz.tmsf.com' + comm_url,
                         headers=self.headers).content.decode('utf-8')
     flags = re.S | re.M

     def pick(pattern):
         # First capture group of ``pattern`` in the decoded page.
         return re.search(pattern, page, flags).group(1)

     comm.co_name = pick('<span class="buidname colordg">(.*?)<')
     comm.co_address = pick('楼盘地址:.*?<span.*?>(.*?)<')
     if '[' in comm.co_address:
         # The district is embedded in the address as "[district]...".
         bracketed = re.search('\[(.*?)\]', comm.co_address, flags)
         comm.area = bracketed.group(1)
     comm.co_type = pick('物业类型:.*?<span title="(.*?)"')
     comm.co_open_time = pick('最新开盘:</strong>(.*?)<')
     comm.co_develops = pick('项目公司:</strong>(.*?)<')
     comm.co_build_type = pick('建筑形式:</strong>(.*?)<')
     comm.co_id = pick('id="propertyid".*?value="(.*?)"')
     comm.insert_db()
     sid = pick('id="sid" name="sid" value="(.*?)"')
     build_url = pick('id="index_bar">楼盘主页.*?href="(.*?)"')
     self.get_build_info(build_url, comm.co_id, sid)
Exemplo n.º 7
0
 def get_comm_info(self, comm_url_list):
     """Run the regex-driven ``ProducerListUrl`` extractor on each community URL.

     The ``Comm`` attributes are filled with regex *patterns* (not values);
     ``comm.to_dict()`` turns them into the rule dictionary given to
     ``ProducerListUrl``. The value returned by ``get_details()`` is passed
     to ``self.get_build_info``. Exceptions are printed and the loop goes on.
     """
     # (attribute, pattern) pairs, applied in this order.
     field_rules = (
         ('co_id', '楼盘首页.*?aid-(.*?)/'),
         ('co_name', 'class="ls">(.*?)<'),
         ('co_type', '物业类型</em>(.*?)<'),
         ('area', '区域所属:</em>(.*?)<'),
         ('co_green', '绿 化 率:</em>(.*?)<'),
         ('co_volumetric', '容 积 率:</em>(.*?)<'),
         ('co_build_type', '楼&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;层:</em>(.*?)<'),
         ('co_size', '占地面积:</em>(.*?)<'),
         ('co_build_size', '建筑面积:</em>(.*?)<'),
         ('co_develops', '开&nbsp;&nbsp;发&nbsp;&nbsp;商:</em><.*?target="_blank">(.*?)<'),
         ('co_address', '项目地址:</em>(.*?)<'),
     )
     for page_url in comm_url_list:
         try:
             comm = Comm(co_index)
             for attr_name, pattern in field_rules:
                 setattr(comm, attr_name, pattern)
             rules = comm.to_dict()
             producer = ProducerListUrl(
                 page_url=page_url,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=rules,
                 current_url_rule='colspan="3" align="right"><a href="(.*?)"',
                 analyzer_type='regex',
                 headers=self.headers)
             self.get_build_info(producer.get_details())
         except Exception as err:
             print(err)
Exemplo n.º 8
0
 def comm_parse(self, url_list, region):
     """Scrape each community page, store it, and follow its building links.

     :param url_list: iterable of relative URLs containing ``ProjectId=``.
     :param region: value stored unchanged as the community's ``area``.

     No exception is handled here; a missing field propagates to the caller.
     """
     flags = re.S | re.M
     for co_url in url_list:
         comm_res = requests.get("http://110.89.45.7:8082" + co_url,
                                 headers=self.headers)
         con = comm_res.text

         def cell(pattern):
             # First capture group of ``pattern`` in the page text.
             return re.search(pattern, con, flags).group(1)

         co = Comm(co_index)
         co.co_id = re.search('ProjectId=(.*)', co_url).group(1)
         co.co_name = cell('项目名称.*?">(.*?)</td')
         co.co_develops = cell('公司名称.*?">(.*?)</td')
         co.co_address = cell('项目坐落.*?">(.*?)</td')
         co.co_use = cell('规划用途.*?">(.*?)</td')
         co.co_build_size = cell('建筑面积.*?">(.*?)</td')
         co.area = region
         co.co_residential_size = cell('批准销售.*?">.*?</td.*?">(.*?)</td')
         co.co_pre_sale = cell('预售许可证.*?">(.*?)</td')
         co.insert_db()
         building_links = etree.HTML(comm_res.text).xpath("//span/a/@href")
         self.bu_parse(co.co_id, building_links)
Exemplo n.º 9
0
 def get_comm_info(self, comm_url_list):
     """Scrape each community URL, store it, and pass on its build info.

     Fields are taken from the first ``re.findall`` hit of each pattern.
     After the insert the module-level ``count`` is incremented and
     printed, then the raw ``buildInfo`` value is given to
     ``self.get_build_info`` with the community id and the page URL.
     Failures are printed with the URL and the loop continues.
     """
     global count
     flags = re.S | re.M
     for page_url in comm_url_list:
         try:
             page = requests.get(page_url, headers=self.headers).text

             def first(pattern):
                 # First ``findall`` hit; IndexError when nothing matches.
                 return re.findall(pattern, page, flags)[0]

             comm = Comm(co_index)
             comm.co_name = first('PROJECT_XMMC">(.*?)<')
             comm.co_develops = first('PROJECT_KFQY_NAME">(.*?)<')
             comm.co_address = first('PROJECT_XMDZ">(.*?)<')
             comm.area = first('PROJECT_SZQY">(.*?)<')
             comm.co_volumetric = first('PROJECT_RJL">(.*?)<')
             comm.co_build_size = first('PROJECT_GHZJZMJ">(.*?)<')
             comm.co_pre_sale = first('YSXKZH">(.*?)<')
             comm.co_id = first('PROJECT_XMBH">(.*?)<')
             comm.insert_db()
             count += 1
             print(count)
             raw_builds = re.search('id="buildInfo".*?value="(.*?)"', page,
                                    flags).group(1)
             self.get_build_info(raw_builds, comm.co_id, page_url)
         except Exception as err:
             print('小区错误,co_index={},url={}'.format(co_index, page_url), err)
Exemplo n.º 10
0
 def get_comm_info(self, comm_url_list):
     """Run the regex-driven ``ProducerListUrl`` extractor on each detail page.

     Every URL has ``view`` replaced by ``detail``. The ``Comm`` attributes
     are filled with regex *patterns*; ``comm.to_dict()`` supplies them as
     the rule dictionary. Exceptions are printed and the loop continues.
     """
     # (attribute, pattern) pairs, applied in this order.
     field_rules = (
         ('co_type', '物业类型:.*?<dd>(.*?)<'),
         ('area', '区域所属:.*?<dd>(.*?)<'),
         ('co_build_size', '建筑面积:.*?<dd>(.*?)<'),
         ('co_size', '占地面积:.*?<dd>(.*?)<'),
         ('co_green', '绿化率:.*?<dd><.*?>(.*?)<'),
         ('co_build_type', '楼  层:.*?<dd>(.*?)<'),
         ('co_volumetric', '容积率:.*?<dd><.*?>(.*?)<'),
         ('co_id', '楼盘首页.*?newhouse/.*?/(.*?)/'),
         ('co_name', '<h1 class="title">(.*?)<'),
         ('co_address', '楼盘地址:.*?<dd>(.*?)<'),
         ('co_develops', '开发商:.*?<dd(.*?)<'),
     )
     for view_url in comm_url_list:
         try:
             comm = Comm(co_index)
             detail_url = view_url.replace('view', 'detail')
             for attr_name, pattern in field_rules:
                 setattr(comm, attr_name, pattern)
             producer = ProducerListUrl(page_url=detail_url,
                                        request_type='get',
                                        encode='gbk',
                                        analyzer_rules_dict=comm.to_dict(),
                                        analyzer_type='regex',
                                        headers=self.headers)
             producer.get_details()
         except Exception as err:
             print(err)
Exemplo n.º 11
0
    def comm_info(self, url):
        """Scrape one community page, store it, and pass on its building rows.

        The page is fetched from ``self.start_url`` and read as GBK. The
        mandatory fields are extracted first; a missing one raises to the
        caller. The optional fields are extracted as a group: if any of
        them cannot be read, all five are stored as ``None``.

        :param url: relative URL containing a numeric ``kfsid=`` parameter.
        """
        comm_url = self.start_url + "/" + url
        res = requests.get(comm_url, headers=self.headers)
        res.encoding = 'gbk'
        con = res.text
        co = Comm(co_index)
        co.co_id = re.search(r'kfsid=(\d+)', url).group(1)
        co.co_name = re.search(r'itemname.*?">(.*?)</font', con).group(1)
        co.co_develops = re.search(r'开发商名称:.*?px;">(.*?)</a', con, re.S | re.M).group(1)
        co.co_all_house = re.search(r'总套数:.*?">(.*?)&nbsp', con, re.S | re.M).group(1)
        co.co_all_size = re.search(r'总面积:.*?">(.*?)&nbsp', con, re.S | re.M).group(1)
        co.co_residential_size = re.search(r'>住宅面积:.*?">(.*?)&nbsp', con, re.S | re.M).group(1)
        co.co_address = re.search(r'项目座落.*?;">(.*?)</', con, re.S | re.M).group(1)
        co.area = re.search(r'所在地区.*?">(.*?)</td', con, re.S | re.M).group(1)
        try:
            co.co_build_size = re.search(r'建筑面积.*?">(.*?)&nbsp', con, re.S | re.M).group(1)
            co.co_plan_project = re.search(r'建设工程规划许可证号.*?">(.*?)<br', con, re.S | re.M).group(1)
            co.co_land_use = re.search(r'土地证号.*?">(.*?)<br', con, re.S | re.M).group(1)
            co.co_work_pro = re.search(r'建筑工程施工许可证号.*?">(.*?)<br', con, re.S | re.M).group(1)
            co.co_use = re.search(r'用途.*?">(.*?)<br', con, re.S | re.M).group(1)
        except Exception:
            # Optional block: reset all five fields together. The original
            # wrote to ``co_us`` here, so ``co_use`` was never reset.
            co.co_build_size = None
            co.co_plan_project = None
            co.co_land_use = None
            co.co_work_pro = None
            co.co_use = None

        co.insert_db()
        co_html = etree.HTML(con)
        bu_list = co_html.xpath("//table[@id='preselltable1']/tr[@bgcolor='white']")
        self.build_info(bu_list, co.co_id)
Exemplo n.º 12
0
 def get_comm_info(self, comm_url, area):
     """Scrape one community page, store it, and follow its building links.

     ``..`` is removed from ``comm_url`` before fetching through the session
     ``self.s``. The community id is everything after the first ``?`` in the
     cleaned URL. After the insert the module-level ``count`` is
     incremented and printed, and every ``HouseList/HouseInfo.aspx`` link
     on the page is passed to ``self.get_build_url``. Any exception is
     printed.
     """
     global count
     flags = re.S | re.M
     try:
         comm = Comm(co_index)
         comm.area = area.strip()
         comm_url = comm_url.replace('..', '')
         page = self.s.get(comm_url).text

         def first_clean(pattern):
             # First ``findall`` hit with surrounding whitespace removed.
             return re.findall(pattern, page, flags)[0].strip()

         comm.co_name = first_clean('项目名称:.*?<TD.*?><FONT.*?>(.*?)<')
         comm.co_address = first_clean('项目地址:.*?<TD.*?>(.*?)<')
         comm.co_develops = first_clean('开发公司:.*?<TD.*?>(.*?)<')
         comm.co_pre_sale = first_clean('预售证名称:.*?<TD.*?>(.*?)<')
         comm.co_build_size = first_clean('纳入网上可售面积:.*?<TD.*?>(.*?)<')
         comm.co_id = re.search('\?(.*?)$', comm_url).group(1)
         comm.insert_db()
         count += 1
         print(count)
         building_links = re.findall("(HouseList/HouseInfo.aspx\?.*?)'",
                                     page, flags)
         self.get_build_url(building_links, comm.co_id)
     except Exception as err:
         print(err)
Exemplo n.º 13
0
 def baiyin_start(self):
     """Walk every listing page and hand each community row to the detail step.

     The page count comes from ``self.get_all_page()``. Each listing page is
     decoded as GBK; the first centred table row is skipped. For every
     remaining row a ``Comm`` is created with the text of the first cell as
     its area (``None`` when that cell has no text) and passed with the
     row's link to ``self.get_comm_detail``. On failure the row's URL and
     the exception are printed, unless the row has no link at all.
     """
     page = self.get_all_page()
     print(page)
     for page_no in range(1, int(page) + 1):
         listing = requests.get(self.url + '?page=' + str(page_no),
                                headers=self.headers)
         tree = etree.HTML(listing.content.decode('gbk'))
         rows = tree.xpath('//tr[@align="center"]')
         for row in rows[1:]:
             try:
                 comm = Comm(self.CO_INDEX)
                 links = row.xpath('td/a/@href')
                 area_texts = row.xpath('td[1]/text()')
                 area = area_texts[0] if area_texts else None
                 href = links[0]
                 comm.area = area
                 self.get_comm_detail(href, comm)
             except Exception as err:
                 links = row.xpath('td/a/@href')
                 if not links:
                     continue
                 print('小区错误:', self.URL_FRONT + links[0])
                 print(err)
Exemplo n.º 14
0
 def get_comm_info(self, comm_list):
     """Scrape each project page on ``www.ytfcjy.com`` and pass on its build info.

     Fields are taken from the first ``re.findall`` hit of each pattern.
     After the insert the module-level ``count`` is incremented and
     printed, and the list of ``buildInfo`` values found on the page is
     given to ``self.get_build_info``. Exceptions are printed and the loop
     continues.
     """
     global count
     flags = re.S | re.M
     for suffix in comm_list:
         try:
             comm = Comm(co_index)
             page = requests.get('http://www.ytfcjy.com/public/project/' + suffix,
                                 headers=self.headers).text

             def first(pattern):
                 # First ``findall`` hit; IndexError when nothing matches.
                 return re.findall(pattern, page, flags)[0]

             comm.co_name = first('PROJECT_XMMC">(.*?)<')
             comm.co_id = first('ProjectInfo.aspx\?code=(.*?)&')
             comm.co_address = first('PROJECT_XMDZ">(.*?)<')
             comm.co_develops = first('PROJECT_KFQY_NAME">(.*?)<')
             comm.area = first('PROJECT_SZQY">(.*?)<')
             comm.co_volumetric = first('PROJECT_RJL">(.*?)<')
             comm.co_build_size = first('PROJECT_GHZJZMJ">(.*?)<')
             comm.co_pre_sale = first('YSXKZH">(.*?)<')
             comm.co_all_house = first('YSZTS">(.*?)<')
             comm.co_plan_pro = first('id="ghxkzInfo" value=".*?,,(.*?)"')
             comm.co_work_pro = first('id="sgxkzInfo" value=".*?,,(.*?)"')
             comm.co_land_use = first('id="tdzInfo" value=".*?,,(.*?)"')
             comm.insert_db()
             count += 1
             print(count)
             build_values = re.findall('id="buildInfo" value="(.*?)"', page, flags)
             self.get_build_info(build_values, comm.co_id)
         except Exception as err:
             print(err)
Exemplo n.º 15
0
 def get_comm_info(self, comm_url_list):
     """Scrape each community page, store it, and pass on its building data.

     Pages are decoded as GBK. The community id is the numeric ``id=``
     value of the relative URL; the other fields are the first
     ``re.findall`` hit of each pattern. After the insert, the building
     links and the presale cell texts found on the page are passed to
     ``self.get_build_info`` with the community name and id. Failures are
     printed and the loop continues.
     """
     flags = re.S | re.M
     for suffix in comm_url_list:
         try:
             comm = Comm(co_index)
             raw = requests.get('http://221.2.144.162:8090/' + suffix,
                                headers=self.headers)
             page = raw.content.decode('gbk')

             def first(pattern):
                 # First ``findall`` hit; IndexError when nothing matches.
                 return re.findall(pattern, page, flags)[0]

             comm.co_id = re.search('id=(\d+)', suffix).group(1)
             comm.co_name = first('项目名称:.*?<td.*?>(.*?)<')
             comm.co_develops = first('开 发 商:.*?<td.*?>(.*?)<')
             comm.area = first('城 &nbsp;&nbsp;&nbsp;区:.*?<td.*?>(.*?)<')
             comm.co_type = first('物业类型:.*?<td.*?>(.*?)<')
             comm.co_address = first('物业位置:.*?<td.*?>(.*?)<')
             comm.co_build_size = first('建筑面积:.*?<td.*?>(.*?)<')
             comm.insert_db()
             building_links = re.findall("height=20.*?<a href=(.*?) ", page, flags)
             presale_cells = re.findall("height=20.*?<Td>(.*?)<", page, flags)
             self.get_build_info(building_links, presale_cells,
                                 comm.co_name, comm.co_id)
         except Exception as err:
             print("co_index={},小区信息错误".format(co_index), err)
Exemplo n.º 16
0
    def start_crawler(self):
        """Crawl every region's listing pages and store each community found.

        ``self.region`` maps a region code to a region name. For each
        region the page count is read via ``AllListUrl``; every listing
        page is parsed with XPath, each community entry is inserted as a
        ``Comm`` record with the region name as its area, and its detail
        URL is passed to ``self.bu_parse``. No exception is handled here.
        """
        for region_code, region_name in self.region.items():
            url = self.start_url + region_code + '.html'
            pager = AllListUrl(
                first_page_url=url,
                request_method='get',
                analyzer_type='regex',
                encode='utf-8',
                page_count_rule='共(\d+)页>',
            )
            last_page = int(pager.get_page_count())
            for page_no in range(1, last_page + 1):
                listing = requests.get(url + "?page=" + str(page_no),
                                       headers=self.headers)
                tree = etree.HTML(listing.text)
                entries = tree.xpath("//dl[@class='spf_lp_searchlist bg1']")
                for entry in entries:
                    comm = Comm(co_index)
                    co_url = entry.xpath("./dt/h4/a/@href")[0]
                    comm.co_name = entry.xpath("./dt/h4/a/text()")[0]
                    comm.co_address = entry.xpath(".//address/text()")[0]
                    # The first run of digits in the link is the community id.
                    comm.co_id = re.search('\d+', co_url).group(0)
                    comm.co_develops = entry.xpath("./dd[@class='dev']/a/text()")[0]
                    comm.co_plan_pro = entry.xpath("./dt/h4/span/text()")[0]
                    comm.co_type = entry.xpath(".//p/span[2]/text()")[0]
                    comm.area = region_name
                    comm.insert_db()

                    self.bu_parse("http://www.zstmsf.com" + co_url, comm.co_id)
Exemplo n.º 17
0
 def get_comm_info(self, all_url_list):
     """Run the regex-driven ``ProducerListUrl`` extractor on ``all_url_list``.

     The ``Comm`` attributes are filled with regex *patterns*;
     ``c.to_dict()`` supplies them as the rule dictionary. After
     ``get_details()`` the module-level ``count`` is incremented and
     printed. Any exception is printed together with ``all_url_list``.
     """
     global count
     # (attribute, pattern) pairs, applied in this order.
     field_rules = (
         ('co_name', "class='newtopleft font-k'>(.*?)</li>"),
         ('co_id', 'form1" method="post" action="house_base\.aspx\?id=(.*?)"'),
         ('co_address', "项目位置:</li><li class='DetaimidR font-f'>(.*?)</li></ul>"),
         ('area', "地区/商圈:</li><li class='DetaimidR font-f'>(.*?)<"),
         ('co_develops', "开发商:</li><li class='DetaimidR font-f'>(.*?)</li>"),
         ('co_volumetric', "容积率:</li><li class='DetaimidR font-f'>(.*?)<"),
         ('co_green', "绿化率:</li><li class='DetaimidR font-f'>(.*?)<"),
         ('co_all_house', "总户数:</li><li class='DetaimidR font-f'>(.*?)<"),
         ('co_open_time', "开盘时间:</li><li class='DetaimidR font-f'>(.*?)<"),
         ('co_land_use', "国土使用证:</li><li class='DetaimidR font-f'>(.*?)<"),
         ('co_plan_pro', "规划许可证:</li><li class='DetaimidR font-f'>(.*?)<"),
         ('co_build_size', "建筑面积:</li><li class='DetaimidR font-f'>(.*?)<"),
     )
     try:
         c = Comm(co_index)
         for attr_name, pattern in field_rules:
             setattr(c, attr_name, pattern)
         rules = c.to_dict()
         producer = ProducerListUrl(page_url=all_url_list,
                                    request_type='get',
                                    encode='utf-8',
                                    analyzer_rules_dict=rules,
                                    analyzer_type='regex',
                                    headers=self.headers)
         producer.get_details()
         count += 1
         print(count)
     except Exception as err:
         print('小区错误,co_index={},url={}'.format(co_index, all_url_list), err)
Exemplo n.º 18
0
    def start_crawler(self):
        """Crawl the start page, store each linked community, follow its buildings.

        Every link inside the ``post`` div of the start page is treated as a
        community page. Its fields are extracted with regular expressions
        and inserted as a ``Comm`` record; the anchors in the result div are
        then passed to ``self.build_info``. A community that fails to parse
        is logged and skipped.
        """
        res = requests.get(self.start_url, headers=self.headers)
        html = etree.HTML(res.text)
        comm_url_list = html.xpath("//div[@class='post']//a/@href")
        for comm_url in comm_url_list:
            try:
                url = 'http://www.ggsfcw.com/' + comm_url
                comm_res = requests.get(url, headers=self.headers)
                com_html = etree.HTML(comm_res.text)
                comm = Comm(co_index)
                comm.co_name = re.search(r'<h3.*?">(.*?)</',
                                         comm_res.text).group(1)
                comm.co_id = re.search(r'n=(\d+)', comm_res.text).group(1)
                comm.co_address = re.search(r'地址.*?">(.*?)</',
                                            comm_res.text).group(1)
                comm.area = re.search(r'区县.*?">(.*?)</', comm_res.text).group(1)
                comm.co_develops = re.search(r'开发商.*?">(.*?)</',
                                             comm_res.text).group(1)
                comm.co_use = re.search(r'规划用途.*?">(.*?)</',
                                        comm_res.text).group(1)
                comm.insert_db()
            except Exception as e:
                # Put the exception into the message itself: passing it as an
                # extra positional argument makes a logging-style logger try
                # to %-format a message that has no placeholder.
                log.error("小区信息错误{}".format(e))
                continue

            bu_list = com_html.xpath("//div[@id='MainContent_divResult']/a")
            self.build_info(bu_list, comm.co_id)
Exemplo n.º 19
0
 def get_comm_info(self, comm_id_list):
     """Scrape each project page, store the community and all its buildings.

     For every id the project information page is fetched and parsed, the
     ``Comm`` record is inserted, and each row (after the first) of the
     item detail table becomes a ``Building`` record. The building's own
     link is then fetched and the house links found there are passed to
     ``self.get_house_info``.

     Failures are printed and do not stop the crawl: a building error skips
     that building, a community error skips that community.

     :param comm_id_list: iterable of project ids (strings).
     """
     flags = re.S | re.M
     for comm_id in comm_id_list:
         # Pre-bind so the error message below never hits an unbound name.
         comm_url = None
         try:
             comm = Comm(co_index)
             comm_url = 'http://web.xxfdc.gov.cn/onlineQuery/projectInformation.do?xmId=' + comm_id
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html,
                                      flags).group(1)
             comm.co_address = re.search('项目地址:.*?<td.*?>(.*?)<', html,
                                         flags).group(1)
             comm.co_develops = re.search('开发商:.*?<td.*?>(.*?)<', html,
                                          flags).group(1)
             comm.co_all_house = re.search('已售总套数:.*?<td.*?>(.*?)<', html,
                                           flags).group(1)
             comm.co_build_size = re.search('已售总面积:.*?<td.*?>(.*?)<', html,
                                            flags).group(1)
             comm.area = re.search('行政区别:.*?<td.*?>(.*?)<', html,
                                   flags).group(1)
             comm.co_volumetric = re.search('容积率:.*?<td.*?>(.*?)<', html,
                                            flags).group(1)
             comm.co_id = comm_id
             comm.insert_db()
             bu_html = re.search(
                 '<table class="table table-bordered itemInfoDetail.*?</table>',
                 html, flags).group()
             # Drop the first row of the table, as the original code did.
             build_rows = re.findall('<tr>.*?</tr>', bu_html, flags)[1:]
             for row in build_rows:
                 # Pre-bind so the handler can always format its message.
                 house_url = None
                 try:
                     build = Building(co_index)
                     build.bu_num = re.search('<td>(.*?)<', row,
                                              flags).group(1)
                     build.bu_all_house = re.search(
                         '<td>.*?<td>.*?<td>(.*?)<', row, flags).group(1)
                     build.bu_id = re.search('buildId=(.*?)&', row,
                                             flags).group(1)
                     build.co_id = comm.co_id
                     build.insert_db()
                     # Take the link from this row. Searching the whole
                     # table always returned the first building's link.
                     house_url = re.search('<a href="(.*?)"', row,
                                           flags).group(1)
                     house_res = requests.get(house_url,
                                              headers=self.headers)
                     house_url_list = re.findall(
                         '<td width="110">.*?<a.*?href="(.*?)"',
                         house_res.text, flags)
                     self.get_house_info(house_url_list, build.bu_id,
                                         comm.co_id)
                 except Exception as e:
                     print(
                         '楼栋错误,co_index={},url={}'.format(
                             co_index, house_url), e)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Exemplo n.º 20
0
    def comm_info(self, comm_url_list):
        """Scrape each community page via the proxy client and follow its buildings.

        Each relative URL is fetched through ``Proxy_contact``; a failed
        fetch is logged and the community is skipped. The page is decoded as
        GBK, parsed with regular expressions and inserted as a ``Comm``
        record. The sales page for the project is then fetched, retrying
        until it succeeds, and its building links are passed to
        ``self.build_info``.

        A missing field on the community page is not handled here and
        propagates to the caller.

        :param comm_url_list: iterable of relative community URLs.
        """
        for temp in comm_url_list:
            comm_url = "http://www.njhouse.com.cn/2016/spf/" + temp
            try:
                co = Proxy_contact(app_name="nanjing",
                                   method='get',
                                   url=comm_url,
                                   headers=self.headers)
                co_res = co.contact()
            except Exception as e:
                log.error("小区页面访问失败{}".format(e))
                continue
            con = co_res.decode('gbk')
            comm = Comm(co_index)
            comm.co_id = re.search(r'prjid=(\d+)" ta', con).group(1)
            comm.co_name = re.search(r'<h2>(.*?)<em>', con).group(1)
            comm.area = re.search(r"\[.*?'>(.*?)</a>\]", con).group(1)
            comm.co_develops = re.search(r'开发企业</td>.*?">(.*?)</a', con,
                                         re.S | re.M).group(1)
            comm.co_address = re.search(r'项目地址.*?<td>(.*?)</td', con,
                                        re.S | re.M).group(1)
            comm.co_open_time = re.search(r'开盘时间.*?<td>(.*?)</td', con,
                                          re.S | re.M).group(1)
            comm.co_use = re.search(r'用途.*?<td>(.*?)</td', con,
                                    re.S | re.M).group(1)
            # Unlike the other fields this one stores every match as a list.
            comm.co_pre_sale = re.findall(r"'_blank'>(\d+)</a>", con)
            comm.co_plan_project = re.search(r'工程规划.*?span>(.*?)</span', con,
                                             re.S | re.M).group(1)
            comm.co_plan_useland = re.search(r'用地规划.*?span>(.*?)</span', con,
                                             re.S | re.M).group(1)
            comm.co_work_pro = re.search(r'施工.*?span>(.*?)</span', con,
                                         re.S | re.M).group(1)
            comm.co_all_house = re.search(r'入网总套数.*?">(.*?)</td', con,
                                          re.S | re.M).group(1)
            comm.co_all_size = re.search(r'入网总面积.*?td>(.*?)m', con,
                                         re.S | re.M).group(1)
            comm.insert_db()

            build_temp = "http://www.njhouse.com.cn/2016/spf/sales.php?prjid=" + str(
                comm.co_id)
            # Retry until the sales page is fetched and parsed. Catching
            # ``Exception`` instead of using a bare ``except`` keeps the
            # retry but lets KeyboardInterrupt/SystemExit break the loop.
            while True:
                try:
                    build_proxy = Proxy_contact(app_name="nanjing",
                                                method='get',
                                                url=build_temp,
                                                headers=self.headers)
                    build_temp_con = build_proxy.contact()
                    build_temp_con = build_temp_con.decode('gbk')
                    html = etree.HTML(build_temp_con)
                    break
                except Exception:
                    continue
            build_url_list = html.xpath("//div[@class='fdxs_left']/a/@href")
            self.build_info(build_url_list, comm.co_id)
Exemplo n.º 21
0
 def get_comm_detail(self, detail_url, area):
     """Scrape one pre-sale project page and the building rows listed on it.

     The page is fetched with ``requests``, its fields are extracted by
     regular expression into a ``Comm`` record and stored with
     ``insert_db()``.  Each ``<tr>`` inside the ``donglist`` table then
     becomes a ``Building`` record, and ``self.get_house_info`` is called
     with that row's link.

     Errors are printed rather than raised.  A failure while reading the
     project itself ends the call; a failure in one building row only
     skips that row.

     :param detail_url: URL fragment appended to the ``HousePresell`` base
         URL; the project id is taken from its ``FD=`` query value.
     :param area: value stored unchanged in ``comm.area``.
     """
     # Built outside the ``try`` so the error handler can always name the
     # page, even when the very first statement inside the ``try`` fails.
     comm_detail_url = 'http://www.yfci.gov.cn:8080/HousePresell/' + detail_url
     flags = re.S | re.M

     def grab(pattern, text):
         return re.search(pattern, text, flags).group(1)

     try:
         comm = Comm(co_index)
         response = requests.get(comm_detail_url, headers=self.headers)
         html = response.text
         comm.co_develops = grab('id="kfsmc".*?<a.*?>(.*?)<', html)
         comm.co_name = grab('id="PresellName".*?<a.*?>(.*?)<', html)
         comm.co_address = grab('id="HouseRepose".*?>(.*?)<', html)
         comm.co_build_size = grab('id="PresellArea".*?>(.*?)<', html)
         comm.co_all_house = grab('id="djrqtd".*?>(.*?)<', html)
         comm.co_land_use = grab('id="landinfo".*?>(.*?)<', html)
         comm.co_type = grab('id="zczjtd".*?>(.*?)<', html)
         comm.co_pre_sale = grab('id="bookid".*?<a.*?>(.*?)<', html)
         comm.co_pre_sale_date = grab('id="FZDatebegin".*?>(.*?)<', html)
         comm.co_open_time = grab('id="kpdate".*?>(.*?)<', html)
         comm.co_id = grab('FD=(.*?)&', detail_url)
         comm.area = area
         comm.insert_db()
         build_html = re.search('id="donglist".*?</table>', html,
                                flags).group()
         build_info_list = re.findall('<tr.*?</tr>', build_html, flags)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url),
               e)
         return

     for row in build_info_list:
         # Each row is handled on its own so that one malformed row does
         # not discard the buildings that follow it.
         try:
             build = Building(co_index)
             build.co_id = comm.co_id
             # Address, number and floor are read from the 2nd, 3rd and 4th
             # ``<td>`` of the row respectively.
             build.bu_address = grab('<td.*?<td.*?>(.*?)<', row)
             build.bu_num = grab('<td.*?<td.*?<td.*?>(.*?)<', row)
             build.bu_floor = grab('<td.*?<td.*?<td.*?<td.*?>(.*?)<', row)
             house_url = grab('href="(.*?)"', row)
             build.bu_id = grab("LID=(.*?)$", house_url)
             build.insert_db()
             self.get_house_info(house_url, comm.co_id, build.bu_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index,
                                                     comm_detail_url), e)
Exemplo n.º 22
0
    def get_comm_info(self, comm_url_list):
        """Scrape every community detail page and dispatch its buildings.

        Each entry of ``comm_url_list`` is appended to
        ``http://xx.yyfdcw.com`` and fetched with ``requests``.  The page is
        parsed by regular expression into a ``Comm`` record that is stored
        with ``insert_db()``; the building links found on the same page are
        then passed to ``self.get_build_info`` with the community id.

        Problems are printed and the loop moves on to the next URL: a page
        that cannot be fetched, a page missing an expected field, a
        community without building links, and a failure inside
        ``get_build_info`` are each reported with their own message.

        :param comm_url_list: iterable of URL paths relative to the site root.
        """
        flags = re.S | re.M

        def grab(pattern, text, re_flags=flags):
            # Raises AttributeError when the pattern is absent from ``text``.
            return re.search(pattern, text, re_flags).group(1)

        for comm_url in comm_url_list:
            comm_detail = "http://xx.yyfdcw.com" + comm_url
            try:
                comm_res = requests.get(comm_detail, headers=self.headers)
            except Exception as e:
                print("co_index={},小区详情页无法访问".format(co_index), e)
                continue
            con = comm_res.text
            comm = Comm(co_index)
            try:
                comm.co_id = grab(r'ID=(\d+)', con, 0)
                comm.co_name = grab('lpname">.*?<h2>(.*?)</h2', con)
                comm.co_develops = grab('开发商:.*?Kfs">(.*?)</span', con)
                comm.co_green = grab('绿化率:.*?Lhl">(.*?)</span', con)
                comm.area = grab('区域:.*?Name">(.*?)</span', con)
                comm.co_address = grab('位置:</b>(.*?)</li', con)
                comm.co_build_size = grab('建筑面积:.*?l5">(.*?)</span', con)
                comm.co_all_house = grab('总户数:.*?hs">(.*?)</span', con)
                comm.co_plan_useland = grab('用地.*?l4">(.*?)</span', con)
                comm.co_plan_project = grab('工程.*?l3">(.*?)</span', con)
                comm.co_build_type = grab('楼盘类型.*?Type">(.*?)</span', con)
                comm.co_all_size = grab('占地面积.*?mianji">(.*?)</span', con)
                comm.co_land_use = grab('使用权证.*?l1">(.*?)</span', con)
            except AttributeError as e:
                # A missing field used to abort the whole loop.
                print("co_index={},小区详情页解析失败,url={}".format(
                    co_index, comm_detail), e)
                continue

            comm.insert_db()

            build_list = re.findall(
                '<td align="center">.*?<a href="(.*?)"', con, flags)
            if not build_list:
                print("co_index={},小区co_id={}没有楼栋".format(
                    co_index, comm.co_id))
                continue
            try:
                self.get_build_info(build_list, comm.co_id)
            except Exception as e:
                # Previously reported as "no buildings" with the cause hidden.
                print("co_index={},小区co_id={}楼栋信息错误".format(
                    co_index, comm.co_id), e)
Exemplo n.º 23
0
 def comm_parse(self, url_list):
     """Parse community information for every item taken from a queue.

     Loops forever, blocking on ``url_list.get()`` for the next
     ``(url, area, type)`` tuple.  For each item the detail page and the
     matching ``ProjInfo.do`` page are fetched, their ``&nbsp;``-prefixed
     values are extracted into a fresh ``Comm`` record, the record is stored
     with ``insert_db()``, the module-level ``count`` is incremented and
     printed, and ``self.build_parse`` is called with the community id.

     A fetch or parse failure for one item is printed and that item is
     skipped; it no longer terminates the loop.

     :param url_list: queue-like object whose ``get()`` returns
         ``(url, area, type)``.
     """
     global count
     flags = re.S | re.M

     def labelled(label, text):
         # Value that follows ``label`` and a ``>&nbsp;`` marker.
         return re.search(label + '.*?>&nbsp;(.*?)<', text, flags).group(1)

     while True:
         # ``co_type`` rather than ``type`` so the builtin is not shadowed.
         url, area, co_type = url_list.get()
         try:
             res = requests.get(url, headers=self.headers)
         except Exception as e:
             print("co_index={},小区详情页无法访问".format(co_index), e)
             continue

         # A new record per item, so no field can leak from an earlier one.
         co = Comm(co_index)
         try:
             con = res.text
             co.area = area
             co.co_type = co_type
             co.co_id = re.search(r'id=(\d+)', url).group(1)
             co.co_develops = labelled('企业名称', con)
             co.co_name = labelled('项目名称', con)
             co.co_address = labelled('项目座落', con)
             co.co_use = labelled('房屋用途', con)
             # The permit number is optional; store None when it is absent.
             pre_sale = re.search('许可证号.*?>&nbsp;(.*?)<', con, flags)
             co.co_pre_sale = pre_sale.group(1) if pre_sale else None
             new_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/p/ProjInfo.do?propid=" + co.co_id
             a_con = requests.get(new_url, headers=self.headers).text
             co.co_build_size = labelled('建筑面积', a_con)
             co.co_all_house = labelled('销售套数', a_con)
             co.co_green = labelled('绿化率', a_con)
             co.co_build_start_time = labelled('开工日期', a_con)
             co.co_build_end_time = labelled('竣工日期', a_con)
             co.co_volumetric = labelled('容积率', a_con)
         except Exception as e:
             print("co_index={},小区信息错误,url={}".format(co_index, url), e)
             continue

         co.insert_db()
         count += 1
         print(count)
         try:
             self.build_parse(co.co_id)
         except Exception as e:
             print("co_index={},楼栋信息错误".format(co_index), e)
Exemplo n.º 24
0
 def get_comm_info(self, comm_url, co_id):
     """Fetch one project page and store it as a ``Comm`` record.

     The page at ``comm_url`` is downloaded with ``requests`` and each field
     is taken from the first table cell that follows its label.  The record
     is stored with ``insert_db()``.

     :param comm_url: absolute URL of the project page.
     :param co_id: identifier stored unchanged in ``comm.co_id``.
     :raises AttributeError: when an expected label is not found, because
         ``re.search`` then returns ``None``.
     """
     comm = Comm(co_index)
     response = requests.get(comm_url, headers=self.headers)
     html = response.text
     flags = re.S | re.M

     def cell_after(label):
         # Text of the first ``<td>`` that follows ``label``.
         return re.search(label + '.*?<td.*?>(.*?)<', html, flags).group(1)

     comm.co_name = re.search('项目名称:.*?class="left">(.*?)</td>', html,
                              flags).group(1)
     comm.co_develops = cell_after('主开发商:')
     comm.co_address = cell_after('项目建设地址:')
     # The parentheses around the unit are literal label text.  Left
     # unescaped they formed capture group 1, so ``group(1)`` returned the
     # unit "㎡" instead of the value.
     # NOTE(review): both ASCII and full-width parentheses are accepted
     # because the page's exact rendering is not visible here - confirm.
     comm.co_build_size = cell_after('项目总规划面积[(（]㎡[)）]:')
     comm.co_build_start_time = cell_after('计划开工日期:')
     comm.co_build_end_time = cell_after('计划竣工日期:')
     comm.area = cell_after('所属片区:')
     comm.co_size = cell_after('占地面积[(（]㎡[)）]:')
     comm.co_id = co_id
     comm.insert_db()
Exemplo n.º 25
0
 def comm(self, tag):
     """Store the community described by one table row.

     Reads name, area, developer, link and opening time from the cells of
     ``tag`` via XPath, takes the community id from the link's ``mmcid``
     query value, stores the record with ``insert_db()`` and then increments
     and prints the module-level ``count``.

     :param tag: element supporting ``xpath``; each lookup must yield at
         least one result or ``IndexError`` is raised.
     :return: tuple ``(absolute URL built from the row's link, community id)``.
     """
     global count

     def first(path):
         return tag.xpath(path)[0]

     record = Comm(co_index)
     record.co_name = first("./td[@width='143']/a/text()")
     record.area = first("./td[@width='184']/text()")
     record.co_develops = first("./td[@width='192']/text()")
     href = first("./td/a/@href")
     record.co_id = re.search('mmcid=(\d+)&', href).group(1)
     record.co_open_time = first("./td[@width='95']/text()")
     record.insert_db()
     count += 1
     print(count)
     return "http://www.syfc.com.cn" + href, record.co_id
Exemplo n.º 26
0
 def get_comm_info(self, comm_list):
     """Scrape each project page by code and dispatch its buildings.

     For every code in ``comm_list`` the ``ProjectInfo.aspx`` page is
     fetched and values are read from elements by their ``id`` attribute.
     The four ``...Ts`` counters are summed as integers into
     ``co_all_house`` and the four ``...Mj`` values are summed as floats
     into ``co_size``.  The record is stored with ``insert_db()`` and
     ``self.get_build_info`` is called with the community id.

     Any exception for one code is printed together with that code's URL
     and the loop continues with the next code.

     :param comm_list: iterable of project codes; each is converted with
         ``str()``.
     """
     flags = re.S | re.M

     def by_id(element_id, text):
         # Text directly inside the element carrying ``id="element_id"``.
         return re.search('id="{}">(.*?)<'.format(element_id), text,
                          flags).group(1)

     for i in comm_list:
         # Built before the ``try`` so the handler always reports the URL of
         # the code being processed, never a stale or unbound one.
         comm_url = 'http://www.lhfdc.gov.cn/templets/lh/aspx/hpms/ProjectInfo.aspx?code=' + str(
             i)
         try:
             comm = Comm(co_index)
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_name = by_id('PROJECT_XMMC1', html)
             comm.co_address = by_id('PROJECT_XMDZ', html)
             comm.co_develops = by_id('PROJECT_KFQY_NAME1', html)
             comm.area = by_id('PROJECT_SZQY', html)
             comm.co_build_size = by_id('PROJECT_GHZJZMJ', html)
             comm.co_volumetric = by_id('PROJECT_RJL', html)
             comm.co_build_start_time = by_id('PROJECT_JHKGRQ', html)
             comm.co_build_end_time = by_id('PROJECT_JHJGRQ', html)

             ys_zz_count = by_id('lbYsZZTs', html)
             ws_zz_count = by_id('lbWsZZTs', html)
             ws_sy_count = by_id('lbWsSYTs', html)
             ys_sy_count = by_id('lbYsSYTs', html)
             comm.co_all_house = (int(ws_zz_count) + int(ys_zz_count) +
                                  int(ws_sy_count) + int(ys_sy_count))

             ys_zz_size = by_id('lbYsZZMj', html)
             ws_zz_size = by_id('lbWsZZMj', html)
             ws_sy_size = by_id('lbWsSYMj', html)
             ys_sy_size = by_id('lbYsSYMj', html)
             # Added left to right in this fixed order so the float result
             # stays bit-for-bit stable.
             comm.co_size = (float(ys_zz_size) + float(ws_zz_size) +
                             float(ws_sy_size) + float(ys_sy_size))

             comm.co_id = str(i)
             comm.insert_db()
             self.get_build_info(comm.co_id)
         except Exception as e:
             print('小区 错误,co_index={},url={}'.format(co_index, comm_url), e)
Exemplo n.º 27
0
 def get_comm_info(self, comm_url_list):
     """Scrape each project page, store it and the buildings listed on it.

     Every entry of ``comm_url_list`` is appended to
     ``http://old.newhouse.cnnbfdc.com/`` and fetched.  The first match of
     each field pattern, stripped of surrounding whitespace, fills a
     ``Comm`` record that is stored with ``insert_db()``; the module-level
     ``count`` is then incremented and printed.  One ``Building`` record is
     stored per ``window.open`` link, and the full list of links is finally
     passed to ``self.get_house_info``.

     Exceptions are printed and swallowed, per building inside the building
     loop and per project around everything else.

     :param comm_url_list: iterable of URL fragments relative to the site
         root.
     """
     global count
     flags = re.S | re.M

     def first(pattern, text):
         return re.findall(pattern, text, flags)[0]

     for fragment in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://old.newhouse.cnnbfdc.com/' + fragment
             html = requests.get(comm_url, headers=self.headers).text

             comm.co_name = first('项目名称:.*?<span.*?>(.*?)<', html).strip()
             comm.co_address = first('项目地址:.*?<td.*?>(.*?)<',
                                     html).strip()
             comm.co_develops = first('开发公司:.*?<td.*?>(.*?)<',
                                      html).strip()
             comm.co_pre_sale = first('预\(现\)售证名称:.*?<td.*?>(.*?)<',
                                      html).strip()
             sellable_size = first('纳入网上可售面积:.*?<img.*?>(.*?)<', html)
             comm.co_build_size = sellable_size.replace('m&sup2;',
                                                        '').strip()
             sellable_count = first('纳入网上可售套数:.*?<img.*?>(.*?)<', html)
             comm.co_all_house = sellable_count.replace('套', '').strip()
             comm.area = first('所在区县:.*?<td.*?>(.*?)<', html).strip()
             comm.co_id = first('mobanshow.aspx\?projectid=(.*?)"',
                                html).strip()
             comm.insert_db()
             count += 1
             print(count)

             build_url_list = re.findall("window.open\('(.*?)'", html, flags)
             bu_name_list = re.findall("window.open.*?<font.*?>(.*?)<",
                                       html, flags)
             bu_all_house_list = re.findall("window.open.*?<td.*?>(.*?)<",
                                            html, flags)
             qrykey = re.findall("qrykey=(.*?)&", html, flags)
             # Only the position is used: the four lists are parallel.
             for index, _ in enumerate(build_url_list):
                 try:
                     build = Building(co_index)
                     build.bu_name = bu_name_list[index].strip()
                     build.bu_all_house = bu_all_house_list[index].strip()
                     build.co_id = comm.co_id
                     build.bu_id = qrykey[index].strip()
                     build.insert_db()
                 except Exception as e:
                     print(e)
             self.get_house_info(build_url_list)
         except Exception as e:
             print(e)
Exemplo n.º 28
0
    def get_comm_info(self, comm_url_list):
        """Scrape each project page, then every building linked from it.

        Every entry of ``comm_url_list`` is appended to
        ``http://old.newhouse.cnnbfdc.com/`` and fetched.  The community id
        comes from the ``id=`` value of the entry itself; the other fields
        are the first match of their pattern in the page.  After the
        ``Comm`` record is stored, one ``Building`` record is stored per
        ``window.open`` link and ``self.get_house_info`` is called with that
        link and the building id.

        A page that cannot be fetched is reported and skipped, as is a
        project without any building link.  A missing field still raises
        (``AttributeError`` or ``IndexError``).

        :param comm_url_list: iterable of URL fragments relative to the site
            root, each containing ``id=<digits>``.
        """
        flags = re.S | re.M

        def first(pattern, text):
            return re.findall(pattern, text, flags)[0]

        for i in comm_url_list:
            comm = Comm(co_index)
            comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
            try:
                response = requests.get(comm_url, headers=self.headers)
            except Exception as e:
                print("{}城市无法访问小区{}".format(city, comm_url), e)
                continue

            html = response.text
            con = etree.HTML(html)
            comm.co_id = re.search(r'id=(\d+)', i).group(1)
            comm.co_name = first('项目名称:.*?<span.*?>(.*?)<', html)
            comm.co_address = first('项目地址:.*?<td.*?>(.*?)<', html)
            comm.co_develops = first('开发公司:.*?<td.*?>(.*?)<', html)
            comm.co_pre_sale = first('售证名称:.*?<td.*?>(.*?)<', html)
            comm.co_build_size = first('纳入网上可售面积:.*?<img.*?>(.*?)<', html)
            comm.co_all_house = first('纳入网上可售套数:.*?<img.*?>(.*?)<', html)
            comm.area = first('所在区县:.*?<td.*?>(.*?)<', html)
            comm.insert_db()

            bu_all_house_list = re.findall(
                'window.open.*?center.*?center.*?>(.*?)<', html, flags)
            bu_url_list = re.findall(r"window\.open\('(.*?)'", html, flags)
            # ``re.findall`` returns an empty list instead of raising, so
            # the former ``try``/``except`` could never report this case.
            if not bu_url_list:
                print("{}城市{}小区无楼栋".format(city, comm.co_name))
                continue

            # Loop-invariant, so evaluated once instead of once per building.
            bu_num_list = con.xpath("//a[@href='#']/@title")
            # ``index`` rather than ``i`` so the outer loop variable is not
            # overwritten.
            for index, bu_url in enumerate(bu_url_list):
                build = Building(co_index)
                build.bu_all_house = bu_all_house_list[index]
                build.co_name = comm.co_name
                build.bu_num = bu_num_list[index]
                build.bu_id = re.search(r'key=(\d+)&', bu_url).group(1)
                build.co_id = comm.co_id
                build.insert_db()
                self.get_house_info(bu_url, build.bu_id)
Exemplo n.º 29
0
 def get_comm_info(self, co_url_list):
     """Scrape each pre-sale project page and dispatch its building rows.

     For every id in ``co_url_list`` the ``ysxm`` page is requested through
     the session ``self.s`` after a one-second pause.  The first match of
     each field pattern fills a ``Comm`` record, which is stored with
     ``insert_db()``.  Four parallel lists (address, number, floor, link)
     taken from the ``onmouseout`` rows are passed to
     ``self.get_build_info`` with the community id, after which the
     module-level ``count`` is incremented and printed.

     Any exception for one id is printed with the page URL and the loop
     continues.

     :param co_url_list: iterable of ``ysxmid`` strings.
     """
     global count
     flags = re.S | re.M
     # (attribute, pattern) pairs, applied in this order.
     field_patterns = (
         ('co_develops', 'kfsmc.*?<a.*?>(.*?)<'),
         ('co_name', 'PresellName.*?<a.*?>(.*?)<'),
         ('co_address', 'ItemRepose.*?>(.*?)<'),
         ('co_build_size', 'PresellArea.*?>(.*?)<'),
         ('co_all_house', 'djrqtd.*?>(.*?)<'),
         ('co_land_use', 'landinfo.*?>(.*?)<'),
         ('co_type', 'zczjtd.*?>(.*?)<'),
         ('area', 'FQ.*?>(.*?)<'),
         ('co_pre_sale_date', 'FZDatebegin.*?>(.*?)<'),
         ('co_pre_sale', 'bookid.*?<a.*?>(.*?)<'),
     )
     for ysxmid in co_url_list:
         comm = Comm(co_index)
         comm_url = 'http://183.63.60.194:8808/public/web/ysxm?ysxmid=' + ysxmid
         try:
             time.sleep(1)
             html = self.s.get(comm_url, headers=self.headers).text
             comm.co_id = re.search('ysxmid=(.*?)$', comm_url).group(1)
             for attr, pattern in field_patterns:
                 setattr(comm, attr, re.findall(pattern, html, flags)[0])
             comm.insert_db()

             bu_address_list = re.findall(
                 'onmouseout.*?center.*?center">(.*?)<', html, flags)
             bu_num_list = re.findall(
                 'onmouseout.*?center.*?center.*?center">(.*?)<', html,
                 flags)
             bu_floor_list = re.findall(
                 'onmouseout.*?center.*?center.*?center.*?center">(.*?)<',
                 html, flags)
             bu_url_list = re.findall('onmouseout.*?href="(.*?)"', html,
                                      flags)
             self.get_build_info(bu_address_list, bu_num_list,
                                 bu_floor_list, bu_url_list, comm.co_id)
             count += 1
             print(count)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Exemplo n.º 30
0
    def comm_info(self, co_id):
        """Store one community and walk the paged list of its buildings.

        The ``LPDetail.aspx`` page for ``co_id`` is fetched and parsed by
        regular expression into a ``Comm`` record, which is stored with
        ``insert_db()``.  The ``YSXM.ashx`` endpoint is then POSTed to page
        by page.  Each JSON response carries a ``data`` list whose first
        element holds ``TotalNum`` and whose remaining elements are building
        entries; ``self.build_info`` is called with each entry's ``YSZID``.

        :param co_id: community ``RowGuid``; also sent as ``YSXMID``.
        :raises AttributeError: when an expected field is missing from the
            detail page, because ``re.search`` then returns ``None``.
        """
        comm_url = "http://www.lsjs.gov.cn/WebLSZFGB/LPDetail.aspx?RowGuid=" + co_id
        co_res = requests.get(comm_url, headers=self.headers)
        con = co_res.text
        flags = re.S | re.M

        co = Comm(co_index)
        co.co_name = re.search('楼 盘 名 称:(.*?)<br', con).group(1)
        co.co_id = co_id
        co.area = re.search('所 属 城 区:.*?">(.*?)</span', con).group(1)
        co.co_address = re.search('楼 盘 坐 落:.*?">(.*?)</span', con).group(1)
        co.co_develops = re.search('项 目 公 司:.*?mc">(.*?)</span', con,
                                   flags).group(1)
        co.co_pre_sale = re.search('预销售证号.*?">(.*?)</span', con,
                                   flags).group(1)
        co.co_all_house = re.search('预售总套数.*?td>(.*?)</td', con,
                                    flags).group(1)
        co.co_all_size = re.search('预售总面积.*?td>(.*?)</td', con,
                                   flags).group(1)
        co.co_pre_sale_date = re.search('时间.*?">(.*?)</span', con,
                                        flags).group(1)
        co.insert_db()

        url = 'http://www.lsjs.gov.cn/WebLSZFGB/Ashx/YSXM.ashx'
        # One name for the page size, used both in the request and in the
        # stop condition, so the two cannot drift apart.
        page_size = 5
        page = 1
        while True:
            data = {
                "method": "getzxl",
                "PageSize": page_size,
                "CurrentPageIndex": str(page),
                "YSXMID": co_id,
                # 'Searchkey':''
            }
            res = requests.post(url, data=data, headers=self.headers)
            rows = json.loads(res.text)["data"]
            if not rows:
                # Nothing came back for this page; indexing rows[0] would
                # raise IndexError.
                break
            total = int(rows[0]['TotalNum'])
            for info in rows[1:]:
                self.build_info(co_id, info["YSZID"])
            # ``<=`` rather than ``<``: when the total is an exact multiple
            # of the page size this page was already the last one.
            if total <= page * page_size:
                break
            page += 1