Пример #1
0
 def get_comm_info(self, comm_url_list):
     """Run the regex analyzer over the tmsf.com info page of each entry.

     Args:
         comm_url_list: iterable of comma-separated strings; the first two
             fields of each entry are spliced into the property URL.

     For every entry a ``Comm`` carrying regex rules is handed to
     ``ProducerListUrl``; whatever ``get_details()`` returns is forwarded to
     ``self.get_build_info``. Any failure is printed and the loop continues.
     """
     global count
     for i in comm_url_list:
         # Bound before the try block so the error report below can always
         # format it, even when the entry is malformed and the URL cannot
         # be built.
         comm_url = None
         try:
             code = i.split(',')
             comm_url = ('http://www.tmsf.com/newhouse/property_' + code[0] +
                         '_' + code[1] + '_info.htm')
             comm = Comm(co_index)
             # These attributes hold regex rules (analyzer_type='regex'
             # below), not scraped values.
             comm.co_name = 'buidname.*?>(.*?)<'
             comm.co_address = '--位置行--.*?<span.*?title="(.*?)"'
             comm.co_build_type = '建筑形式:<.*?>(.*?)<'
             comm.co_develops = '项目公司:<.*?>(.*?)<'
             comm.co_volumetric = '容 积 率:</span>(.*?)<'
             comm.co_green = '绿 化 率:</span>(.*?)<'
             comm.co_size = '占地面积:</span>(.*?)<'
             comm.co_build_size = '总建筑面积:</span>(.*?)<'
             comm.co_all_house = '总户数:</span>(.*?)<'
             comm.co_id = 'info" href="/newhouse/property_(.*?)_info'
             p = ProducerListUrl(page_url=comm_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=comm.to_dict(),
                                 current_url_rule='一房一价<.*?href="(.*?)"',
                                 analyzer_type='regex',
                                 headers=self.headers)
             build_all_url = p.get_details()
             count += 1
             print('comm:', count)
             self.get_build_info(build_all_url)
         except Exception as e:
             print('小区页面,co_index={},url={}'.format(co_index, comm_url), e)
Пример #2
0
 def get_comm_info(self, comm_url_list):
     """POST each project id to the qdfd.com.cn project page and record it.

     Args:
         comm_url_list: iterable of project ids; each is sent as the
             ``projectID`` form field and also used as ``co_id``.

     The fields of a ``Comm`` are filled from the response HTML by regex,
     ``insert_db()`` is called, and the building tokens found in the page
     are passed to ``self.get_build_info`` together with the project id.
     Any failure is printed and the loop continues.
     """
     comm_url = 'https://www.qdfd.com.cn/qdweb/realweb/fh/FhProjectInfo.jsp'
     for i in comm_url_list:
         # Built before the try block so the error report can always show it.
         data = {'projectID': i}
         try:
             comm = Comm(co_index)
             response = requests.post(url=comm_url,
                                      data=data,
                                      headers=self.headers)
             html = response.text

             def first(pattern):
                 # First match of the pattern in the page, whitespace-trimmed;
                 # raises IndexError when the page lacks the field.
                 return re.findall(pattern, html, re.S | re.M)[0].strip()

             comm.co_id = i
             comm.co_name = first('bszn_title">(.*?)<')
             comm.area = first('所在区县:.*?<span>(.*?)<')
             comm.co_address = first('项目地址:.*?<span>(.*?)<')
             comm.co_develops = first('企业名称:.*?<a.*?>(.*?)<')
             comm.co_all_house = first(
                 '<td>总套数.*?<td class="xxxx_list3">(.*?)<')
             comm.co_build_size = first(
                 '<td>总面积.*?<td class="xxxx_list3">(.*?)<')
             comm.insert_db()
             # Raw string: "\(" is an invalid escape in a normal literal.
             build_logo_list = re.findall(
                 r'javascript:getBuilingList\("(.*?)"', html, re.S | re.M)
             self.get_build_info(build_logo_list, i)
         except Exception as e:
             print('青岛小区问题,url post data is:={}'.format(data), e)
Пример #3
0
 def start_crawler(self):
     """Walk listing pages 1..477 of ``self.start_url`` and record every row.

     Each page is requested with a ``GetYszData`` form POST. The JSON body
     decodes to a string that is itself JSON, so it is decoded a second time
     before the ``Rows`` list is read. One ``Comm`` is filled per row and its
     ``insert_db()`` is called. Errors are not caught here.
     """
     # Comm attribute -> key of the row dict it is filled from.
     column_map = (
         ('co_name', 'PRJNAME'),
         ('co_pre_sale', 'PRENUM'),
         ('area', 'CZAREA'),
         ('co_pre_sale_date', 'PresaleCertificateDate'),
         ('co_address', 'BSIT'),
         ('co_develops', 'NAME'),
         ('co_build_size', 'YSROOMBAREA'),
         ('co_all_house', 'YSROOMNUMS'),
     )
     for page_no in range(1, 478):
         form = {
             "method": "GetYszData",
             "page": str(page_no),
             "ysxkz": '',
             "kfs": '',
             "lpmc": '',
         }
         reply = requests.post(self.start_url,
                               headers=self.headers,
                               data=form)
         listing = json.loads(reply.json())
         for row in listing['Rows']:
             record = Comm(co_index)
             for attr, key in column_map:
                 setattr(record, attr, row[key])
             record.insert_db()
Пример #4
0
 def get_comm_info(self, comm_list):
     """Fetch each ytfcjy.com project page, record it and crawl its buildings.

     Args:
         comm_list: iterable of URL suffixes appended to
             ``http://www.ytfcjy.com/public/project/``.

     Every field is the first regex match in the page; a missing field raises
     inside the try block, so that project is skipped after printing the
     exception. On success the global ``count`` is incremented and the values
     of the ``buildInfo`` inputs are passed to ``self.get_build_info``.
     """
     global count
     for i in comm_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.ytfcjy.com/public/project/' + i
             response = requests.get(comm_url, headers=self.headers)
             html = response.text

             def first(pattern):
                 # First match of the pattern in the page (IndexError if none).
                 return re.findall(pattern, html, re.S | re.M)[0]

             comm.co_name = first('PROJECT_XMMC">(.*?)<')
             # Raw string: "\?" is an invalid escape in a normal literal.
             comm.co_id = first(r'ProjectInfo.aspx\?code=(.*?)&')
             comm.co_address = first('PROJECT_XMDZ">(.*?)<')
             comm.co_develops = first('PROJECT_KFQY_NAME">(.*?)<')
             comm.area = first('PROJECT_SZQY">(.*?)<')
             comm.co_volumetric = first('PROJECT_RJL">(.*?)<')
             comm.co_build_size = first('PROJECT_GHZJZMJ">(.*?)<')
             comm.co_pre_sale = first('YSXKZH">(.*?)<')
             comm.co_all_house = first('YSZTS">(.*?)<')
             comm.co_plan_pro = first('id="ghxkzInfo" value=".*?,,(.*?)"')
             comm.co_work_pro = first('id="sgxkzInfo" value=".*?,,(.*?)"')
             comm.co_land_use = first('id="tdzInfo" value=".*?,,(.*?)"')
             comm.insert_db()
             count += 1
             print(count)
             build_url_list = re.findall('id="buildInfo" value="(.*?)"', html,
                                         re.S | re.M)
             self.get_build_info(build_url_list, comm.co_id)
         except Exception as e:
             print(e)
Пример #5
0
    def comm_info(self, comm_url_list):
        """Record each project page and then crawl its presale listing.

        Args:
            comm_url_list: iterable of relative URLs containing
                ``projectID=<id>``; they are appended to the site root.

        For every URL the gbk-decoded page is parsed by regex into a ``Comm``
        and ``insert_db()`` is called. The project's ``Presell.asp`` page is
        then fetched and all of its link targets are passed to
        ``self.build_info`` together with the project id.
        """
        for comm_url in comm_url_list:
            try:
                co_url = 'http://222.77.178.63:7002/' + comm_url
                co_res = requests.get(co_url, headers=self.headers)
                con = co_res.content.decode('gbk')
                co = Comm(co_index)
                co.co_id = re.search('projectID=(.*)', comm_url).group(1)
                co.co_name = re.search('项目名称:.*?">(.*?)</', con,
                                       re.S | re.M).group(1)
                co.area = re.search('所在区县:.*?">(.*?)</', con,
                                    re.S | re.M).group(1)
                co.co_address = re.search('项目地址:.*?">(.*?)</', con,
                                          re.S | re.M).group(1)
                co.co_develops = re.search('企业名称:.*?blank">(.*?)</', con,
                                           re.S | re.M).group(1)
                # Raw string: "\d" is an invalid escape in a normal literal.
                co.co_all_house = re.search(r'>总套数.*?">(\d+)<', con,
                                            re.S | re.M).group(1)
                co.co_all_size = re.search('>总面积.*?">(.*?)<', con,
                                           re.S | re.M).group(1)
                project_name = parse.quote(co.co_name)
                co.insert_db()
            except Exception as e:
                print('小区信息错误{}'.format(e))
                # Without this the code below would run with an unbound (first
                # iteration) or stale (previous project's) id and name.
                continue

            sale_url = "http://222.77.178.63:7002/Presell.asp?projectID=" + co.co_id + "&projectname=" + project_name
            res = requests.get(sale_url, headers=self.headers)
            html = etree.HTML(res.content.decode('gbk'))
            temp_url_list = html.xpath("//a/@href")
            self.build_info(co.co_id, temp_url_list)
Пример #6
0
 def get_comm_info(self, comm_url_list):
     """Run the regex analyzer over each gzbjfc.com page, then its buildings.

     Args:
         comm_url_list: iterable of relative URLs appended to the site root.

     A ``Comm`` loaded with regex rules is handed to ``ProducerListUrl``.
     The building page URL is derived by replacing ``Info`` with ``Building``
     in the page URL and is passed to ``self.get_build_info``. Any failure is
     printed and the loop continues.
     """
     # Comm attribute -> regex rule assigned to it (rules, not values).
     rule_table = (
         ('co_name', 'cph_hif1_xmmc.*?<.*?>(.*?)<'),
         ('co_pre_sale', 'cph_hif1_xsxkz.*?<.*?>(.*?)<'),
         ('co_address', 'cph_hif1_zl.*?<.*?>(.*?)<'),
         ('co_develops', 'cph_hif1_kfs.*?<.*?>(.*?)<'),
         ('co_handed_time', 'cph_hif1_jfsj.*?<.*?>(.*?)<'),
         ('co_build_size', 'cph_hif1_jzmj.*?>(.*?)<'),
         ('co_all_house', 'cph_hif1_fwts.*?>(.*?)<'),
         ('co_id', 'hdl1_hfYszh" value="(.*?)"'),
     )
     for rel_path in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.gzbjfc.com/' + rel_path
             for attr, rule in rule_table:
                 setattr(comm, attr, rule)
             producer = ProducerListUrl(page_url=comm_url,
                                        request_type='get',
                                        encode='utf-8',
                                        analyzer_rules_dict=comm.to_dict(),
                                        analyzer_type='regex',
                                        headers=self.headers)
             producer.get_details()
             # Building info
             self.get_build_info(comm_url.replace('Info', 'Building'))
         except Exception as err:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), err)
Пример #7
0
 def start_crawler(self):
     """Page through the Getzslp listing (pages 1..9999) and record each row.

     Each page is requested with a form POST. The response is JSON whose
     ``Body`` member is itself a JSON string containing ``bodylist``. One
     ``Comm`` is filled per list entry and its ``insert_db()`` is called.
     A failing page is reported with its form data and the loop continues.
     """
     list_url = "http://fdc.xmtfj.gov.cn:8001/home/Getzslp"
     for page in range(1, 10000):
         formdata = {
             "currentpage": page,
             "pagesize": 20,
         }
         try:
             res = requests.post(list_url,
                                 data=formdata,
                                 headers=self.headers)
             con = json.loads(res.text)
             body = con['Body']
             info_dict = json.loads(body)['bodylist']
             # A distinct name: the original reused the page counter here.
             for item in info_dict:
                 comm = Comm(co_index)
                 comm.co_name = item['XMMC']
                 comm.co_id = item['TRANSACTION_ID']
                 comm.co_address = item['XMDZ']
                 comm.co_pre_sale = item['YSXKZH']
                 comm.co_all_house = item['PZTS']
                 comm.co_build_size = item['PZMJ']
                 comm.co_area = item['XMDQ']
                 comm.co_pre_date = item['GETDATE']
                 comm.insert_db()
         except Exception as e:
             # Third placeholder added: the original had only two for three
             # arguments, so the form data was silently dropped.
             print(
                 '小区错误,co_index={},url={},data={}'.format(
                     co_index, list_url, formdata), e)
Пример #8
0
    def start_crawler(self):
        """Fetch the project list from ``self.start_url`` and crawl each project.

        The list page is requested with a form POST and decoded as gbk. Every
        project link found is fetched, parsed by regex into a ``Comm`` and
        ``insert_db()`` is called. The ``window.open('...')`` targets in the
        page are passed to ``self.bu_info`` with the project id. A project
        that fails to parse is reported and skipped.
        """
        # NOTE(review): the Submit value looks like a placeholder left by a
        # failed decode of the original literal - confirm what the site expects.
        data = {"Submit": "(unable to decode value)"}
        res = requests.post(self.start_url, data=data, headers=self.headers)
        html = etree.HTML(res.content.decode('gbk'))
        comm_url_list = html.xpath(
            "//tr//span[@style='width:270px; color:#006']//a/@href")
        for comm_url in comm_url_list:
            try:
                url = 'http://www.fxfdcw.com/' + comm_url
                com_res = requests.get(url, headers=self.headers)
                con = com_res.content.decode('gbk')
                co = Comm(co_index)
                # Raw strings below: "\d", "\(" and "\)" are invalid escapes
                # in normal string literals.
                co.co_id = re.search(r'xmid=(\d+)', comm_url).group(1)
                co.co_name = re.search('项目名称.*?">(.*?)</', con,
                                       re.S | re.M).group(1)
                co.co_develops = re.search('开发企业:(.*?) &nbsp', con,
                                           re.S | re.M).group(1)
                co.co_address = re.search('项目地址.*?">(.*?)</', con,
                                          re.S | re.M).group(1)
                co.co_build_size = re.search('建筑面积.*?">(.*?)</', con,
                                             re.S | re.M).group(1)
                co.co_all_house = re.search('总套数.*?">(.*?)</', con,
                                            re.S | re.M).group(1)
                co.insert_db()

                bu_list = re.findall(r"window.open\('(.*?)'\)", con,
                                     re.S | re.M)
            except Exception as e:
                print("小区信息错误{}".format(e))
                continue

            self.bu_info(bu_list, co.co_id)
0
 def get_comm_info(self, comm_url):
     """Parse one project page into a ``Comm`` and crawl its building.

     Args:
         comm_url: project URL; ``buildingdetail`` is replaced by
             ``buildinfo`` before the page is requested through
             ``self.request_proxy``.

     Every field is taken from the gbk-decoded page by regex and
     whitespace-trimmed. A missing field raises ``AttributeError`` because
     nothing is caught here. After ``insert_db()`` the ``buildingid`` taken
     from the URL and the parsed ``co_id`` go to ``self.get_build_info``.
     """
     record = Comm(co_index)
     comm_url = comm_url.replace('buildingdetail', 'buildinfo')
     page = self.request_proxy(comm_url,
                               headers=self.headers).content.decode('gbk')

     def field(rule):
         # First capture group of the rule in the page, trimmed.
         return re.search(rule, page, re.S | re.M).group(1).strip()

     record.co_name = field('class="sf_xq_xmmc">(.*?)<')
     record.area = field('id="Label_CityArea">(.*?)<')
     record.co_pre_sale_date = field('class="sf_xq_jfsj">(.*?)<')
     record.co_build_type = field('id="lbl_JZJG".*?>(.*?)<')
     record.co_address = field('id="Label_ProjectAdress">(.*?)<')
     record.co_pre_sale = field('id="Label_SallPreDocuments">(.*?)<')
     record.co_all_house = field('id="lbl_ZTS".*?>(.*?)<')
     record.co_build_size = field('id="lbl_JZMJ".*?>(.*?)<')
     record.co_all_size = field('id="lbl_ZDMJ".*?>(.*?)<')
     record.co_develops = field('id="Label_DevName">.*?>(.*?)<')
     record.co_id = field('action=.*?buildingid=(.*?)"')
     record.insert_db()
     # The id for the building crawl comes from the URL, untrimmed.
     url_match = re.search('buildingid=(.*?)$', comm_url, re.S | re.M)
     self.get_build_info(url_match.group(1), record.co_id)
Пример #10
0
    def comm_info(self, comm_url_list):  # community info
        """Record every project in the list, then crawl all collected buildings.

        Args:
            comm_url_list: iterable of relative URLs containing ``id=<digits>``.

        For each URL the detail page is parsed by regex into a fresh ``Comm``
        and ``insert_db()`` is called. The project's ``bolprojectdetail.aspx``
        page is then fetched and its ``//td/a`` link targets are collected.
        A project that fails at any step is reported and skipped. Finally
        the collected links are passed to ``self.build_info`` in one call.
        """
        build_url_list = []
        for comm_url in comm_url_list:
            try:
                # A new record per project, so values of a previous project
                # can never leak into this one.
                co = Comm(co_index)
                # Raw string: "\d" is an invalid escape in a normal literal.
                co.co_id = re.search(r'id=(\d+)', comm_url).group(1)
                detail_url = "http://ris.szpl.gov.cn/bol/" + comm_url.lstrip(".")
                url = "http://ris.szpl.gov.cn/bolprojectdetail.aspx?id=" + str(co.co_id)

                res = requests.get(detail_url, headers=self.headers)
                con = res.text

                co.co_pre_sale = re.search('许可证号.*?">(.*?)&', con).group(1)
                co.co_name = re.search('项目名称.*?">(.*?)&', con).group(1)
                co.co_address = re.search('所在位置.*?">(.*?)&', con).group(1)
                co.co_develops = re.search('发展商.*?">(.*?)&', con).group(1)
                # One match yields both the area and the unit count.
                co_type = re.search('住宅.*?面积.*?">(.*?)平方米.*?套数.*?">(.*?)&', con)
                co.co_build_size = co_type.group(1)
                co.co_all_house = co_type.group(2)
                co.insert_db()

                response = requests.get(url, headers=self.headers)
                content = etree.HTML(response.text)
                build_url = content.xpath("//td/a/@href")
                build_url_list.extend(build_url)
            except Exception as e:
                # Was a bare "except: continue", which also swallowed
                # KeyboardInterrupt and hid every failure.
                print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
                continue
        self.build_info(build_url_list)
Пример #11
0
 def get_comm_info(self, comm_url_list):
     """Fetch each fjlyfdc.com.cn project page, record it and crawl buildings.

     Args:
         comm_url_list: iterable of relative URLs ending in ``CaseId=<id>``.

     Fields are taken from the page by regex into a ``Comm`` and
     ``insert_db()`` is called. The ``/House/BuildingInfo`` links in the page
     are passed to ``self.get_build_info`` with the project id. Any failure
     is printed and the loop continues.
     """
     for i in comm_url_list:
         comm_url = 'http://www.fjlyfdc.com.cn/' + i
         try:
             comm = Comm(co_index)
             response = requests.get(comm_url, headers=self.headers)
             html = response.text

             def field(pattern):
                 # First capture group of the pattern in the page;
                 # AttributeError when the page lacks the field.
                 return re.search(pattern, html, re.S | re.M).group(1)

             comm.co_develops = field('公司名称:.*?<td.*?>(.*?)<')
             comm.co_name = field('项目名称:.*?<td.*?>(.*?)<')
             comm.co_pre_sale = field('预售许可证:.*?<td.*?>(.*?)<')
             comm.co_address = field('项目坐落:.*?<td.*?>(.*?)<')
             comm.co_type = field('规划用途:.*?<td.*?>(.*?)<')
             comm.co_build_size = field('建筑面积:.*?<td.*?>(.*?)<')
             comm.co_volumetric = field('容积率:.*?<td.*?>(.*?)<')
             comm.co_green = field('绿地率:.*?<td.*?>(.*?)<')
             comm.co_open_time = field('开工日期:.*?<td.*?>(.*?)<')
             comm.co_build_end_time = field('竣工日期:.*?<td.*?>(.*?)<')
             # Same label, first and second cell after it.
             comm.co_all_house = field('批准销售:.*?<td.*?>(.*?)<')
             comm.co_all_size = field('批准销售:.*?<td.*?<td.*?>(.*?)<')
             comm.co_id = re.search('CaseId=(.*?)$', comm_url).group(1)
             comm.insert_db()
             # Raw string: "\?" is an invalid escape in a normal literal.
             build_url_list = re.findall(
                 r'href="(/House/BuildingInfo\?buildingInfoID=.*?&amp;caseID=.*?)"',
                 html, re.S | re.M)
             self.get_build_info(build_url_list, comm.co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
Пример #12
0
 def get_comm_info(self, all_url_list):
     """Run the regex analyzer over the page(s) given by ``all_url_list``.

     Args:
         all_url_list: passed unchanged as ``page_url`` to
             ``ProducerListUrl``.

     A ``Comm`` is loaded with regex rules (not values), converted with
     ``to_dict()`` and handed to ``ProducerListUrl``. On success the global
     ``count`` is incremented and printed; any failure is printed instead.
     """
     global count
     try:
         c = Comm(co_index)
         c.co_name = "class='newtopleft font-k'>(.*?)</li>"
         # Raw string: "\." and "\?" are invalid escapes in a normal literal.
         c.co_id = r'form1" method="post" action="house_base\.aspx\?id=(.*?)"'
         c.co_address = "项目位置:</li><li class='DetaimidR font-f'>(.*?)</li></ul>"
         c.area = "地区/商圈:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_develops = "开发商:</li><li class='DetaimidR font-f'>(.*?)</li>"
         c.co_volumetric = "容积率:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_green = "绿化率:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_all_house = "总户数:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_open_time = "开盘时间:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_land_use = "国土使用证:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_plan_pro = "规划许可证:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_build_size = "建筑面积:</li><li class='DetaimidR font-f'>(.*?)<"
         data_list = c.to_dict()
         p = ProducerListUrl(page_url=all_url_list,
                             request_type='get',
                             encode='utf-8',
                             analyzer_rules_dict=data_list,
                             analyzer_type='regex',
                             headers=self.headers)
         p.get_details()
         count += 1
         print(count)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, all_url_list), e)
Пример #13
0
 def get_comm_info(self, comm_id_list):
     """Record each xxfdc.gov.cn project, its buildings and their houses.

     Args:
         comm_id_list: iterable of project id strings appended to the
             ``projectInformation.do?xmId=`` URL.

     The project page is parsed by regex into a ``Comm``. Each data row of
     the ``itemInfoDetail`` table (the first row is skipped) becomes a
     ``Building``; the link in that row is fetched and the house links found
     there go to ``self.get_house_info``. A failing building is reported and
     skipped; a failing project is reported and skipped.
     """
     for i in comm_id_list:
         # Bound before the try block so the error report can always format it.
         comm_url = None
         try:
             comm = Comm(co_index)
             comm_url = 'http://web.xxfdc.gov.cn/onlineQuery/projectInformation.do?xmId=' + i
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html,
                                      re.S | re.M).group(1)
             comm.co_address = re.search('项目地址:.*?<td.*?>(.*?)<', html,
                                         re.S | re.M).group(1)
             comm.co_develops = re.search('开发商:.*?<td.*?>(.*?)<', html,
                                          re.S | re.M).group(1)
             comm.co_all_house = re.search('已售总套数:.*?<td.*?>(.*?)<', html,
                                           re.S | re.M).group(1)
             comm.co_build_size = re.search('已售总面积:.*?<td.*?>(.*?)<', html,
                                            re.S | re.M).group(1)
             comm.area = re.search('行政区别:.*?<td.*?>(.*?)<', html,
                                   re.S | re.M).group(1)
             comm.co_volumetric = re.search('容积率:.*?<td.*?>(.*?)<', html,
                                            re.S | re.M).group(1)
             comm.co_id = i
             comm.insert_db()
             bu_html = re.search(
                 '<table class="table table-bordered itemInfoDetail.*?</table>',
                 html, re.S | re.M).group()
             build_info_list = re.findall('<tr>.*?</tr>', bu_html,
                                          re.S | re.M)[1:]
             # Distinct loop name: the original reused ``i`` here.
             for row in build_info_list:
                 # Bound up front so the handler below never hits a NameError.
                 house_url = None
                 try:
                     build = Building(co_index)
                     build.bu_num = re.search('<td>(.*?)<', row,
                                              re.S | re.M).group(1)
                     build.bu_all_house = re.search(
                         '<td>.*?<td>.*?<td>(.*?)<', row,
                         re.S | re.M).group(1)
                     build.bu_id = re.search('buildId=(.*?)&', row,
                                             re.S | re.M).group(1)
                     build.co_id = comm.co_id
                     build.insert_db()
                     # Search this row, not the whole table: the original
                     # matched against bu_html and so picked the first link
                     # of the table for every building.
                     house_url = re.search('<a href="(.*?)"', row,
                                           re.S | re.M).group(1)
                     house_page = requests.get(house_url,
                                               headers=self.headers).text
                     house_url_list = re.findall(
                         '<td width="110">.*?<a.*?href="(.*?)"', house_page,
                         re.S | re.M)
                     self.get_house_info(house_url_list, build.bu_id,
                                         comm.co_id)
                 except Exception as e:
                     print(
                         '楼栋错误,co_index={},url={}'.format(
                             co_index, house_url), e)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Пример #14
0
 def parse(self, co_list):
     """Turn table rows into ``Comm`` records, skipping the first two rows.

     Args:
         co_list: sequence of row elements supporting ``xpath``; the values
             are read from the ``font`` text of fixed ``td`` positions.

     A missing unit count (``td[11]``) is tolerated and stored as ``None``.
     Any other failure logs the row as unparsable and moves to the next one.
     """
     for project in co_list[2:]:
         try:
             co = Comm(co_index)
             co.co_pre_sale_date = project.xpath("./td[1]/font/text()")[0]
             co.co_develops = project.xpath("./td[2]/font/text()")[0]
             co.co_pre_sale = project.xpath("./td[3]/font/text()")[0]
             co.co_name = project.xpath("./td[4]/font/text()")[0]
             co.co_address = project.xpath("./td[5]/font/text()")[0]
             co.co_use = project.xpath("./td[8]/font/text()")[0]
             try:
                 co.co_all_house = project.xpath("./td[11]/font/text()")[0]
             except Exception:
                 # Was a bare "except:", which also traps KeyboardInterrupt
                 # and SystemExit.
                 log.info("无总套数")
                 co.co_all_house = None
             co.co_build_size = project.xpath("./td[10]/font/text()")[0]
             co.insert_db()
         except Exception:
             log.error("{}小区解析失败".format(project))
Пример #15
0
    def analyzer_comm_url(self, comm_url_list):
        """Record each project and its buildings; return all house-list URLs.

        Args:
            comm_url_list: iterable of absolute project page URLs.

        Returns:
            A list of absolute URLs built from every link found in the
            building rows of all successfully parsed projects.

        The gbk-decoded page is parsed by regex into a ``Comm``. Each
        ``onmouseover ... </TR>`` fragment becomes a ``Building``. A failing
        building or project is reported and skipped.
        """
        all_url = []
        for i in comm_url_list:
            try:
                res = requests.get(i)
                html = res.content.decode('gbk')

                def field(pattern):
                    # First capture group of the pattern in the page.
                    return re.search(pattern, html, re.S | re.M).group(1)

                c = Comm(self.co_index)
                c.co_name = field('项目名称:.*?">.*?<span.*?>(.*?)</span>')  # project name
                c.co_address = field('项目地址:.*?">.*?<span.*?>(.*?)</span>')  # project address
                c.co_develops = field('开发商:.*?">.*?<span.*?>(.*?)</span>')  # developer
                c.co_build_size = field(
                    '总建筑面积:.*?">.*?<span.*?>(.*?)</span>')  # building area
                c.co_land_type = field(
                    '用地依据:.*?">.*?<span.*?>(.*?)</span>')  # land-use certificate
                c.co_all_house = field(
                    '>总套数:.*?">.*?<span.*?>(.*?)</span>')  # total units
                c.area = field('所在区域:.*?">.*?<span.*?>(.*?)</span>')  # district
                c.co_work_pro = field(
                    '施工许可证:.*?">.*?<span.*?>(.*?)</span>')  # construction permit
                c.co_plan_pro = field(
                    '建设工程规划许可证:.*?">.*?<span.*?>(.*?)</span>')  # planning permit
                c.insert_db()

                buildlist = re.findall('onmouseover.*?</TR>', html,
                                       re.S | re.M)
                url_list = []
                for k in buildlist:
                    try:
                        b = Building(self.co_index)
                        build_list = re.findall('<TD.*?>(.*?)</TD>', k,
                                                re.S | re.M)
                        b.co_name = build_list[1]
                        b.bu_num = build_list[2]
                        b.bu_type = build_list[4]
                        b.insert_db()
                        house_url = re.findall('href="(.*?)"', k, re.S | re.M)
                        for j in house_url:
                            url_list.append(
                                'http://www.stfcj.gov.cn/stsite/ProjectList/' +
                                j)
                    except Exception as e:
                        # self.co_index, the same source the records use; the
                        # original printed a bare global ``co_index`` here.
                        print('楼栋错误,co_index={},url={}'.format(
                            self.co_index, i), e)
                # In-place extend instead of rebuilding the list every time.
                all_url.extend(url_list)
            except Exception as e:
                print('小区错误,co_index={},url={}'.format(self.co_index, i), e)
        return all_url
Пример #16
0
    def comm_parse(self, co_name, co_addr, co_area, co_url):
        """Record one project, then walk its presale and sale listings.

        Args:
            co_name: project name; also the prefix of the generated ``co_id``.
            co_addr: project address stored on the record.
            co_area: district stored on the record.
            co_url: project page URL containing ``kfsid=<digits>``.

        The page is read as gbk and parsed by regex into a ``Comm``. The
        presale table is then paged with ``&ypage=N`` and the sale table with
        ``&page=N``, each starting at page 1 and stopping at the first page
        without white rows. Matching links go to ``self.bu_parse`` and
        ``self.house_parse``. Errors are not caught here.
        """
        co_res = requests.get(co_url, headers=self.headers)
        co_res.encoding = 'gbk'
        con = co_res.text
        co = Comm(co_index)
        # The developer name is optional; evaluate the regex only once.
        develops = re.search('开发商名称.*?;">(.*?)</', con, re.S | re.M)
        co.co_develops = develops.group(1) if develops else None

        # Raw strings below: "\d" is an invalid escape in a normal literal.
        kfsid = re.search(r'kfsid=(\d+)', co_url).group(1)
        co.co_id = co_name + kfsid
        co.co_name = co_name
        co.co_address = co_addr
        co.area = co_area
        co.co_all_house = re.search(r'总套数.*?">(\d+)&nbsp', con, re.S | re.M).group(1)
        co.co_all_size = re.search('总面积.*?">(.*?)&nbsp', con, re.S | re.M).group(1)
        co.co_residential_size = re.search('住宅面积.*?">(.*?)&nbsp', con, re.S | re.M).group(1)
        co.insert_db()

        num = 1
        while True:
            pre_url = co_url + "&ypage=" + str(num)    # presale paging
            pre_res = requests.get(pre_url, headers=self.headers)
            pre_html = etree.HTML(pre_res.content.decode('gbk'))
            pre_list = pre_html.xpath("//table[@id='preselltable1']//tr[@bgcolor='white']")
            if not pre_list:
                break
            num += 1
            for pre in pre_list:
                bu_url = pre.xpath("./td[4]/a/@href")[0]
                if 'user_Presell' in bu_url:
                    self.bu_parse(bu_url, co.co_id, co_url)

        # Restart at page 1: the original carried the presale counter over,
        # so the first sale pages were never requested.
        num = 1
        while True:
            sell_url = co_url + "&page=" + str(num)    # sale paging
            sell_res = requests.get(sell_url, headers=self.headers)
            sell_html = etree.HTML(sell_res.content.decode('gbk'))
            sell_list = sell_html.xpath("//table[@id='selltable1']//tr[@bgcolor='white']")
            if not sell_list:
                break
            num += 1
            for sell in sell_list:
                ho_url = sell.xpath("./td/a/@href")[0]
                if 'user_sell' in ho_url:
                    bu_id = re.search('ID=(.*?)&', ho_url).group(1)
                    self.house_parse(ho_url, co.co_id, bu_id)
Пример #17
0
 def get_comm_info(self, comm_info_list):
     """Create one ``Comm`` per HTML row fragment from its first three cells.

     Args:
         comm_info_list: iterable of HTML strings, one table row each.

     The three values are filled in order and ``insert_db()`` is called.
     A row that fails to match is printed with the exception and skipped.
     """
     # Comm attribute -> regex whose first group supplies its value.
     cell_rules = (
         ('co_name', '<td>(.*?)</td>'),
         ('co_all_house', '<td.*?<td>(.*?)</td>'),
         ('co_all_size', '<td.*?<td.*?<td>(.*?)</td>'),
     )
     for row_html in comm_info_list:
         try:
             record = Comm(co_index)
             for attr, rule in cell_rules:
                 found = re.search(rule, row_html, re.S | re.M)
                 setattr(record, attr, found.group(1))
             record.insert_db()
         except Exception as err:
             print('小区错误,co_index={},html_str={}'.format(co_index, row_html), err)
Пример #18
0
 def get_comm_detail(self, detail_url, area):
     """Record one presale project and each building listed on its page.

     Args:
         detail_url: relative URL containing ``FD=<id>&``; appended to the
             HousePresell root to fetch the page.
         area: value stored unchanged on the record's ``area``.

     The page is parsed by regex into a ``Comm``. Every ``<tr>`` of the
     ``donglist`` table becomes a ``Building``, and its link goes to
     ``self.get_house_info``. The first failure anywhere is printed and ends
     the whole call, including the remaining buildings.
     """
     def grab(rule, text):
         # First capture group of the rule within the text.
         return re.search(rule, text, re.S | re.M).group(1)

     try:
         comm = Comm(co_index)
         comm_detail_url = 'http://www.yfci.gov.cn:8080/HousePresell/' + detail_url
         page = requests.get(comm_detail_url, headers=self.headers).text
         comm.co_develops = grab('id="kfsmc".*?<a.*?>(.*?)<', page)
         comm.co_name = grab('id="PresellName".*?<a.*?>(.*?)<', page)
         comm.co_address = grab('id="HouseRepose".*?>(.*?)<', page)
         comm.co_build_size = grab('id="PresellArea".*?>(.*?)<', page)
         comm.co_all_house = grab('id="djrqtd".*?>(.*?)<', page)
         comm.co_land_use = grab('id="landinfo".*?>(.*?)<', page)
         comm.co_type = grab('id="zczjtd".*?>(.*?)<', page)
         comm.co_pre_sale = grab('id="bookid".*?<a.*?>(.*?)<', page)
         comm.co_pre_sale_date = grab('id="FZDatebegin".*?>(.*?)<', page)
         comm.co_open_time = grab('id="kpdate".*?>(.*?)<', page)
         # The project id comes from the URL, not from the page.
         comm.co_id = grab('FD=(.*?)&', detail_url)
         comm.area = area
         comm.insert_db()

         table_html = re.search('id="donglist".*?</table>', page,
                                re.S | re.M).group()
         for row in re.findall('<tr.*?</tr>', table_html, re.S | re.M):
             build = Building(co_index)
             build.co_id = comm.co_id
             # 2nd, 3rd and 4th cell of the row respectively.
             build.bu_address = grab('<td.*?<td.*?>(.*?)<', row)
             build.bu_num = grab('<td.*?<td.*?<td.*?>(.*?)<', row)
             build.bu_floor = grab('<td.*?<td.*?<td.*?<td.*?>(.*?)<', row)
             house_url = grab('href="(.*?)"', row)
             build.bu_id = grab("LID=(.*?)$", house_url)
             build.insert_db()
             self.get_house_info(house_url, comm.co_id, build.bu_id)
     except Exception as err:
         print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url),
               err)
Пример #19
0
    def get_comm_info(self, comm_url_list):
        """Record each yyfdcw.com project and crawl the buildings it links to.

        Args:
            comm_url_list: iterable of URL paths appended to
                ``http://xx.yyfdcw.com``.

        Each step is guarded separately: an unreachable page, a page that
        fails to parse, and a failing building crawl are each reported, and
        the loop moves on to the next project.
        """
        for comm_url in comm_url_list:
            comm_detail = "http://xx.yyfdcw.com" + comm_url
            try:
                comm_res = requests.get(comm_detail, headers=self.headers)
            except Exception as e:
                print("co_index={},小区详情页无法访问".format(co_index), e)
                continue
            con = comm_res.text

            def field(pattern):
                # First capture group of the pattern in the page;
                # AttributeError when the page lacks the field.
                return re.search(pattern, con, re.S | re.M).group(1)

            # Parsing was unguarded before, so one page with a missing field
            # aborted the crawl of every remaining project.
            try:
                comm = Comm(co_index)
                # Raw string: "\d" is an invalid escape in a normal literal.
                comm.co_id = re.search(r'ID=(\d+)', con).group(1)
                comm.co_name = field('lpname">.*?<h2>(.*?)</h2')
                comm.co_develops = field('开发商:.*?Kfs">(.*?)</span')
                comm.co_green = field('绿化率:.*?Lhl">(.*?)</span')
                comm.area = field('区域:.*?Name">(.*?)</span')
                comm.co_address = field('位置:</b>(.*?)</li')
                comm.co_build_size = field('建筑面积:.*?l5">(.*?)</span')
                comm.co_all_house = field('总户数:.*?hs">(.*?)</span')
                comm.co_plan_useland = field('用地.*?l4">(.*?)</span')
                comm.co_plan_project = field('工程.*?l3">(.*?)</span')
                comm.co_build_type = field('楼盘类型.*?Type">(.*?)</span')
                comm.co_all_size = field('占地面积.*?mianji">(.*?)</span')
                comm.co_land_use = field('使用权证.*?l1">(.*?)</span')
                comm.insert_db()
            except Exception as e:
                print('小区错误,co_index={},url={}'.format(co_index, comm_detail), e)
                continue

            try:
                build_list = re.findall(
                    '<td align="center">.*?<a href="(.*?)"', con, re.S | re.M)
                if build_list:
                    self.get_build_info(build_list, comm.co_id)
                else:
                    print("co_index={},小区co_id={}没有楼栋".format(
                        co_index, comm.co_id))
            except Exception as e:
                # Was a bare "except:"; the exception is now shown as well,
                # since a failure in get_build_info also lands here.
                print("co_index={},小区co_id={}没有楼栋".format(
                    co_index, comm.co_id), e)
Пример #20
0
 def comm_parse(self, url_list):
     """Parse community pages taken from a queue and store each one.

     The loop has no exit condition: it keeps calling ``url_list.get()``,
     which must yield ``(url, area, type)`` tuples (the original author's
     note indicates ``url_list`` is a ``Queue``).  For each item the detail
     page at ``url`` and the matching ``ProjInfo.do`` page are downloaded,
     the fields are extracted with regular expressions, the record is saved
     with ``insert_db()``, the module-level ``count`` is incremented and
     printed, and ``self.build_parse`` is called.

     A page that cannot be fetched or parsed is reported and skipped so the
     consumer loop keeps running.

     :param url_list: queue-like object whose ``get()`` returns
         ``(url, area, type)``.
     """
     global count

     def field(pattern, text):
         # AttributeError is raised when the pattern does not match.
         return re.search(pattern, text, re.S | re.M).group(1)

     # One record object is reused; every field is reassigned before each
     # insert_db() call below.
     co = Comm(co_index)
     while True:
         url, area, co_type = url_list.get()
         try:
             res = requests.get(url, headers=self.headers)
         except Exception as e:
             print("co_index={},小区详情页无法访问".format(co_index), e)
             continue
         try:
             con = res.text
             co.area = area
             co.co_type = co_type
             co.co_id = field(r'id=(\d+)', url)
             co.co_develops = field('企业名称.*?>&nbsp;(.*?)<', con)
             co.co_name = field('项目名称.*?>&nbsp;(.*?)<', con)
             co.co_address = field('项目座落.*?>&nbsp;(.*?)<', con)
             co.co_use = field('房屋用途.*?>&nbsp;(.*?)<', con)
             try:
                 co.co_pre_sale = field('许可证号.*?>&nbsp;(.*?)<', con)
             except AttributeError:
                 # The permit number is optional on the page.
                 co.co_pre_sale = None
             new_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/p/ProjInfo.do?propid=" + co.co_id
             a_res = requests.get(new_url, headers=self.headers)
             a_con = a_res.text
             co.co_build_size = field('建筑面积.*?>&nbsp;(.*?)<', a_con)
             co.co_all_house = field('销售套数.*?>&nbsp;(.*?)<', a_con)
             co.co_green = field('绿化率.*?>&nbsp;(.*?)<', a_con)
             co.co_build_start_time = field('开工日期.*?>&nbsp;(.*?)<', a_con)
             co.co_build_end_time = field('竣工日期.*?>&nbsp;(.*?)<', a_con)
             co.co_volumetric = field('容积率.*?>&nbsp;(.*?)<', a_con)
             co.insert_db()
         except Exception as e:
             # Keep the consumer alive when a single page is malformed.
             print('小区错误,co_index={},url={}'.format(co_index, url), e)
             continue
         count += 1
         print(count)
         try:
             self.build_parse(co.co_id)
         except Exception as e:
             print("co_index={},楼栋信息错误".format(co_index), e)
Пример #21
0
    def comm_info(self, url_list):
        """Store one community per URL and trigger its building crawl.

        The registration id is taken from the ``Jh=`` part of each URL,
        percent-encoded as GBK and used to request ``3.asp`` through
        ``Proxy_contact``.  The response bytes are decoded as GB18030, the
        fields are extracted with regular expressions into a ``Comm`` record
        and saved with ``insert_db()``.  Afterwards ``self.build_info`` is
        called with the ``4.asp`` URL for the same id.

        URLs that fail at any step are logged and skipped.

        :param url_list: iterable of URLs containing ``Jh=<id>``.
        """
        site = 'http://scxx.fgj.wuhan.gov.cn/'

        def field(pattern, text):
            return re.search(pattern, text, re.S | re.M).group(1)

        for temp_url in url_list:
            try:
                comm = Comm(co_index)
                comm.co_id = re.search(r'Jh=(.*?\d+)', temp_url).group(1)
                parse_url = parse.quote(comm.co_id, encoding='gbk')
                comm_url = site + '3.asp?DengJh=' + parse_url
                proxy = Proxy_contact(app_name='wuhan',
                                      method='get',
                                      url=comm_url,
                                      headers=self.headers)
                res = proxy.contact()
                con = res.decode('gb18030')
                comm.co_name = field('项目名称.*?">(.*?)<', con)
                comm.co_all_house = field('套数.*?">(.*?)&nbsp', con)
                comm.co_address = field('坐落.*?">(.*?)</', con)
                comm.co_build_start_time = field('开工时间.*?">(.*?)</', con)
                comm.co_build_end_time = field('竣工时间.*?">(.*?)</', con)
                comm.co_size = field('用地面积.*?">(.*?)&nbsp', con)
                comm.co_build_size = field('建筑面积.*?">(.*?)&nbsp', con)
                comm.co_volumetric = field('容积率.*?">(.*?)</', con)
                comm.co_develops = field('开发企业</TD>.*?">(.*?)</TD', con)
                comm.co_land_use = field('土地使用证号.*?">(.*?)</', con)
                comm.co_plan_useland = field('用地规划许可证号.*?">(.*?)</', con)
                comm.co_plan_project = field('工程规划许可证号.*?">(.*?)</', con)
                comm.co_work_pro = field('施工许可证号.*?">(.*?)</', con)

                comm.insert_db()
                log.debug('{}插入成功'.format(comm.co_name))
            except Exception as e:
                log.error('小区错误{}'.format(e))
                continue
            # Build the 4.asp URL directly.  Substituting every "3" in the
            # 3.asp URL would also rewrite digits inside the encoded id.
            build_detail = site + '4.asp?DengJh=' + parse_url
            self.build_info(build_detail, comm.co_id)
Пример #22
0
 def get_comm_info(self, comm_list):
     """Fetch, parse and store one community per project code.

     For each code the ``ProjectInfo.aspx`` page is downloaded and its
     fields are read from the text that follows ``id="<element id>">``.
     ``co_all_house`` is the integer sum of four counters and ``co_size``
     the float sum of four area values (presumably sold/unsold residential
     and commercial figures - TODO confirm against the page).  The record
     is saved with ``insert_db()`` and ``self.get_build_info`` is called
     with the community id.

     Any failure for one code is printed together with its URL and the
     loop continues with the next code.

     :param comm_list: iterable of project codes (converted with ``str``).
     """

     def by_id(element_id, text):
         # AttributeError is raised when the element is not on the page.
         return re.search('id="' + element_id + '">(.*?)<', text,
                          re.S | re.M).group(1)

     for i in comm_list:
         # Built before the ``try`` so the error report below always names
         # the URL of the community that actually failed.
         comm_url = 'http://www.lhfdc.gov.cn/templets/lh/aspx/hpms/ProjectInfo.aspx?code=' + str(
             i)
         try:
             comm = Comm(co_index)
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_name = by_id('PROJECT_XMMC1', html)
             comm.co_address = by_id('PROJECT_XMDZ', html)
             comm.co_develops = by_id('PROJECT_KFQY_NAME1', html)
             comm.area = by_id('PROJECT_SZQY', html)
             comm.co_build_size = by_id('PROJECT_GHZJZMJ', html)
             comm.co_volumetric = by_id('PROJECT_RJL', html)
             comm.co_build_start_time = by_id('PROJECT_JHKGRQ', html)
             comm.co_build_end_time = by_id('PROJECT_JHJGRQ', html)
             house_all = by_id('lbYsZZTs', html)
             house_all_a = by_id('lbWsZZTs', html)
             bus_all = by_id('lbWsSYTs', html)
             bus_all_a = by_id('lbYsSYTs', html)
             comm.co_all_house = int(house_all_a) + int(house_all) + int(
                 bus_all) + int(bus_all_a)
             area_size_a = by_id('lbYsZZMj', html)
             area_size_b = by_id('lbWsZZMj', html)
             area_size_c = by_id('lbWsSYMj', html)
             area_size_d = by_id('lbYsSYMj', html)
             comm.co_size = float(area_size_a) + float(area_size_b) + float(
                 area_size_c) + float(area_size_d)
             comm.co_id = str(i)
             comm.insert_db()
             self.get_build_info(comm.co_id)
         except Exception as e:
             print('小区 错误,co_index={},url={}'.format(co_index, comm_url), e)
Пример #23
0
 def get_comm_info(self, co_url_list):
     """Store one community per pre-sale id and pass on its building rows.

     Each id is appended to the ``ysxm?ysxmid=`` URL, which is fetched
     through the session ``self.s`` after a one-second pause.  The first
     regex match of every rule below fills the ``Comm`` record, which is
     saved with ``insert_db()``.  Four parallel lists (address, number,
     floor, link) are then extracted from the same page and handed to
     ``self.get_build_info`` with the community id.  On success the
     module-level ``count`` is incremented and printed; any exception is
     printed with the URL and the loop continues.

     :param co_url_list: iterable of id strings.
     """
     global count
     flags = re.S | re.M
     # (attribute, pattern) pairs, applied in this order.
     first_match_rules = (
         ('co_develops', 'kfsmc.*?<a.*?>(.*?)<'),
         ('co_name', 'PresellName.*?<a.*?>(.*?)<'),
         ('co_address', 'ItemRepose.*?>(.*?)<'),
         ('co_build_size', 'PresellArea.*?>(.*?)<'),
         ('co_all_house', 'djrqtd.*?>(.*?)<'),
         ('co_land_use', 'landinfo.*?>(.*?)<'),
         ('co_type', 'zczjtd.*?>(.*?)<'),
         ('area', 'FQ.*?>(.*?)<'),
         ('co_pre_sale_date', 'FZDatebegin.*?>(.*?)<'),
         ('co_pre_sale', 'bookid.*?<a.*?>(.*?)<'),
     )

     for ysxmid in co_url_list:
         comm = Comm(co_index)
         comm_url = 'http://183.63.60.194:8808/public/web/ysxm?ysxmid=' + ysxmid
         try:
             time.sleep(1)
             page = self.s.get(comm_url, headers=self.headers)
             html = page.text
             comm.co_id = re.search('ysxmid=(.*?)$', comm_url).group(1)
             for attr, pattern in first_match_rules:
                 # [0] raises IndexError when the page lacks the field.
                 setattr(comm, attr, re.findall(pattern, html, flags)[0])
             comm.insert_db()

             addresses = re.findall(
                 'onmouseout.*?center.*?center">(.*?)<', html, flags)
             numbers = re.findall(
                 'onmouseout.*?center.*?center.*?center">(.*?)<', html,
                 flags)
             floors = re.findall(
                 'onmouseout.*?center.*?center.*?center.*?center">(.*?)<',
                 html, flags)
             links = re.findall('onmouseout.*?href="(.*?)"', html, flags)
             self.get_build_info(addresses, numbers, floors, links,
                                 comm.co_id)
             count += 1
             print(count)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Пример #24
0
 def start_crawler(self):
     """Crawl the community list, then each community's building pages.

     The page at ``url`` (a name defined outside this method) is fetched
     and every ``a.a_name`` link other than ``#`` is treated as a community
     page.  Extraction is delegated to ``ProducerListUrl``: the regex rules
     are assigned to the attributes of a ``Comm`` / ``Building`` object and
     passed via ``to_dict()``.  The links returned by ``get_details()`` for
     a community are then visited as building pages.
     """
     site = 'http://www.lzfc.com.cn:8080'
     response = requests.get(url)
     html = response.text
     tree = etree.HTML(html)
     all_url = tree.xpath('//a[@class="a_name"]/@href')
     for comm_href in all_url:
         comm = Comm(co_index)
         if comm_href == '#':
             continue
         comm_url = site + comm_href
         comm.co_name = "cc0.innerHTML='(.*?)'"
         comm.co_address = "cc1.innerHTML='(.*?)'"
         comm.area = "cc2.innerHTML='(.*?)'"
         comm.co_use = "cc4.innerHTML='(.*?)'"
         comm.co_develops = "cc5.innerHTML='(.*?)'"
         comm.co_open_time = "cc6.innerHTML='(.*?)'"
         comm.co_all_house = "cc9.innerHTML='(.*?)'"
         comm.co_build_size = "cc11.innerHTML='(.*?)'"
         comm.co_id = "BaseCode=(.*?)'"
         p = ProducerListUrl(
             page_url=comm_url,
             request_type='get',
             encode='gbk',
             analyzer_rules_dict=comm.to_dict(),
             current_url_rule="queryBuildHerf1.href='(.*?)'",
             analyzer_type='regex')
         build_url_list = p.get_details()
         for build_href in build_url_list:
             build = Building(co_index)
             build_detail_url = site + build_href
             build.bu_num = 'onclick=comInfoView.*?center">(.*?)<'
             build.co_use = 'onclick=comInfoView.*?center.*?center">(.*?)<'
             build.bu_pre_sale = 'onclick=comInfoView.*?center.*?center.*?center"><.*?>(.*?)<'
             build.bu_all_house = 'onclick=comInfoView.*?center.*?center.*?center.*?center">(.*?)<'
             build.co_name = 'fontbg_red">(.*?)<'
             build.bu_id = r"onclick=comInfoView\('(.*?)'\)"
             # Analyse the building page with the building rules.  The
             # previous code passed comm_url / comm.to_dict() here, which
             # re-processed the community page and left the rules above
             # unused.
             # NOTE(review): current_url_rule is kept from the community
             # request; confirm which link rule applies on building pages.
             building_page = ProducerListUrl(
                 page_url=build_detail_url,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=build.to_dict(),
                 current_url_rule="queryBuildHerf1.href='(.*?)'",
                 analyzer_type='regex')
             # The returned links are not used further in this method.
             building_page.get_details()
Пример #25
0
 def start_crawler(self):
     """Walk every result page and store each listed community.

     The page count is obtained from ``AllListUrl.get_page_count()``.  For
     every page the ``//b/a/@href`` links are collected; each link is
     rewritten to the ``xmxx/xmjbxx`` detail page, which is decoded as GBK
     and parsed with regular expressions into a ``Comm`` record saved with
     ``insert_db()``.  ``self.build_parse`` is then called with the
     community id.

     A failure for one community is logged.  ``build_parse`` is only called
     when the id of the *current* link was extracted, so a failed request
     can no longer re-crawl the previous community or hit an undefined
     variable.
     """
     start_url = self.start_url + "searchSpf.jsp?nowPage=1"
     b = AllListUrl(
         first_page_url=start_url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule=r'/(\d+)页',
     )
     page = b.get_page_count()

     def field(pattern, text):
         return re.search(pattern, text, re.S | re.M).group(1)

     for i in range(1, int(page) + 1):
         url = self.start_url + "searchSpf.jsp?nowPage=" + str(i)
         res = requests.get(url, headers=self.headers)
         html = etree.HTML(res.content.decode())
         url_list = html.xpath("//b/a/@href")
         for comm_temp in url_list:
             co_id = None  # set only once this link's id has been parsed
             try:
                 comm_url = self.start_url + comm_temp.replace(
                     "./xmxxmainNew", 'xmxx/xmjbxx')
                 com_res = requests.get(comm_url, headers=self.headers)
                 con = com_res.content.decode('gbk')
                 co = Comm(co_index)
                 co_id = re.search('Id_xmxq=(.*)', comm_temp).group(1)
                 co.co_id = co_id
                 co.co_name = re.search('3a3a3a">(.*?)</b', con).group(1)
                 co.co_address = field('项目地址.*?">(.*?)</td', con)
                 co.co_develops = field('开 发 商.*?">(.*?)</td', con)
                 co.co_all_house = field('总 套 数.*?<td>(.*?)</td', con)
                 co.co_green = field('绿 化 率.*?<td>(.*?)</td', con)
                 co.co_volumetric = field('容 积 率.*?<td>(.*?)</td', con)
                 try:
                     co.co_build_size = field('建设规模.*?" >(.*?)平', con)
                 except AttributeError:
                     # The construction-scale field is optional.
                     co.co_build_size = None
                 co.insert_db()
             except Exception as e:
                 log.error('{}小区错误{}'.format(comm_temp, e))
             if co_id is not None:
                 self.build_parse(co_id)
Пример #26
0
    def comm_info(self, co_id):
        """Store one community and visit every building listed for it.

        The ``LPDetail.aspx`` page for ``co_id`` is parsed with regular
        expressions into a ``Comm`` record saved with ``insert_db()``.
        Buildings are then paged through ``YSXM.ashx`` five at a time: in
        each JSON response the first element of ``data`` supplies
        ``TotalNum`` and the remaining elements supply one ``YSZID`` each,
        which is passed to ``self.build_info``.

        :param co_id: value of the ``RowGuid`` query parameter; it is also
            sent as ``YSXMID`` and stored as the community id.
        """
        comm_url = "http://www.lsjs.gov.cn/WebLSZFGB/LPDetail.aspx?RowGuid=" + co_id
        co_res = requests.get(comm_url, headers=self.headers)
        con = co_res.text
        flags = re.S | re.M
        co = Comm(co_index)
        # The first three patterns are deliberately matched without flags,
        # so "." does not cross line breaks for them.
        co.co_name = re.search('楼 盘 名 称:(.*?)<br', con).group(1)
        co.co_id = co_id
        co.area = re.search('所 属 城 区:.*?">(.*?)</span', con).group(1)
        co.co_address = re.search('楼 盘 坐 落:.*?">(.*?)</span', con).group(1)
        co.co_develops = re.search('项 目 公 司:.*?mc">(.*?)</span', con,
                                   flags).group(1)
        co.co_pre_sale = re.search('预销售证号.*?">(.*?)</span', con,
                                   flags).group(1)
        co.co_all_house = re.search('预售总套数.*?td>(.*?)</td', con,
                                    flags).group(1)
        co.co_all_size = re.search('预售总面积.*?td>(.*?)</td', con,
                                   flags).group(1)
        co.co_pre_sale_date = re.search('时间.*?">(.*?)</span', con,
                                        flags).group(1)
        co.insert_db()

        url = 'http://www.lsjs.gov.cn/WebLSZFGB/Ashx/YSXM.ashx'
        page_size = 5
        page_no = 1
        while True:
            data = {
                "method": "getzxl",
                "PageSize": page_size,
                "CurrentPageIndex": str(page_no),
                "YSXMID": co_id,
            }
            res = requests.post(url, data=data, headers=self.headers)
            rows = json.loads(res.text)["data"]
            if not rows:
                # Nothing returned for this page: there is no TotalNum
                # header to read, so stop instead of indexing into [].
                break
            total = int(rows[0]['TotalNum'])
            for info in rows[1:]:
                self.build_info(co_id, info["YSZID"])
            # "<=" rather than "<": when the total is an exact multiple of
            # the page size, every row has already been seen.
            if total <= page_no * page_size:
                break
            page_no += 1
Пример #27
0
    def start_crawler(self):
        """Page through the project list and store every community found.

        The total page count is read from the first response of
        ``getzxlpxx.action``; each page is then requested with a POST.  For
        every ``//table//td/a`` link the project page is downloaded, parsed
        with regular expressions into a ``Comm`` record and saved with
        ``insert_db()``.  The link between the overview and house-detail
        tabs is finally passed to ``self.detail_parse`` with the community
        id.

        Pages or projects that fail are logged and skipped.
        """
        post_url = 'http://www.wxhouse.com:9097/wwzs/getzxlpxx.action'
        index_res = requests.get(post_url, headers=self.headers)
        page = re.search(r'page.totalPageCount" value="(\d+)"',
                         index_res.text).group(1)

        def field(pattern, text):
            return re.search(pattern, text, re.S | re.M).group(1)

        for page_no in range(1, int(page) + 1):
            data = {
                'page.currentPageNo': page_no,
                'page.pageSize': 15,
                'page.totalPageCount': page,
            }
            try:
                res = requests.post(post_url, headers=self.headers, data=data)
                html = etree.HTML(res.content.decode())
            except Exception:
                log.error("翻页请求失败")
                continue
            for link in html.xpath("//table//td/a"):
                com_url = None  # keeps the error log valid if href lookup fails
                try:
                    temp_url = link.xpath("./@href")[0]
                    com_url = "http://www.wxhouse.com:9097" + temp_url
                    com_res = requests.get(com_url, headers=self.headers)
                    content = com_res.content.decode()
                    co = Comm(co_index)
                    co.co_id = re.search(r'id=(\d+)', temp_url).group(1)
                    co.co_name = field('项目现定名.*?">(.*?)</td', content)
                    # The parentheses around "销" are label text, not a
                    # capture group; unescaped they made group(1) return
                    # "销" instead of the permit number.  ASCII and
                    # full-width parentheses are both accepted.
                    co.co_pre_sale = field(
                        r'预[(（]?销[)）]?售许可证号.*?">(.*?)</td', content)
                    co.co_develops = field('商:</td>.*?;">(.*?)</a', content)
                    co.co_address = field('落:</td>.*?">(.*?)</td', content)
                    area = field('行政区:.*?">(.*?)</td', content)
                    co.area = area.strip()
                    co.co_land_use = field('土地证号:.*?">(.*?)</td', content)
                    # NOTE(review): other crawlers in this file assign
                    # co_plan_project; confirm co_plan_pro is intended.
                    co.co_plan_pro = field('规划许可证号:.*?">(.*?)</td', content)
                    co.co_plan_useland = field('用地许可证号:.*?">(.*?)</td', content)
                    co.co_work_pro = field('施工许可证号:.*?">(.*?)</td', content)
                    co.co_all_house = field(r'总套数.*?">(\d+)&nbsp', content)
                    co.insert_db()
                except Exception:
                    log.error("{}小区解析失败".format(com_url))
                    continue

                detail_match = re.search(
                    '楼盘概况.*?href="(.*?)".*?房屋明细', content, re.S | re.M)
                if detail_match is None:
                    # Skip this project instead of aborting the whole crawl.
                    log.error("{}未找到房屋明细链接".format(com_url))
                    continue
                self.detail_parse(detail_match.group(1), co.co_id)
Пример #28
0
 def get_comm_detail(self, comm_list):
     """Store the communities and buildings found on each project page.

     Every entry of ``comm_list`` is appended to the ``Templets/LPS/aspx/``
     base URL and fetched.  Parallel lists of community fields are
     extracted from the page; for each project name the ``Comm`` record is
     filled by index and saved.  Still inside that per-name loop, the
     building ids and names are extracted from the whole page, one
     ``Building`` per id is saved with the *first* community id of the
     page, and ``self.get_build_info`` receives the list of building ids.

     A failing community insert is printed and the building step still
     runs; any other exception for a page is printed and the next page is
     processed.

     :param comm_list: iterable of page paths relative to the base URL.
     """
     flags = re.S | re.M

     def find_all(pattern, text):
         return re.findall(pattern, text, flags)

     for page_path in comm_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.lpsfdc.cn/Templets/LPS/aspx/' + page_path
             html = requests.get(comm_url).text

             co_name_list = find_all('项目名称:.*?>(.*?)<', html)
             co_id_list = find_all('hdProjectCode" value="(.*?)"', html)
             co_develops_list = find_all('开发企业:.*?>(.*?)<', html)
             co_build_size_list = find_all('TJ_ZMJ">(.*?)<', html)
             co_address_list = find_all('Pro_XMDZ">(.*?)<', html)
             co_owner_list = find_all('Pro_ZZZSBH">(.*?)<', html)
             co_pre_sale_list = find_all('Pro_XKZH">(.*?)<', html)
             co_all_house_list = find_all('TJ_HZYSTS">(.*?)<', html)

             for co_idx, co_name in enumerate(co_name_list):
                 try:
                     # Index lookups (not zip) so that a shorter list
                     # still raises IndexError and is reported below.
                     comm.co_name = co_name
                     comm.co_id = co_id_list[co_idx]
                     comm.co_develops = co_develops_list[co_idx]
                     comm.co_build_size = co_build_size_list[co_idx]
                     comm.co_address = co_address_list[co_idx]
                     comm.co_owner = co_owner_list[co_idx]
                     comm.co_pre_sale = co_pre_sale_list[co_idx]
                     comm.co_all_house = co_all_house_list[co_idx]
                     comm.insert_db()
                 except Exception as e:
                     print('co_index={}, commiunty error'.format(co_index,), e)

                 build_url_list = find_all("radiobuild' id='build(.*?)'", html)
                 build_name_list = find_all("radiobuild.*?<span.*?>(.*?)<", html)
                 for bu_idx, bu_id in enumerate(build_url_list):
                     build = Building(co_index)
                     build.bu_id = bu_id
                     build.bu_num = build_name_list[bu_idx]
                     build.co_id = co_id_list[0]
                     build.insert_db()
                 self.get_build_info(build_url_list)
         except Exception as e:
             print(e)
Пример #29
0
    def comm_info(self, url):
        """Store the community at ``url`` and pass on its building rows.

        The page ``self.start_url + "/" + url`` is fetched and decoded as
        GBK.  Mandatory fields are extracted with regular expressions; a
        second group of fields is optional as a whole: if any of them is
        missing, all five are stored as ``None``.  After ``insert_db()`` the
        white rows of table ``preselltable1`` are handed to
        ``self.build_info`` together with the community id.

        :param url: path relative to ``self.start_url`` containing
            ``kfsid=<digits>``.
        :raises AttributeError: when a mandatory field is not on the page.
        """
        comm_url = self.start_url + "/" + url
        res = requests.get(comm_url, headers=self.headers)
        res.encoding = 'gbk'
        con = res.text
        flags = re.S | re.M

        co = Comm(co_index)
        co.co_id = re.search(r'kfsid=(\d+)', url).group(1)
        # Matched without flags on purpose: "." must stay on one line here.
        co.co_name = re.search('itemname.*?">(.*?)</font', con).group(1)
        co.co_develops = re.search('开发商名称:.*?px;">(.*?)</a', con,
                                   flags).group(1)
        co.co_all_house = re.search('总套数:.*?">(.*?)&nbsp', con,
                                    flags).group(1)
        co.co_all_size = re.search('总面积:.*?">(.*?)&nbsp', con,
                                   flags).group(1)
        co.co_residential_size = re.search('>住宅面积:.*?">(.*?)&nbsp', con,
                                           flags).group(1)
        co.co_address = re.search('项目座落.*?;">(.*?)</', con,
                                  flags).group(1)
        co.area = re.search('所在地区.*?">(.*?)</td', con, flags).group(1)
        try:
            co.co_build_size = re.search('建筑面积.*?">(.*?)&nbsp', con,
                                         flags).group(1)
            co.co_plan_project = re.search('建设工程规划许可证号.*?">(.*?)<br', con,
                                           flags).group(1)
            co.co_land_use = re.search('土地证号.*?">(.*?)<br', con,
                                       flags).group(1)
            co.co_work_pro = re.search('建筑工程施工许可证号.*?">(.*?)<br', con,
                                       flags).group(1)
            co.co_use = re.search('用途.*?">(.*?)<br', con, flags).group(1)
        except AttributeError:
            # A failed match returns None, whose .group raises
            # AttributeError.  Reset the whole optional group.
            co.co_build_size = None
            co.co_plan_project = None
            co.co_land_use = None
            co.co_work_pro = None
            co.co_use = None  # was misspelled "co_us", leaving co_use unset

        co.insert_db()
        co_html = etree.HTML(con)
        bu_list = co_html.xpath(
            "//table[@id='preselltable1']/tr[@bgcolor='white']")
        self.build_info(bu_list, co.co_id)
Пример #30
0
 def start_crawler(self):
     """Crawl the project listing of every district in ``self.area_list``.

     Each district id is posted to ``complexPro.asp``; the response is
     decoded as GBK and the ``/complexpro...`` page links are collected.
     On every linked page the section between ``位置<`` and ``页/共`` is
     cut out and split into ``<tr valign=...>`` rows.  One ``Comm`` record
     is filled per row and handed, together with the row's link, to
     ``self.get_comm_info``.  A row that fails is printed with the page
     URL and skipped.
     """
     site = 'http://www.fangdi.com.cn'
     flags = re.S | re.M
     # (attribute, pattern) pairs applied to each table row, in order.
     row_rules = (
         ('co_name', '<a.*?>(.*?)<'),
         ('co_address', '<a.*?<td.*?>(.*?)<'),
         ('co_all_house', '<a.*?<td.*?<td.*?>(.*?)<'),
         ('co_all_size', '<a.*?<td.*?<td.*?<td.*?>(.*?)<'),
         ('area', '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<'),
         ('co_id', 'projectID=(.*?)=='),
     )

     for district_id in self.area_list:
         res = requests.post(url='http://www.fangdi.com.cn/complexPro.asp',
                             data={'districtID': district_id})
         html_str = res.content.decode('gbk')
         # Collect the result-page links returned for this district.
         page_paths = re.findall('value="(/complexpro.*?)"', html_str, flags)
         for page_path in page_paths:
             page_url = site + page_path
             response = requests.get(page_url, headers=self.headers)
             html = response.content.decode('gbk')
             comm_html = re.search('位置<.*?页/共', html, flags).group()
             rows = re.findall('<tr valign=.*?</tr>', comm_html, flags)
             for info in rows:
                 try:
                     comm = Comm(co_index)
                     comm_url = re.search('<a href=(.*?)>', info,
                                          flags).group(1)
                     for attr, pattern in row_rules:
                         setattr(comm, attr,
                                 re.search(pattern, info, flags).group(1))
                     self.get_comm_info(comm_url, comm)
                 except Exception as e:
                     print(
                         '小区错误,co_index={},url={}'.format(
                             co_index, page_url), e)